X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Fcategoricals.h;h=5e49b4504ab1b9338c19fe089b861fa8d82a328f;hb=80ff0f10da00eae4c7b3b07266a03e403e97d640;hp=b2f6b649b1b1d0d67b656e87fccec70802b8861a;hpb=d31b04850f78d3732d318051e05529f361f0e9a5;p=pspp diff --git a/src/math/categoricals.h b/src/math/categoricals.h index b2f6b649b1..5e49b4504a 100644 --- a/src/math/categoricals.h +++ b/src/math/categoricals.h @@ -22,94 +22,148 @@ #include "data/missing-values.h" struct categoricals; -struct variable; struct ccase; struct interaction; +struct variable; +union value; -union value ; - -struct categoricals *categoricals_create (struct interaction *const *, size_t n_int, - const struct variable *wv, enum mv_class dep_excl, - enum mv_class fctr_excl); - -void categoricals_destroy (struct categoricals *); - -void categoricals_update (struct categoricals *cat, const struct ccase *c); +/* Categoricals. + A categorical variable has a finite and usually small number of possible + values. The categoricals data structure organizes an array of interactions + among categorical variables, that is, a set of sets of categorical + variables. (Both levels of "set" are ordered.) -/* Return the number of categories (distinct values) for variable N */ -size_t categoricals_n_count (const struct categoricals *cat, size_t n); + The life cycle of a categoricals object looks like this: -size_t categoricals_df (const struct categoricals *cat, size_t n); + 1. Create it with categoricals_create(). This fixes the set of interactions + and other parameters. -/* Return the total number of categories */ -size_t categoricals_n_total (const struct categoricals *cat); + 2. Pass all of the desired cases through the object with + categoricals_update(). -/* Return the total degrees of freedom */ -size_t categoricals_df_total (const struct categoricals *cat); + 3. Finalize the object with categoricals_done(). 
Only at this point may + most of the categoricals query functions be called. + 4. Use the categoricals object as desired. -/* - Return the total number of variables which participated in these categoricals. - Due to the possibility of missing values, this is NOT necessarily - equal to the number of variables passed in when the object was - created. + 5. Destroy the object with categoricals_destroy(). */ -size_t categoricals_get_n_variables (const struct categoricals *cat); - - -bool categoricals_is_complete (const struct categoricals *cat); +/* Creating and destroying categoricals. */ +struct categoricals *categoricals_create (struct interaction *const *, + size_t n, + const struct variable *wv, + enum mv_class fctr_excl); +void categoricals_destroy (struct categoricals *); -/* - Must be called (once) before any call to the *_by_subscript or *_by_category - functions, but AFTER any calls to categoricals_update. - If this function returns false, then no calls to _by_subscript or *_by_category - are allowed. -*/ -bool categoricals_done (const struct categoricals *cat); +/* Updating categoricals. */ +void categoricals_update (struct categoricals *, const struct ccase *); +void categoricals_done (const struct categoricals *); +bool categoricals_is_complete (const struct categoricals *); +/* Categories. -/* - The *_by_subscript functions use the short map. - Their intended use is by covariance matrix routines, where normally 1 less than - the total number of distinct values of each categorical variable should - be considered. - */ -double categoricals_get_weight_by_subscript (const struct categoricals *cat, int subscript); -const struct interaction *categoricals_get_interaction_by_subscript (const struct categoricals *cat, int subscript); + A variable's number of categories is the number of unique values observed in + the data passed to categoricals_update(). 
-double categoricals_get_sum_by_subscript (const struct categoricals *cat, int subscript); + An interaction's number of categories is the number of observed unique + values of its variables, which will often be less than the product of its + variables' numbers of categories. -double categoricals_get_code_for_case (const struct categoricals *cat, int subscript, const struct ccase *c); + A categorical object's number of categories is the sum of its interactions' + categories. */ +size_t categoricals_n_count (const struct categoricals *, size_t idx); +size_t categoricals_n_total (const struct categoricals *); +union value *categoricals_get_var_values (const struct categoricals *, + const struct variable *, size_t *n); -/* These use the long map. Useful for descriptive statistics. */ +/* Degrees of freedom. -/* Return the value corresponding to the N'th category */ -const union value * categoricals_get_value_by_category (const struct categoricals *cat, int n); + A categorical variable with N_CATS categories has N_CATS - 1 degrees of + freedom. -const struct ccase * -categoricals_get_case_by_category_real (const struct categoricals *cat, int iact, int n); + An interaction's degrees of freedom is the product of its variables' degrees + of freedom. -void * -categoricals_get_user_data_by_category_real (const struct categoricals *cat, int iact, int n); + A categorical object's degrees of freedom is the sum of its interactions' + degrees of freedom. */ +size_t categoricals_df (const struct categoricals *, size_t idx); +size_t categoricals_df_total (const struct categoricals *); +/* Sanity. */ +bool categoricals_sane (const struct categoricals *cat); -void * categoricals_get_user_data_by_category (const struct categoricals *cat, int category); +/* "Short map". -const struct ccase * categoricals_get_case_by_category (const struct categoricals *cat, int subscript); + These look up an interaction within a categoricals object on the basis of a + "subscript". 
Interaction 0 with DF_0 degrees of freedom is assigned + subscripts [0, DF_0 - 1], interaction 1 with DF_1 degrees of freedom is + assigned subscripts [DF_0, DF_0 + DF_1 - 1], and so on. The subscripts + passed in must be in the range [0, DF_SUM - 1] where DF_SUM is the total + number of degrees of freedom for the object, as returned by + categoricals_df_total(). + These functions are intended for covariance matrix routines, where normally + 1 less than the total number of distinct values of each categorical variable + should be considered. -struct payload -{ - void* (*create) (const void *aux1, void *aux2); - void (*update) (const void *aux1, void *aux2, void *user_data, const struct ccase *, double weight); - void (*destroy) (const void *aux1, void *aux2, void *user_data); -}; + These functions may be used on an object only after calling + categoricals_done(). +*/ +double categoricals_get_weight_by_subscript (const struct categoricals *, + int subscript); +const struct interaction *categoricals_get_interaction_by_subscript ( + const struct categoricals *, int subscript); +double categoricals_get_sum_by_subscript (const struct categoricals *, + int subscript); +double categoricals_get_dummy_code_for_case (const struct categoricals *, + int subscript, + const struct ccase *); +double categoricals_get_effects_code_for_case (const struct categoricals *, + int subscript, + const struct ccase *); + + +/* "Long map". + + These look up an interaction within a categoricals object on the basis of a + "category index". Interaction 0 in CAT with CAT_0 categories has indexes + [0, CAT_0 - 1], interaction 1 with CAT_1 categories has indexes [CAT_0, + CAT_0 + CAT_1 - 1], and so on. The indexes passed in must be in the range + [0, CAT_TOTAL - 1] where CAT_TOTAL is the total number of categories for the + object, as returned by categoricals_n_total(). + + These functions are useful for descriptive statistics. 
+ + These functions may be used on an object only after calling + categoricals_done(). +*/ +const struct ccase *categoricals_get_case_by_category_real ( + const struct categoricals *, int iact, int n); +void *categoricals_get_user_data_by_category_real ( + const struct categoricals *, int iact, int n); +int categoricals_get_value_index_by_category_real ( + const struct categoricals *, int iact_idx, int cat_idx, int var_idx); -void categoricals_set_payload (struct categoricals *cats, const struct payload *p, const void *aux1, void *aux2); +void *categoricals_get_user_data_by_category (const struct categoricals *, + int category); +const struct ccase *categoricals_get_case_by_category ( + const struct categoricals *cat, int subscript); +struct payload + { + void *(*create) (const void *aux1, void *aux2); + void (*update) (const void *aux1, void *aux2, void *user_data, + const struct ccase *, double weight); + void (*calculate) (const void *aux1, void *aux2, void *user_data); + void (*destroy) (const void *aux1, void *aux2, void *user_data); + }; + +void categoricals_set_payload (struct categoricals *, const struct payload *, + const void *aux1, void *aux2); +bool categoricals_isbalanced (const struct categoricals *); #endif