From: Jason H Stover Date: Wed, 15 Oct 2008 16:12:35 +0000 (-0400) Subject: glm.q: Removed code to us QR decomposition, which requires the entire data X-Git-Tag: v0.7.1~50^2~21^2~2 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp-builds.git;a=commitdiff_plain;h=18918bb7713dd3f7d0b0815d8372e8d454e2f2dc glm.q: Removed code to us QR decomposition, which requires the entire data set to be stored in a matrix. glm.q: fit_model: New function. design-matrix.[ch]: Added array of size_t's to store the number of data for each variable in struct design_matrix. Added accessor functions design_matrix_increment_case_count, design_matrix_set_case_count and design_matrix_get_case_count. --- diff --git a/src/language/stats/glm.q b/src/language/stats/glm.q index 39e4f1c5..024b5429 100644 --- a/src/language/stats/glm.q +++ b/src/language/stats/glm.q @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -47,8 +48,6 @@ #include "xalloc.h" #include "gettext.h" -#define GLM_LARGE_DATA 10000 - /* (headers) */ /* (specification) @@ -152,7 +151,7 @@ glm_custom_dependent (struct lexer *lexer, struct dataset *ds, model. That means the dependent variable is in there, too. */ static void -coeff_init (pspp_linreg_cache * c, struct design_matrix *cov) +coeff_init (pspp_linreg_cache * c, const struct design_matrix *cov) { c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff)); c->n_coeffs = cov->m->size2 - 1; @@ -213,26 +212,39 @@ data_pass_one (struct casereader *input, return n_data; } +static pspp_linreg_cache * +fit_model (const struct design_matrix *cov, const struct moments1 **mom, + const struct variable *dep_var, + const struct variable ** indep_vars, + size_t n_data, size_t n_indep) +{ + pspp_linreg_cache *result = NULL; + result = pspp_linreg_cache_alloc (dep_var, indep_vars, n_data, n_indep); + coeff_init (result, cov); + pspp_linreg_with_cov (cov, result); + + return result; +} + static bool run_glm (struct casereader *input, struct cmd_glm *cmd, const struct dataset *ds) { - pspp_linreg_cache *model = NULL; - size_t i; - size_t j; - int n_indep = 0; - struct ccase c; + casenumber row; const struct variable **indep_vars; const struct variable **all_vars; - struct design_matrix *X; - struct moments_var **mom; - struct casereader *reader; - casenumber row; + int n_indep = 0; + pspp_linreg_cache *model = NULL; + pspp_linreg_opts lopts; + struct ccase c; + size_t i; size_t n_all_vars; size_t n_data; /* Number of valid cases. */ - - pspp_linreg_opts lopts; + struct casereader *reader; + struct design_matrix *cov; + struct hsh_table *cov_hash; + struct moments1 **mom; if (!casereader_peek (input, 0, &c)) { @@ -266,69 +278,47 @@ run_glm (struct casereader *input, } n_indep = cmd->n_by; mom = xnmalloc (n_all_vars, sizeof (*mom)); - + for (i = 0; i < n_all_vars; i++) + mom[i] = moments1_create (MOMENT_MEAN); reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, indep_vars, n_indep, MV_ANY, NULL, NULL); reader = casereader_create_filter_missing (reader, v_dependent, 1, MV_ANY, NULL, NULL); - n_data = data_pass_one (casereader_clone (reader), - (const struct variable **) all_vars, n_all_vars, - mom); - if ((n_data > 0) && (n_indep > 0)) + if (n_indep > 0) { for (i = 0; i < n_all_vars; i++) if (var_is_alpha (all_vars[i])) cat_stored_values_create (all_vars[i]); - X = - covariance_matrix_create (n_all_vars, - (const struct variable **) all_vars); + cov_hash = covariance_hsh_create (n_all_vars); reader = casereader_create_counter (reader, &row, -1); for (; casereader_read (reader, &c); case_destroy (&c)) { /* Accumulate the covariance matrix. - */ - for (i = 0; i < n_all_vars; ++i) - { - const struct variable *v = all_vars[i]; - const union value *val_v = case_data (&c, v); - if (var_is_alpha (all_vars[i])) - cat_value_update (all_vars[i], val_v); - for (j = i; j < n_all_vars; j++) - { - const struct variable *w = all_vars[j]; - const union value *val_w = case_data (&c, w); - covariance_pass_two (X, *mom[i]->mean, *mom[j]->mean, - (double) n_data, - v, w, val_v, val_w); - } - } + */ + covariance_accumulate (cov_hash, mom, &c, all_vars, n_all_vars); + n_data++; } - model = pspp_linreg_cache_alloc (v_dependent[0], indep_vars, n_data, n_indep); - /* - For large data sets, use QR decomposition. - */ - if (n_data > sqrt (n_indep) && n_data > GLM_LARGE_DATA) + cov = covariance_accumulator_to_matrix (cov_hash, mom, all_vars, n_all_vars, n_data); + + hsh_destroy (cov_hash); + for (i = 0; i < n_dependent; i++) { - model->method = PSPP_LINREG_QR; + model = fit_model (cov, mom, v_dependent[i], indep_vars, n_data, n_indep); + pspp_linreg_cache_free (model); } - coeff_init (model, X); - pspp_linreg_with_cov (X, model); + casereader_destroy (reader); for (i = 0; i < n_all_vars; i++) { - moments1_destroy (mom[i]->m); - free (mom[i]->mean); - free (mom[i]->variance); - free (mom[i]->weight); - free (mom[i]); + moments1_destroy (mom[i]); } free (mom); - covariance_matrix_destroy (X); + covariance_matrix_destroy (cov); } else { @@ -336,7 +326,6 @@ run_glm (struct casereader *input, } free (indep_vars); free (lopts.get_indep_mean_std); - pspp_linreg_cache_free (model); casereader_destroy (input); return true; diff --git a/src/math/design-matrix.c b/src/math/design-matrix.c index 4e6ccf17..e3743228 100644 --- a/src/math/design-matrix.c +++ b/src/math/design-matrix.c @@ -54,10 +54,12 @@ design_matrix_create (int n_variables, dm = xmalloc (sizeof *dm); dm->vars = xnmalloc (n_variables, sizeof *dm->vars); + dm->n_cases = xnmalloc (n_variables, sizeof (*dm->n_cases)); dm->n_vars = n_variables; for (i = 0; i < n_variables; i++) { + dm->n_cases[i] = 0; v = v_variables[i]; assert ((dm->vars + i) != NULL); (dm->vars + i)->v = v; /* Allows us to look up the variable from @@ -79,6 +81,7 @@ design_matrix_create (int n_variables, dm->m = gsl_matrix_calloc (n_data, n_cols); col = 0; + return dm; } @@ -87,6 +90,7 @@ design_matrix_destroy (struct design_matrix *dm) { free (dm->vars); gsl_matrix_free (dm->m); + free (dm->n_cases); free (dm); } @@ -212,3 +216,46 @@ design_matrix_clone (const struct design_matrix *dm) return result; } +/* + Increment the number of cases for V. + */ +void +design_matrix_increment_case_count (struct design_matrix *dm, const struct variable *v) +{ + size_t i; + assert (dm != NULL); + assert (dm->n_cases != NULL); + assert (v != NULL); + i = design_matrix_var_to_column (dm, v); + dm->n_cases[i]++; +} + +/* + Set the number of cases for V. + */ +void +design_matrix_set_case_count (struct design_matrix *dm, const struct variable *v, size_t n) +{ + size_t i; + assert (dm != NULL); + assert (dm->n_cases != NULL); + assert (v != NULL); + i = design_matrix_var_to_column (dm, v); + dm->n_cases[i] = n; +} + +/* + Get the number of cases for V. + */ +void +design_matrix_get_case_count (struct design_matrix *dm, const struct variable *v) +{ + size_t i; + assert (dm != NULL); + assert (dm->n_cases != NULL); + assert (v != NULL); + i = design_matrix_var_to_column (dm, v); + return dm->n_cases[i]; +} + + diff --git a/src/math/design-matrix.h b/src/math/design-matrix.h index 8d56d010..01026a22 100644 --- a/src/math/design-matrix.h +++ b/src/math/design-matrix.h @@ -58,6 +58,9 @@ struct design_matrix design_matrix_var structure. */ + size_t *n_cases; /* Element i is the number of valid cases for this + variable. + */ size_t n_vars; }; @@ -82,5 +85,9 @@ size_t design_matrix_var_to_column (const struct design_matrix *, const struct variable *design_matrix_col_to_var (const struct design_matrix *, size_t); +void design_matrix_increment_case_count (struct design_matrix *, const struct variable *); +void design_matrix_set_case_count (struct design_matrix *, const struct variable *, size_t); + +void design_matrix_get_case_count (struct design_matrix *, const struct variable *); #endif