From a98dca0147497cc47e34902ae3de618f3d6672f5 Mon Sep 17 00:00:00 2001 From: Jason Stover Date: Tue, 10 Jul 2007 04:01:00 +0000 Subject: [PATCH] initial version of glm --- src/language/stats/ChangeLog | 4 + src/language/stats/automake.mk | 1 + src/language/stats/glm.q | 352 +++++++++++++++++++++++++++++++++ 3 files changed, 357 insertions(+) create mode 100644 src/language/stats/glm.q diff --git a/src/language/stats/ChangeLog b/src/language/stats/ChangeLog index 91956d82..3ec57a78 100644 --- a/src/language/stats/ChangeLog +++ b/src/language/stats/ChangeLog @@ -1,3 +1,7 @@ +2007-07-10 Jason Stover + + * glm.q: Initial version of the GLM procedure. + 2007-06-06 Ben Pfaff Adapt case sources, sinks, and clients of procedure code to the diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 5060539d..ea674e98 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -7,6 +7,7 @@ src_language_stats_built_sources = \ src/language/stats/crosstabs.c \ src/language/stats/examine.c \ src/language/stats/frequencies.c \ + src/language/stats/glm.c \ src/language/stats/means.c \ src/language/stats/npar.c \ src/language/stats/oneway.c \ diff --git a/src/language/stats/glm.q b/src/language/stats/glm.q new file mode 100644 index 00000000..8cfc2547 --- /dev/null +++ b/src/language/stats/glm.q @@ -0,0 +1,352 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2007 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gettext.h" + +#define GLM_LARGE_DATA 1000 + +/* (headers) */ + +/* (specification) + "GLM" (glm_): + *dependent=custom; + by=varlist; + with=varlist. +*/ +/* (declarations) */ +/* (functions) */ +static struct cmd_glm cmd; + +/* + Moments for each of the variables used. + */ +struct moments_var +{ + struct moments1 *m; + const struct variable *v; +}; + + +/* + Dependent variable used. + */ +static const struct variable **v_dependent; + +/* + Number of dependent variables. + */ +static size_t n_dependent; + +/* + Return value for the procedure. + */ +static int pspp_glm_rc = CMD_SUCCESS; + +static bool run_glm (struct casereader*, + struct cmd_glm *, + const struct dataset *, + pspp_linreg_cache *); + +int +cmd_glm (struct lexer *lexer, struct dataset *ds) +{ + struct casegrouper *grouper; + struct casereader *group; + pspp_linreg_cache *model = NULL; + + bool ok; + + model = xmalloc (sizeof *model); + + if (!parse_glm (lexer, ds, &cmd, NULL)) + return CMD_FAILURE; + + /* Data pass. */ + grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); + while (casegrouper_get_next_group (grouper, &group)) + { + run_glm (group, &cmd, ds, model); + } + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + + free (model); + free (v_dependent); + return ok ? CMD_SUCCESS : CMD_FAILURE; +} + +/* Parser for the dependent sub command */ +static int +glm_custom_dependent (struct lexer *lexer, struct dataset *ds, + struct cmd_glm *cmd UNUSED, + void *aux UNUSED) +{ + const struct dictionary *dict = dataset_dict (ds); + + if ((lex_token (lexer) != T_ID + || dict_lookup_var (dict, lex_tokid (lexer)) == NULL) + && lex_token (lexer) != T_ALL) + return 2; + + if (!parse_variables_const + (lexer, dict, &v_dependent, &n_dependent, PV_NONE)) + { + free (v_dependent); + return 0; + } + assert (n_dependent); + if (n_dependent > 1) + msg (SE, _("Multivariate GLM not yet supported")); + n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */ + + return 1; +} + +static void +coeff_init (pspp_linreg_cache * c, struct design_matrix *dm) +{ + c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff)); + c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */ + c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */ + pspp_coeff_init (c->coeff + 1, dm); +} + +/* + Put the moments in the linreg cache. + */ +static void +compute_moments (pspp_linreg_cache * c, struct moments_var *mom, + struct design_matrix *dm, size_t n) +{ + size_t i; + size_t j; + double weight; + double mean; + double variance; + double skewness; + double kurtosis; + /* + Scan the variable names in the columns of the design matrix. + When we find the variable we need, insert its mean in the cache. + */ + for (i = 0; i < dm->m->size2; i++) + { + for (j = 0; j < n; j++) + { + if (design_matrix_col_to_var (dm, i) == (mom + j)->v) + { + moments1_calculate ((mom + j)->m, &weight, &mean, &variance, + &skewness, &kurtosis); + gsl_vector_set (c->indep_means, i, mean); + gsl_vector_set (c->indep_std, i, sqrt (variance)); + } + } + } +} +/* Encode categorical variables. + Returns number of valid cases. */ +static int +prepare_categories (struct casereader *input, + const struct variable **vars, size_t n_vars, + struct moments_var *mom) +{ + int n_data; + struct ccase c; + size_t i; + + for (i = 0; i < n_vars; i++) + if (var_is_alpha (vars[i])) + cat_stored_values_create (vars[i]); + + n_data = 0; + for (; casereader_read (input, &c); case_destroy (&c)) + { + /* + The second condition ensures the program will run even if + there is only one variable to act as both explanatory and + response. + */ + for (i = 0; i < n_vars; i++) + { + const union value *val = case_data (&c, vars[i]); + if (var_is_alpha (vars[i])) + cat_value_update (vars[i], val); + else + moments1_add (mom[i].m, val->f, 1.0); + } + n_data++; + } + casereader_destroy (input); + + return n_data; +} + +static bool +run_glm (struct casereader *input, + struct cmd_glm *cmd, + const struct dataset *ds, + pspp_linreg_cache *model) +{ + size_t i; + int n_indep = 0; + struct ccase c; + const struct variable **indep_vars; + struct design_matrix *X; + struct moments_var *mom; + gsl_vector *Y; + struct casereader *reader; + casenumber row; + size_t n_data; /* Number of valid cases. */ + + pspp_linreg_opts lopts; + + assert (model != NULL); + + if (!casereader_peek (input, 0, &c)) + return true; + output_split_file_values (ds, &c); + case_destroy (&c); + + if (!v_dependent) + { + dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent, + 1u << DC_SYSTEM); + } + + for (i = 0; i < n_dependent; i++) + { + if (!var_is_numeric (v_dependent[i])) + { + msg (SE, _("Dependent variable must be numeric.")); + return false; + } + } + + mom = xnmalloc (n_dependent, sizeof (*mom)); + mom->m = moments1_create (MOMENT_VARIANCE); + mom->v = v_dependent[0]; + lopts.get_depvar_mean_std = 1; + + lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int)); + indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars); + + for (i = 0; i < cmd->n_by; i++) + { + indep_vars[i] = cmd->v_by[i]; + } + n_indep = cmd->n_by; + + reader = casereader_clone (input); + reader = casereader_create_filter_missing (reader, indep_vars, n_indep, + MV_ANY, NULL); + reader = casereader_create_filter_missing (reader, v_dependent, 1, + MV_ANY, NULL); + n_data = prepare_categories (casereader_clone (reader), + indep_vars, n_indep, mom); + + if ((n_data > 0) && (n_indep > 0)) + { + Y = gsl_vector_alloc (n_data); + X = + design_matrix_create (n_indep, + (const struct variable **) indep_vars, + n_data); + for (i = 0; i < X->m->size2; i++) + { + lopts.get_indep_mean_std[i] = 1; + } + model = pspp_linreg_cache_alloc (X->m->size1, X->m->size2); + model->indep_means = gsl_vector_alloc (X->m->size2); + model->indep_std = gsl_vector_alloc (X->m->size2); + model->depvar = v_dependent[0]; + reader = casereader_create_counter (reader, &row, -1); + for (; casereader_read (reader, &c); case_destroy (&c)) + { + for (i = 0; i < n_indep; ++i) + { + const struct variable *v = indep_vars[i]; + const union value *val = case_data (&c, v); + if (var_is_alpha (v)) + design_matrix_set_categorical (X, row, v, val); + else + design_matrix_set_numeric (X, row, v, val); + } + gsl_vector_set (Y, row, case_num (&c, model->depvar)); + } + casereader_destroy (reader); + /* + Now that we know the number of coefficients, allocate space + and store pointers to the variables that correspond to the + coefficients. + */ + coeff_init (model, X); + + /* + Find the least-squares estimates and other statistics. + */ + pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, model); + compute_moments (model, mom, X, n_dependent); + + gsl_vector_free (Y); + design_matrix_destroy (X); + } + else + { + msg (SE, gettext ("No valid data found. This command was skipped.")); + } + free (indep_vars); + free (lopts.get_indep_mean_std); + casereader_destroy (input); + + return true; +} + +/* + Local Variables: + mode: c + End: +*/ -- 2.30.2