/* PSPP - a program for statistical analysis.
- Copyright (C) 2007 Free Software Foundation, Inc.
+ Copyright (C) 2007, 2009 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <stdlib.h>
#include <data/case.h>
-#include <data/category.h>
#include <data/casegrouper.h>
#include <data/casereader.h>
#include <data/dictionary.h>
#include <language/lexer/lexer.h>
#include <libpspp/compiler.h>
#include <libpspp/message.h>
-#include <math/covariance-matrix.h>
-#include <math/coefficient.h>
+#include <math/covariance.h>
+#include <math/categoricals.h>
#include <math/linreg.h>
#include <math/moments.h>
-#include <output/table.h>
+#include <output/tab.h>
#include "xalloc.h"
#include "gettext.h"
-#define GLM_LARGE_DATA 10000
-
/* (headers) */
/* (specification)
"GLM" (glm_):
*dependent=custom;
+ design=custom;
by=varlist;
with=varlist.
*/
/* (functions) */
static struct cmd_glm cmd;
+
/*
Moments for each of the variables used.
*/
*/
static size_t n_dependent;
-#if 0
-/*
- Return value for the procedure.
- */
-static int pspp_glm_rc = CMD_SUCCESS;
-#else
+size_t n_inter; /* Number of interactions. */
+size_t n_members; /* Number of memebr variables in an interaction. */
+
+struct interaction_variable **interactions;
+
int cmd_glm (struct lexer *lexer, struct dataset *ds);
-#endif
static bool run_glm (struct casereader *,
struct cmd_glm *,
- const struct dataset *, pspp_linreg_cache *);
-
+ const struct dataset *);
+/*
+ If the DESIGN subcommand was not specified, specify all possible
+ two-way interactions.
+ */
+static void
+check_interactions (struct dataset *ds, struct cmd_glm *cmd)
+{
+ size_t i;
+ size_t j;
+ size_t k = 0;
+ struct variable **interaction_vars;
+
+ /*
+ User did not specify the design matrix, so we
+ specify it here.
+ */
+ n_inter = (cmd->n_by + cmd->n_with) * (cmd->n_by + cmd->n_with - 1) / 2;
+ interactions = xnmalloc (n_inter, sizeof (*interactions));
+ interaction_vars = xnmalloc (2, sizeof (*interaction_vars));
+ for (i = 0; i < cmd->n_by; i++)
+ {
+ for (j = i + 1; j < cmd->n_by; j++)
+ {
+ interaction_vars[0] = cmd->v_by[i];
+ interaction_vars[1] = cmd->v_by[j];
+ interactions[k] = interaction_variable_create (interaction_vars, 2);
+ k++;
+ }
+ for (j = 0; j < cmd->n_with; j++)
+ {
+ interaction_vars[0] = cmd->v_by[i];
+ interaction_vars[1] = cmd->v_with[j];
+ interactions[k] = interaction_variable_create (interaction_vars, 2);
+ k++;
+ }
+ }
+ for (i = 0; i < cmd->n_with; i++)
+ {
+ for (j = i + 1; j < cmd->n_with; j++)
+ {
+ interaction_vars[0] = cmd->v_with[i];
+ interaction_vars[1] = cmd->v_with[j];
+ interactions[k] = interaction_variable_create (interaction_vars, 2);
+ k++;
+ }
+ }
+}
int
cmd_glm (struct lexer *lexer, struct dataset *ds)
{
struct casegrouper *grouper;
struct casereader *group;
- pspp_linreg_cache *model = NULL;
bool ok;
- model = xmalloc (sizeof *model);
-
if (!parse_glm (lexer, ds, &cmd, NULL))
return CMD_FAILURE;
- /* Data pass. */
+ if (!lex_match_id (lexer, "DESIGN"))
+ {
+ check_interactions (ds, &cmd);
+ }
+ /* Data pass. */
grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
while (casegrouper_get_next_group (grouper, &group))
{
- run_glm (group, &cmd, ds, model);
+ run_glm (group, &cmd, ds);
}
ok = casegrouper_destroy (grouper);
ok = proc_commit (ds) && ok;
- free (model);
free (v_dependent);
return ok ? CMD_SUCCESS : CMD_FAILURE;
}
+static int
+parse_interactions (struct lexer *lexer, const struct variable **interaction_vars, int n_members,
+ int max_members, struct dataset *ds)
+{
+ if (lex_match (lexer, '*'))
+ {
+ if (n_members > max_members)
+ {
+ max_members *= 2;
+ xnrealloc (interaction_vars, max_members, sizeof (*interaction_vars));
+ }
+ interaction_vars[n_members] = parse_variable (lexer, dataset_dict (ds));
+ parse_interactions (lexer, interaction_vars, n_members++, max_members, ds);
+ }
+ return n_members;
+}
+/* Parser for the design subcommand. */
+static int
+glm_custom_design (struct lexer *lexer, struct dataset *ds,
+ struct cmd_glm *cmd UNUSED, void *aux UNUSED)
+{
+ size_t n_allocated = 2;
+ size_t n_members;
+ struct variable **interaction_vars;
+ struct variable *this_var;
+
+ interactions = xnmalloc (n_allocated, sizeof (*interactions));
+ n_inter = 1;
+ while (lex_token (lexer) != T_STOP && lex_token (lexer) != '.')
+ {
+ this_var = parse_variable (lexer, dataset_dict (ds));
+ if (lex_match (lexer, '('))
+ {
+ lex_force_match (lexer, ')');
+ }
+ else if (lex_match (lexer, '*'))
+ {
+ interaction_vars = xnmalloc (2 * n_inter, sizeof (*interaction_vars));
+ n_members = parse_interactions (lexer, interaction_vars, 1, 2 * n_inter, ds);
+ if (n_allocated < n_inter)
+ {
+ n_allocated *= 2;
+ xnrealloc (interactions, n_allocated, sizeof (*interactions));
+ }
+ interactions [n_inter - 1] =
+ interaction_variable_create (interaction_vars, n_members);
+ n_inter++;
+ free (interaction_vars);
+ }
+ }
+ return 1;
+}
/* Parser for the dependent sub command */
static int
glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
struct cmd_glm *cmd UNUSED, void *aux UNUSED)
{
const struct dictionary *dict = dataset_dict (ds);
+ size_t i;
if ((lex_token (lexer) != T_ID
|| dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
&& lex_token (lexer) != T_ALL)
return 2;
- if (!parse_variables_const
- (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
+ if (!parse_variables_const (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
{
free (v_dependent);
return 0;
}
+ for (i = 0; i < n_dependent; i++)
+ {
+ assert (var_is_numeric (v_dependent[i]));
+ }
assert (n_dependent);
if (n_dependent > 1)
msg (SE, _("Multivariate GLM not yet supported"));
return 1;
}
-static void
-coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
-{
- c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
- c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
- c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
- pspp_coeff_init (c->coeff + 1, dm);
-}
-
-/*
- Put the moments in the linreg cache.
- */
-static void
-compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
- struct design_matrix *dm, size_t n)
-{
- size_t i;
- size_t j;
- double weight;
- double mean;
- double variance;
- double skewness;
- double kurtosis;
- /*
- Scan the variable names in the columns of the design matrix.
- When we find the variable we need, insert its mean in the cache.
- */
- for (i = 0; i < dm->m->size2; i++)
- {
- for (j = 0; j < n; j++)
- {
- if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
- {
- moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
- &skewness, &kurtosis);
- gsl_vector_set (c->indep_means, i, mean);
- gsl_vector_set (c->indep_std, i, sqrt (variance));
- }
- }
- }
-}
-
-/* Encode categorical variables.
- Returns number of valid cases. */
-static int
-data_pass_one (struct casereader *input,
- const struct variable **vars, size_t n_vars,
- struct moments_var **mom)
+static linreg *
+fit_model (const struct covariance *cov,
+ const struct variable *dep_var,
+ const struct variable ** indep_vars,
+ size_t n_data, size_t n_indep)
{
- int n_data;
- struct ccase c;
- size_t i;
-
- for (i = 0; i < n_vars; i++)
- {
- mom[i] = xmalloc (sizeof (*mom[i]));
- mom[i]->v = vars[i];
- mom[i]->mean = xmalloc (sizeof (*mom[i]->mean));
- mom[i]->variance = xmalloc (sizeof (*mom[i]->mean));
- mom[i]->weight = xmalloc (sizeof (*mom[i]->weight));
- mom[i]->m = moments1_create (MOMENT_VARIANCE);
- if (var_is_alpha (vars[i]))
- cat_stored_values_create (vars[i]);
- }
-
- n_data = 0;
- for (; casereader_read (input, &c); case_destroy (&c))
- {
- /*
- The second condition ensures the program will run even if
- there is only one variable to act as both explanatory and
- response.
- */
- for (i = 0; i < n_vars; i++)
- {
- const union value *val = case_data (&c, vars[i]);
- if (var_is_alpha (vars[i]))
- cat_value_update (vars[i], val);
- else
- moments1_add (mom[i]->m, val->f, 1.0);
- }
- n_data++;
- }
- casereader_destroy (input);
- for (i = 0; i < n_vars; i++)
- {
- if (var_is_numeric (mom[i]->v))
- {
- moments1_calculate (mom[i]->m, mom[i]->weight, mom[i]->mean,
- mom[i]->variance, NULL, NULL);
- }
- }
-
- return n_data;
+ linreg *result = NULL;
+
+ return result;
}
static bool
run_glm (struct casereader *input,
struct cmd_glm *cmd,
- const struct dataset *ds, pspp_linreg_cache * model)
+ const struct dataset *ds)
{
- size_t i;
- size_t j;
- int n_indep = 0;
- struct ccase c;
- const struct variable **indep_vars;
- const struct variable **all_vars;
- struct design_matrix *X;
- struct moments_var **mom;
- struct casereader *reader;
casenumber row;
- size_t n_all_vars;
- size_t n_data; /* Number of valid cases. */
-
+ const struct variable **numerics = NULL;
+ const struct variable **categoricals = NULL;
+ int n_indep = 0;
+ linreg *model = NULL;
pspp_linreg_opts lopts;
+ struct ccase *c;
+ size_t i;
+ size_t n_data; /* Number of valid cases. */
+ size_t n_categoricals = 0;
+ size_t n_numerics;
+ struct casereader *reader;
+ struct covariance *cov;
- assert (model != NULL);
-
- if (!casereader_peek (input, 0, &c))
+ c = casereader_peek (input, 0);
+ if (c == NULL)
{
casereader_destroy (input);
return true;
}
- output_split_file_values (ds, &c);
- case_destroy (&c);
+ output_split_file_values (ds, c);
+ case_unref (c);
if (!v_dependent)
{
1u << DC_SYSTEM);
}
-
-
lopts.get_depvar_mean_std = 1;
lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
- indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
- n_all_vars = cmd->n_by + n_dependent;
- all_vars = xnmalloc (n_all_vars, sizeof *all_vars);
- for (i = 0; i < n_dependent; i++)
+
+ n_numerics = n_dependent;
+ for (i = 0; i < cmd->n_with; i++)
{
- all_vars[i] = v_dependent[i];
+ if (var_is_alpha (cmd->v_with[i]))
+ {
+ n_categoricals++;
+ }
+ else
+ {
+ n_numerics++;
+ }
}
for (i = 0; i < cmd->n_by; i++)
{
- indep_vars[i] = cmd->v_by[i];
- all_vars[i + n_dependent] = cmd->v_by[i];
+ if (var_is_alpha (cmd->v_by[i]))
+ {
+ n_categoricals++;
+ }
+ else
+ {
+ n_numerics++;
+ }
}
- n_indep = cmd->n_by;
- mom = xnmalloc (n_all_vars, sizeof (*mom));
-
-
- reader = casereader_clone (input);
- reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
- MV_ANY, NULL, NULL);
- reader = casereader_create_filter_missing (reader, v_dependent, 1,
- MV_ANY, NULL, NULL);
- n_data = data_pass_one (casereader_clone (reader),
- (const struct variable **) all_vars, n_all_vars,
- mom);
-
- if ((n_data > 0) && (n_indep > 0))
+ numerics = xnmalloc (n_numerics, sizeof *numerics);
+ categoricals = xnmalloc (n_categoricals, sizeof (*categoricals));
+ size_t j = 0;
+ size_t k = 0;
+ for (i = 0; i < cmd->n_by; i++)
{
- X =
- covariance_matrix_create (n_all_vars,
- (const struct variable **) all_vars);
- reader = casereader_create_counter (reader, &row, -1);
- for (; casereader_read (reader, &c); case_destroy (&c))
+ if (var_is_alpha (cmd->v_by[i]))
{
- /*
- Accumulate the covariance matrix.
- */
- for (i = 0; i < n_all_vars; ++i)
- {
- const struct variable *v = all_vars[i];
- const union value *val_v = case_data (&c, v);
- for (j = i; j < n_all_vars; j++)
- {
- const struct variable *w = all_vars[j];
- const union value *val_w = case_data (&c, w);
- covariance_pass_two (X, *mom[i]->mean, *mom[j]->mean,
- (double) n_data,
- v, w, val_v, val_w);
- }
- }
+ categoricals[j] = cmd->v_by[i];
+ j++;
}
- model = pspp_linreg_cache_alloc (n_data, n_indep);
- model->depvar = v_dependent;
- /*
- For large data sets, use QR decomposition.
- */
- if (n_data > sqrt (n_indep) && n_data > GLM_LARGE_DATA)
+ else
{
- model->method = PSPP_LINREG_QR;
+ numerics[k] = cmd->v_by[i];
+ k++;
}
- coeff_init (model, X);
- pspp_linreg_with_cov (X, model);
- casereader_destroy (reader);
- for (i = 0; i < n_all_vars; i++)
+ }
+ for (i = 0; i < cmd->n_with; i++)
+ {
+ if (var_is_alpha (cmd->v_with[i]))
+ {
+ categoricals[j] = cmd->v_with[i];
+ j++;
+ }
+ else
{
- moments1_destroy (mom[i]->m);
- free (mom[i]->mean);
- free (mom[i]->variance);
- free (mom[i]->weight);
- free (mom[i]);
+ numerics[k] = cmd->v_with[i];
+ k++;
}
- free (mom);
- covariance_matrix_destroy (X);
}
- else
+ for (i = 0; i < n_dependent; i++)
+ {
+ numerics[k] = v_dependent[i];
+ k++;
+ }
+
+ struct categoricals *cats =
+ categoricals_create (categoricals, n_categoricals,
+ NULL, MV_NEVER,
+ NULL, NULL, NULL, NULL);
+
+ cov = covariance_2pass_create (n_numerics, numerics,
+ cats,
+ NULL, MV_NEVER);
+
+ reader = casereader_clone (input);
+ reader = casereader_create_filter_missing (reader, numerics, n_numerics,
+ MV_ANY, NULL, NULL);
+ reader = casereader_create_filter_missing (reader, categoricals, n_categoricals,
+ MV_ANY, NULL, NULL);
+ struct casereader *r = casereader_clone (reader);
+
+ reader = casereader_create_counter (reader, &row, -1);
+
+ for (; (c = casereader_read (reader)) != NULL; case_unref (c))
+ {
+ covariance_accumulate_pass1 (cov, c);
+ }
+ for (; (c = casereader_read (r)) != NULL; case_unref (c))
{
- msg (SE, gettext ("No valid data found. This command was skipped."));
+ covariance_accumulate_pass2 (cov, c);
}
- free (indep_vars);
+
+ covariance_destroy (cov);
+ casereader_destroy (reader);
+ casereader_destroy (r);
+
+ free (numerics);
+ free (categoricals);
free (lopts.get_indep_mean_std);
- pspp_linreg_cache_free (model);
casereader_destroy (input);
return true;