/* PSPP - a program for statistical analysis.
- Copyright (C) 2007 Free Software Foundation, Inc.
+ Copyright (C) 2007, 2009 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <language/data-io/file-handle.h>
#include <language/lexer/lexer.h>
#include <libpspp/compiler.h>
+#include <libpspp/hash.h>
#include <libpspp/message.h>
#include <math/covariance-matrix.h>
#include <math/coefficient.h>
#include "xalloc.h"
#include "gettext.h"
-#define GLM_LARGE_DATA 10000
-
/* (headers) */
/* (specification)
static bool run_glm (struct casereader *,
struct cmd_glm *,
- const struct dataset *, pspp_linreg_cache *);
+ const struct dataset *);
int
cmd_glm (struct lexer *lexer, struct dataset *ds)
{
struct casegrouper *grouper;
struct casereader *group;
- pspp_linreg_cache *model = NULL;
bool ok;
- model = xmalloc (sizeof *model);
-
if (!parse_glm (lexer, ds, &cmd, NULL))
return CMD_FAILURE;
grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
while (casegrouper_get_next_group (grouper, &group))
{
- run_glm (group, &cmd, ds, model);
+ run_glm (group, &cmd, ds);
}
ok = casegrouper_destroy (grouper);
ok = proc_commit (ds) && ok;
- free (model);
free (v_dependent);
return ok ? CMD_SUCCESS : CMD_FAILURE;
}
return 1;
}
-static void
-coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
-{
- c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
- c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
- c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
- pspp_coeff_init (c->coeff + 1, dm);
-}
-
/*
- Put the moments in the linreg cache.
+ COV is the covariance matrix for variables included in the
+ model. That means the dependent variable is in there, too.
*/
static void
-compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
- struct design_matrix *dm, size_t n)
+coeff_init (pspp_linreg_cache * c, const struct design_matrix *cov)
{
- size_t i;
- size_t j;
- double weight;
- double mean;
- double variance;
- double skewness;
- double kurtosis;
- /*
- Scan the variable names in the columns of the design matrix.
- When we find the variable we need, insert its mean in the cache.
- */
- for (i = 0; i < dm->m->size2; i++)
- {
- for (j = 0; j < n; j++)
- {
- if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
- {
- moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
- &skewness, &kurtosis);
- gsl_vector_set (c->indep_means, i, mean);
- gsl_vector_set (c->indep_std, i, sqrt (variance));
- }
- }
- }
+ c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff));
+ c->n_coeffs = cov->m->size2 - 1;
+ pspp_coeff_init (c->coeff, cov);
}
-/* Encode categorical variables.
- Returns number of valid cases. */
-static int
-data_pass_one (struct casereader *input,
- const struct variable **vars, size_t n_vars,
- struct moments_var **mom)
-{
- int n_data;
- struct ccase c;
- size_t i;
-
- for (i = 0; i < n_vars; i++)
- {
- mom[i] = xmalloc (sizeof (*mom[i]));
- mom[i]->v = vars[i];
- mom[i]->mean = xmalloc (sizeof (*mom[i]->mean));
- mom[i]->variance = xmalloc (sizeof (*mom[i]->mean));
- mom[i]->weight = xmalloc (sizeof (*mom[i]->weight));
- mom[i]->m = moments1_create (MOMENT_VARIANCE);
- if (var_is_alpha (vars[i]))
- cat_stored_values_create (vars[i]);
- }
- n_data = 0;
- for (; casereader_read (input, &c); case_destroy (&c))
- {
- /*
- The second condition ensures the program will run even if
- there is only one variable to act as both explanatory and
- response.
- */
- for (i = 0; i < n_vars; i++)
- {
- const union value *val = case_data (&c, vars[i]);
- if (var_is_alpha (vars[i]))
- cat_value_update (vars[i], val);
- else
- moments1_add (mom[i]->m, val->f, 1.0);
- }
- n_data++;
- }
- casereader_destroy (input);
- for (i = 0; i < n_vars; i++)
- {
- if (var_is_numeric (mom[i]->v))
- {
- moments1_calculate (mom[i]->m, mom[i]->weight, mom[i]->mean,
- mom[i]->variance, NULL, NULL);
- }
- }
-
- return n_data;
+static pspp_linreg_cache *
+fit_model (const struct covariance_matrix *cov,
+ const struct variable *dep_var,
+ const struct variable ** indep_vars,
+ size_t n_data, size_t n_indep)
+{
+ pspp_linreg_cache *result = NULL;
+ result = pspp_linreg_cache_alloc (dep_var, indep_vars, n_data, n_indep);
+ coeff_init (result, covariance_to_design (cov));
+ pspp_linreg_with_cov (cov, result);
+
+ return result;
}
static bool
run_glm (struct casereader *input,
struct cmd_glm *cmd,
- const struct dataset *ds, pspp_linreg_cache * model)
+ const struct dataset *ds)
{
- size_t i;
- size_t j;
- int n_indep = 0;
- struct ccase c;
+ casenumber row;
const struct variable **indep_vars;
const struct variable **all_vars;
- struct design_matrix *X;
- struct moments_var **mom;
- struct casereader *reader;
- casenumber row;
+ int n_indep = 0;
+ pspp_linreg_cache *model = NULL;
+ pspp_linreg_opts lopts;
+ struct ccase *c;
+ size_t i;
size_t n_all_vars;
size_t n_data; /* Number of valid cases. */
+ struct casereader *reader;
+ struct covariance_matrix *cov;
- pspp_linreg_opts lopts;
-
- assert (model != NULL);
-
- if (!casereader_peek (input, 0, &c))
+ c = casereader_peek (input, 0);
+ if (c == NULL)
{
casereader_destroy (input);
return true;
}
- output_split_file_values (ds, &c);
- case_destroy (&c);
+ output_split_file_values (ds, c);
+ case_unref (c);
if (!v_dependent)
{
1u << DC_SYSTEM);
}
-
-
lopts.get_depvar_mean_std = 1;
lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
all_vars[i + n_dependent] = cmd->v_by[i];
}
n_indep = cmd->n_by;
- mom = xnmalloc (n_all_vars, sizeof (*mom));
-
reader = casereader_clone (input);
reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
MV_ANY, NULL, NULL);
reader = casereader_create_filter_missing (reader, v_dependent, 1,
MV_ANY, NULL, NULL);
- n_data = data_pass_one (casereader_clone (reader),
- (const struct variable **) all_vars, n_all_vars,
- mom);
- if ((n_data > 0) && (n_indep > 0))
+ if (n_indep > 0)
{
- X =
- covariance_matrix_create (n_all_vars,
- (const struct variable **) all_vars);
+ for (i = 0; i < n_all_vars; i++)
+ if (var_is_alpha (all_vars[i]))
+ cat_stored_values_create (all_vars[i]);
+
+ cov = covariance_matrix_init (n_all_vars, all_vars, ONE_PASS, PAIRWISE, MV_ANY);
reader = casereader_create_counter (reader, &row, -1);
- for (; casereader_read (reader, &c); case_destroy (&c))
+ for (; (c = casereader_read (reader)) != NULL; case_unref (c))
{
/*
Accumulate the covariance matrix.
- */
- for (i = 0; i < n_all_vars; ++i)
- {
- const struct variable *v = all_vars[i];
- const union value *val_v = case_data (&c, v);
- for (j = i; j < n_all_vars; j++)
- {
- const struct variable *w = all_vars[j];
- const union value *val_w = case_data (&c, w);
- covariance_pass_two (X, *mom[i]->mean, *mom[j]->mean,
- (double) n_data,
- v, w, val_v, val_w);
- }
- }
+ */
+ covariance_matrix_accumulate (cov, c, NULL, 0);
+ n_data++;
}
- model = pspp_linreg_cache_alloc (n_data, n_indep);
- model->depvar = v_dependent;
- /*
- For large data sets, use QR decomposition.
- */
- if (n_data > sqrt (n_indep) && n_data > GLM_LARGE_DATA)
+ covariance_matrix_compute (cov);
+
+ for (i = 0; i < n_dependent; i++)
{
- model->method = PSPP_LINREG_QR;
+ model = fit_model (cov, v_dependent[i], indep_vars, n_data, n_indep);
+ pspp_linreg_cache_free (model);
}
- coeff_init (model, X);
- pspp_linreg_with_cov (X, model);
+
casereader_destroy (reader);
- for (i = 0; i < n_all_vars; i++)
- {
- moments1_destroy (mom[i]->m);
- free (mom[i]->mean);
- free (mom[i]->variance);
- free (mom[i]->weight);
- free (mom[i]);
- }
- free (mom);
- covariance_matrix_destroy (X);
+ covariance_matrix_destroy (cov);
}
else
{
}
free (indep_vars);
free (lopts.get_indep_mean_std);
- pspp_linreg_cache_free (model);
casereader_destroy (input);
return true;