return 1;
}
-/*
- COV is the covariance matrix for variables included in the
- model. That means the dependent variable is in there, too.
- */
-static void
-coeff_init (pspp_linreg_cache * c, const struct design_matrix *cov)
-{
- c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff));
- c->n_coeffs = cov->m->size2 - 1;
- pspp_coeff_init (c->coeff, cov);
-}
-
-
-static pspp_linreg_cache *
+static linreg *
fit_model (const struct covariance *cov,
const struct variable *dep_var,
const struct variable ** indep_vars,
size_t n_data, size_t n_indep)
{
- pspp_linreg_cache *result = NULL;
+ linreg *result = NULL;
return result;
}
const struct variable **numerics = NULL;
const struct variable **categoricals = NULL;
int n_indep = 0;
- pspp_linreg_cache *model = NULL;
+ linreg *model = NULL;
pspp_linreg_opts lopts;
struct ccase *c;
size_t i;
#include <gsl/gsl_vector.h>
#include <math.h>
#include <stdlib.h>
-
#include <data/case.h>
#include <data/casegrouper.h>
#include <data/casereader.h>
#include <libpspp/compiler.h>
#include <libpspp/message.h>
#include <libpspp/taint.h>
-#include <math/design-matrix.h>
-#include <math/coefficient.h>
+#include <math/covariance.h>
#include <math/linreg.h>
#include <math/moments.h>
#include <output/table.h>
{
int n_trns; /* Number of transformations. */
int trns_id; /* Which trns is this one? */
- pspp_linreg_cache *c; /* Linear model for this trns. */
+ linreg *c; /* Linear model for this trns. */
};
/*
Variables used (both explanatory and response).
static size_t n_variables;
static bool run_regression (struct casereader *, struct cmd_regression *,
- struct dataset *, pspp_linreg_cache **);
+ struct dataset *, linreg **);
/*
STATISTICS subcommand output functions.
*/
-static void reg_stats_r (pspp_linreg_cache *);
-static void reg_stats_coeff (pspp_linreg_cache *);
-static void reg_stats_anova (pspp_linreg_cache *);
-static void reg_stats_outs (pspp_linreg_cache *);
-static void reg_stats_zpp (pspp_linreg_cache *);
-static void reg_stats_label (pspp_linreg_cache *);
-static void reg_stats_sha (pspp_linreg_cache *);
-static void reg_stats_ci (pspp_linreg_cache *);
-static void reg_stats_f (pspp_linreg_cache *);
-static void reg_stats_bcov (pspp_linreg_cache *);
-static void reg_stats_ses (pspp_linreg_cache *);
-static void reg_stats_xtx (pspp_linreg_cache *);
-static void reg_stats_collin (pspp_linreg_cache *);
-static void reg_stats_tol (pspp_linreg_cache *);
-static void reg_stats_selection (pspp_linreg_cache *);
-static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
- int, pspp_linreg_cache *);
+static void reg_stats_r (linreg *);
+static void reg_stats_coeff (linreg *);
+static void reg_stats_anova (linreg *);
+static void reg_stats_outs (linreg *);
+static void reg_stats_zpp (linreg *);
+static void reg_stats_label (linreg *);
+static void reg_stats_sha (linreg *);
+static void reg_stats_ci (linreg *);
+static void reg_stats_f (linreg *);
+static void reg_stats_bcov (linreg *);
+static void reg_stats_ses (linreg *);
+static void reg_stats_xtx (linreg *);
+static void reg_stats_collin (linreg *);
+static void reg_stats_tol (linreg *);
+static void reg_stats_selection (linreg *);
+static void statistics_keyword_output (void (*)(linreg *),
+ int, linreg *);
static void
-reg_stats_r (pspp_linreg_cache * c)
+reg_stats_r (linreg * c)
{
struct tab_table *t;
int n_rows = 2;
assert (c != NULL);
rsq = c->ssm / c->sst;
adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
- std_error = sqrt (pspp_linreg_mse (c));
+ std_error = sqrt (linreg_mse (c));
t = tab_create (n_cols, n_rows, 0);
tab_dim (t, tab_natural_dimensions, NULL);
tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
Table showing estimated regression coefficients.
*/
static void
-reg_stats_coeff (pspp_linreg_cache * c)
+reg_stats_coeff (linreg * c)
{
size_t j;
int n_cols = 7;
const char *label;
const struct variable *v;
- const union value *val;
struct tab_table *t;
assert (c != NULL);
tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
- tab_double (t, 2, 1, 0, c->intercept, NULL);
- std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
+ tab_double (t, 2, 1, 0, linreg_intercept (c), NULL);
+ std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
tab_double (t, 3, 1, 0, std_err, NULL);
tab_double (t, 4, 1, 0, 0.0, NULL);
- t_stat = c->intercept / std_err;
+ t_stat = linreg_intercept (c) / std_err;
tab_double (t, 5, 1, 0, t_stat, NULL);
pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
tab_double (t, 6, 1, 0, pval, NULL);
- for (j = 0; j < c->n_coeffs; j++)
+ for (j = 0; j < linreg_n_coeffs (c); j++)
{
struct string tstr;
ds_init_empty (&tstr);
this_row = j + 2;
- v = pspp_coeff_get_var (c->coeff[j], 0);
+ v = linreg_indep_var (c, j);
label = var_to_string (v);
/* Do not overwrite the variable's name. */
ds_put_cstr (&tstr, label);
- if (var_is_alpha (v))
- {
- /*
- Append the value associated with this coefficient.
- This makes sense only if we us the usual binary encoding
- for that value.
- */
-
- val = pspp_coeff_get_value (c->coeff[j], v);
-
- var_append_value_name (v, val, &tstr);
- }
-
tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
/*
Regression coefficients.
*/
- tab_double (t, 2, this_row, 0, c->coeff[j]->estimate, NULL);
+ tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL);
/*
Standard error of the coefficients.
*/
- std_err = sqrt (gsl_matrix_get (c->cov, j + 1, j + 1));
+ std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
tab_double (t, 3, this_row, 0, std_err, NULL);
/*
Standardized coefficient, i.e., regression coefficient
if all variables had unit variance.
*/
- beta = pspp_coeff_get_sd (c->coeff[j]);
- beta *= c->coeff[j]->estimate / c->depvar_std;
+ beta = sqrt (gsl_matrix_get (linreg_cov (c), j, j));
+ beta *= linreg_coeff (c, j) / c->depvar_std;
tab_double (t, 4, this_row, 0, beta, NULL);
/*
Test statistic for H0: coefficient is 0.
*/
- t_stat = c->coeff[j]->estimate / std_err;
+ t_stat = linreg_coeff (c, j) / std_err;
tab_double (t, 5, this_row, 0, t_stat, NULL);
/*
P values for the test statistic above.
*/
pval =
2 * gsl_cdf_tdist_Q (fabs (t_stat),
- (double) (c->n_obs - c->n_coeffs));
+ (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
tab_double (t, 6, this_row, 0, pval, NULL);
ds_destroy (&tstr);
}
Display the ANOVA table.
*/
static void
-reg_stats_anova (pspp_linreg_cache * c)
+reg_stats_anova (linreg * c)
{
int n_cols = 7;
int n_rows = 4;
- const double msm = c->ssm / c->dfm;
- const double mse = pspp_linreg_mse (c);
+ const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
+ const double mse = linreg_mse (c);
const double F = msm / mse;
const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
}
static void
-reg_stats_outs (pspp_linreg_cache * c)
+reg_stats_outs (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_zpp (pspp_linreg_cache * c)
+reg_stats_zpp (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_label (pspp_linreg_cache * c)
+reg_stats_label (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_sha (pspp_linreg_cache * c)
+reg_stats_sha (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_ci (pspp_linreg_cache * c)
+reg_stats_ci (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_f (pspp_linreg_cache * c)
+reg_stats_f (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_bcov (pspp_linreg_cache * c)
+reg_stats_bcov (linreg * c)
{
int n_cols;
int n_rows;
tab_vline (t, TAL_0, 1, 0, 0);
tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
- for (i = 0; i < c->n_coeffs; i++)
+ for (i = 0; i < linreg_n_coeffs (c); i++)
{
- const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
+ const struct variable *v = linreg_indep_var (c, i);
label = var_to_string (v);
tab_text (t, 2, i, TAB_CENTER, label);
tab_text (t, i + 2, 0, TAB_CENTER, label);
- for (k = 1; k < c->n_coeffs; k++)
+ for (k = 1; k < linreg_n_coeffs (c); k++)
{
col = (i <= k) ? k : i;
row = (i <= k) ? i : k;
tab_submit (t);
}
static void
-reg_stats_ses (pspp_linreg_cache * c)
+reg_stats_ses (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_xtx (pspp_linreg_cache * c)
+reg_stats_xtx (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_collin (pspp_linreg_cache * c)
+reg_stats_collin (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_tol (pspp_linreg_cache * c)
+reg_stats_tol (linreg * c)
{
assert (c != NULL);
}
static void
-reg_stats_selection (pspp_linreg_cache * c)
+reg_stats_selection (linreg * c)
{
assert (c != NULL);
}
static void
-statistics_keyword_output (void (*function) (pspp_linreg_cache *),
- int keyword, pspp_linreg_cache * c)
+statistics_keyword_output (void (*function) (linreg *),
+ int keyword, linreg * c)
{
if (keyword)
{
}
static void
-subcommand_statistics (int *keywords, pspp_linreg_cache * c)
+subcommand_statistics (int *keywords, linreg * c)
{
/*
The order here must match the order in which the STATISTICS
if (t->trns_id == t->n_trns)
{
- result = pspp_linreg_cache_free (t->c);
+ result = linreg_free (t->c);
}
free (t);
size_t i;
size_t n_vals;
struct reg_trns *trns = t_;
- pspp_linreg_cache *model;
+ linreg *model;
union value *output = NULL;
- const union value **vals = NULL;
+ const union value *tmp;
+ double *vals;
const struct variable **vars = NULL;
assert (trns != NULL);
assert (model->depvar != NULL);
assert (model->pred != NULL);
- vars = xnmalloc (model->n_coeffs, sizeof (*vars));
- n_vals = (*model->get_vars) (model, vars);
-
+ vars = linreg_get_vars (model);
+ n_vals = linreg_n_coeffs (model);
vals = xnmalloc (n_vals, sizeof (*vals));
*c = case_unshare (*c);
+
output = case_data_rw (*c, model->pred);
for (i = 0; i < n_vals; i++)
{
- vals[i] = case_data (*c, vars[i]);
+ tmp = case_data (*c, vars[i]);
+ vals[i] = tmp->f;
}
- output->f = (*model->predict) ((const struct variable **) vars,
- vals, model, n_vals);
+ output->f = linreg_predict (model, vals, n_vals);
free (vals);
- free (vars);
return TRNS_CONTINUE;
}
size_t i;
size_t n_vals;
struct reg_trns *trns = t_;
- pspp_linreg_cache *model;
+ linreg *model;
union value *output = NULL;
- const union value **vals = NULL;
- const union value *obs = NULL;
+ const union value *tmp;
+ double *vals = NULL;
+ double obs;
const struct variable **vars = NULL;
assert (trns != NULL);
assert (model->depvar != NULL);
assert (model->resid != NULL);
- vars = xnmalloc (model->n_coeffs, sizeof (*vars));
- n_vals = (*model->get_vars) (model, vars);
+ vars = linreg_get_vars (model);
+ n_vals = linreg_n_coeffs (model);
vals = xnmalloc (n_vals, sizeof (*vals));
*c = case_unshare (*c);
for (i = 0; i < n_vals; i++)
{
- vals[i] = case_data (*c, vars[i]);
+ tmp = case_data (*c, vars[i]);
+ vals[i] = tmp->f;
}
- obs = case_data (*c, model->depvar);
- output->f = (*model->residual) ((const struct variable **) vars,
- vals, obs, model, n_vals);
+ tmp = case_data (*c, model->depvar);
+ obs = tmp->f;
+ output->f = linreg_residual (model, obs, vals, n_vals);
free (vals);
- free (vars);
+
return TRNS_CONTINUE;
}
static void
reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
- pspp_linreg_cache * c, struct variable **v, int n_trns)
+ linreg * c, struct variable **v, int n_trns)
{
struct dictionary *dict = dataset_dict (ds);
static int trns_index = 1;
trns_index++;
}
static void
-subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
+subcommand_save (struct dataset *ds, int save, linreg ** models)
{
- pspp_linreg_cache **lc;
+ linreg **lc;
int n_trns = 0;
int i;
{
if (*lc != NULL)
{
- pspp_linreg_cache_free (*lc);
+ linreg_free (*lc);
}
}
}
{
struct casegrouper *grouper;
struct casereader *group;
- pspp_linreg_cache **models;
+ linreg **models;
bool ok;
size_t i;
}
return n_indep_vars;
}
-
-/* Encode categorical variables.
- Returns number of valid cases. */
-static int
-prepare_categories (struct casereader *input,
- const struct variable **vars, size_t n_vars,
- struct moments_var *mom)
+static double
+fill_covariance (gsl_matrix *cov, struct covariance *all_cov,
+ const struct variable **vars,
+ size_t n_vars, const struct variable *dep_var,
+ const struct variable **all_vars, size_t n_all_vars)
{
- int n_data;
- struct ccase *c;
size_t i;
-
- assert (vars != NULL);
- assert (mom != NULL);
-
- for (i = 0; i < n_vars; i++)
- if (var_is_alpha (vars[i]))
- cat_stored_values_create (vars[i]);
-
- n_data = 0;
- for (; (c = casereader_read (input)) != NULL; case_unref (c))
+ size_t j;
+ size_t k = 0;
+ size_t dep_subscript;
+ size_t *rows;
+ const gsl_matrix *ssizes;
+ const gsl_matrix *cm;
+ double result = 0.0;
+
+ cm = covariance_calculate (all_cov);
+ rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
+
+ for (i = 0; i < n_all_vars; i++)
{
- /*
- The second condition ensures the program will run even if
- there is only one variable to act as both explanatory and
- response.
- */
- for (i = 0; i < n_vars; i++)
+ for (j = k; j < n_vars; j++)
{
- const union value *val = case_data (c, vars[i]);
- if (var_is_alpha (vars[i]))
- cat_value_update (vars[i], val);
- else
- moments1_add (mom[i].m, val->f, 1.0);
+ if (vars[j] == all_vars[i])
+ {
+ if (vars[j] != dep_var)
+ {
+ rows[j] = i;
+ }
+ else
+ {
+ dep_subscript = i;
+ }
+ k++;
+ break;
+ }
}
- n_data++;
}
- casereader_destroy (input);
-
- return n_data;
-}
-
-static void
-coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
-{
- c->coeff = xnmalloc (dm->m->size2, sizeof (*c->coeff));
- pspp_coeff_init (c->coeff, dm);
+ for (i = 0; i < cov->size1 - 1; i++)
+ {
+ for (j = 0; j < cov->size2 - 1; j++)
+ {
+ gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
+ gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
+ }
+ }
+ ssizes = covariance_moments (all_cov, MOMENT_NONE);
+ result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
+ for (i = 0; i < cov->size1 - 1; i++)
+ {
+ gsl_matrix_set (cov, i, cov->size1 - 1,
+ gsl_matrix_get (cm, rows[i], dep_subscript));
+ gsl_matrix_set (cov, cov->size1 - 1, i,
+ gsl_matrix_get (cm, rows[i], dep_subscript));
+ if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
+ {
+ result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
+ }
+ }
+ free (rows);
}
static bool
run_regression (struct casereader *input, struct cmd_regression *cmd,
- struct dataset *ds, pspp_linreg_cache **models)
+ struct dataset *ds, linreg **models)
{
size_t i;
int n_indep = 0;
int k;
+ double n_data;
struct ccase *c;
- const struct variable **indep_vars;
- struct design_matrix *X;
- struct moments_var *mom;
- gsl_vector *Y;
-
- pspp_linreg_opts lopts;
+ struct covariance *cov;
+ const struct variable **vars;
+ const struct variable *dep_var;
+ struct casereader *reader;
+ const struct dictionary *dict;
+ gsl_matrix *this_cm;
assert (models != NULL);
+ for (i = 0; i < n_variables; i++)
+ {
+ if (!var_is_numeric (v_variables[i]))
+ {
+ msg (SE, _("REGRESSION requires numeric variables."));
+ return false;
+ }
+ }
+
c = casereader_peek (input, 0);
if (c == NULL)
{
output_split_file_values (ds, c);
case_unref (c);
+ dict = dataset_dict (ds);
if (!v_variables)
{
- dict_get_vars (dataset_dict (ds), &v_variables, &n_variables, 0);
+ dict_get_vars (dict, &v_variables, &n_variables, 0);
}
-
- for (i = 0; i < cmd->n_dependent; i++)
+ vars = xnmalloc (n_variables, sizeof (*vars));
+ cov = covariance_1pass_create (n_variables, v_variables,
+ dict_get_weight (dict), MV_ANY);
+
+ reader = casereader_clone (input);
+ reader = casereader_create_filter_missing (reader, v_variables, n_variables,
+ MV_ANY, NULL, NULL);
+ for (; (c = casereader_read (reader)) != NULL; case_unref (c))
{
- if (!var_is_numeric (cmd->v_dependent[i]))
- {
- msg (SE, _("Dependent variable must be numeric."));
- return false;
- }
+ covariance_accumulate (cov, c);
}
-
- mom = xnmalloc (n_variables, sizeof (*mom));
- for (i = 0; i < n_variables; i++)
- {
- (mom + i)->m = moments1_create (MOMENT_VARIANCE);
- (mom + i)->v = v_variables[i];
- }
- lopts.get_depvar_mean_std = 1;
-
-
- indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
-
+
for (k = 0; k < cmd->n_dependent; k++)
{
- const struct variable *dep_var;
- struct casereader *reader;
- casenumber row;
- struct ccase *c;
- size_t n_data; /* Number of valid cases. */
-
dep_var = cmd->v_dependent[k];
- n_indep = identify_indep_vars (indep_vars, dep_var);
- reader = casereader_clone (input);
- reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
- MV_ANY, NULL, NULL);
- reader = casereader_create_filter_missing (reader, &dep_var, 1,
- MV_ANY, NULL, NULL);
- n_data = prepare_categories (casereader_clone (reader),
- indep_vars, n_indep, mom);
-
- if ((n_data > 0) && (n_indep > 0))
+ n_indep = identify_indep_vars (vars, dep_var);
+
+ this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
+ n_data = fill_covariance (this_cm, cov, vars, n_indep,
+ dep_var, v_variables, n_variables);
+ models[k] = linreg_alloc (dep_var, (const struct variable **) vars,
+ n_data, n_indep);
+ models[k]->depvar = dep_var;
+
+ /*
+ For large data sets, use QR decomposition.
+ */
+ if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
+ {
+ models[k]->method = LINREG_QR;
+ }
+
+ if (n_data > 0)
{
- Y = gsl_vector_alloc (n_data);
- X =
- design_matrix_create (n_indep,
- (const struct variable **) indep_vars,
- n_data);
- lopts.get_indep_mean_std = xnmalloc (X->m->size2, sizeof (int));
- for (i = 0; i < X->m->size2; i++)
- {
- lopts.get_indep_mean_std[i] = 1;
- }
- models[k] = pspp_linreg_cache_alloc (dep_var, (const struct variable **) indep_vars,
- X->m->size1, n_indep);
- models[k]->depvar = dep_var;
- /*
- For large data sets, use QR decomposition.
- */
- if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
- {
- models[k]->method = PSPP_LINREG_QR;
- }
-
- /*
- The second pass fills the design matrix.
- */
- reader = casereader_create_counter (reader, &row, -1);
- for (; (c = casereader_read (reader)) != NULL; case_unref (c))
- {
- for (i = 0; i < n_indep; ++i)
- {
- const struct variable *v = indep_vars[i];
- const union value *val = case_data (c, v);
- if (var_is_alpha (v))
- design_matrix_set_categorical (X, row, v, val);
- else
- design_matrix_set_numeric (X, row, v, val);
- }
- gsl_vector_set (Y, row, case_num (c, dep_var));
- }
- /*
- Now that we know the number of coefficients, allocate space
- and store pointers to the variables that correspond to the
- coefficients.
- */
- coeff_init (models[k], X);
-
/*
- Find the least-squares estimates and other statistics.
- */
- pspp_linreg ((const gsl_vector *) Y, X, &lopts, models[k]);
-
+ Find the least-squares estimates and other statistics.
+ */
+ linreg_fit (this_cm, models[k]);
+
if (!taint_has_tainted_successor (casereader_get_taint (input)))
{
subcommand_statistics (cmd->a_statistics, models[k]);
}
-
- gsl_vector_free (Y);
- design_matrix_destroy (X);
- free (lopts.get_indep_mean_std);
}
else
{
msg (SE,
gettext ("No valid data found. This command was skipped."));
}
- casereader_destroy (reader);
- }
- for (i = 0; i < n_variables; i++)
- {
- moments1_destroy ((mom + i)->m);
}
- free (mom);
- free (indep_vars);
- casereader_destroy (input);
+ casereader_destroy (reader);
+ free (vars);
+ casereader_destroy (input);
+ covariance_destroy (cov);
+
return true;
}
src/math/chart-geometry.c \
src/math/chart-geometry.h \
src/math/box-whisker.c src/math/box-whisker.h \
- src/math/coefficient.c \
- src/math/coefficient.h \
src/math/categoricals.h \
src/math/categoricals.c \
src/math/covariance.c \
src/math/covariance.h \
- src/math/covariance-matrix.c \
- src/math/covariance-matrix.h \
- src/math/design-matrix.c src/math/design-matrix.h \
src/math/extrema.c src/math/extrema.h \
src/math/group.c src/math/group.h \
src/math/group-proc.h \
#include <gsl/gsl_cblas.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_fit.h>
+#include <gsl/gsl_linalg.h>
#include <gsl/gsl_multifit.h>
#include <linreg/sweep.h>
-#include <math/coefficient.h>
#include <math/linreg.h>
-#include <math/coefficient.h>
-#include <math/design-matrix.h>
#include <src/data/category.h>
#include <src/data/variable.h>
#include <src/data/value.h>
*/
-/*
- Get the mean and standard deviation of a vector
- of doubles via a form of the Kalman filter as
- described on page 32 of [3].
- */
-static int
-linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
-{
- size_t i;
- double j = 0.0;
- double d;
- double tmp;
- double mean;
- double variance;
-
- mean = gsl_vector_get (&v.vector, 0);
- variance = 0;
- for (i = 1; i < v.vector.size; i++)
- {
- j = (double) i + 1.0;
- tmp = gsl_vector_get (&v.vector, i);
- d = (tmp - mean) / j;
- mean += d;
- variance += j * (j - 1.0) * d * d;
- }
- *mp = mean;
- *sp = sqrt (variance / (j - 1.0));
- *ssp = variance;
-
- return GSL_SUCCESS;
-}
-
-/*
- Set V to contain an array of pointers to the variables
- used in the model. V must be at least C->N_COEFFS in length.
- The return value is the number of distinct variables found.
- */
-int
-pspp_linreg_get_vars (const void *c_, const struct variable **v)
+const struct variable **
+linreg_get_vars (const linreg *c)
{
- const pspp_linreg_cache *c = c_;
- const struct variable *tmp;
- int i;
- int j;
- int result = 0;
-
- /*
- Make sure the caller doesn't try to sneak a variable
- into V that is not in the model.
- */
- for (i = 0; i < c->n_coeffs; i++)
- {
- v[i] = NULL;
- }
- for (j = 0; j < c->n_coeffs; j++)
- {
- tmp = pspp_coeff_get_var (c->coeff[j], 0);
- assert (tmp != NULL);
- /* Repeated variables are likely to bunch together, at the end
- of the array. */
- i = result - 1;
- while (i >= 0 && v[i] != tmp)
- {
- i--;
- }
- if (i < 0 && result < c->n_coeffs)
- {
- v[result] = tmp;
- result++;
- }
- }
- return result;
+ return c->indep_vars;
}
/*
- Allocate a pspp_linreg_cache and return a pointer
- to it. n is the number of cases, p is the number of
- independent variables.
+ Allocate a linreg and return a pointer to it. n is the number of
+ cases, p is the number of independent variables.
*/
-pspp_linreg_cache *
-pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable **indep_vars,
- size_t n, size_t p)
+linreg *
+linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
+ double n, size_t p)
{
- size_t i;
- pspp_linreg_cache *c;
+ linreg *c;
- c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
+ c = (linreg *) malloc (sizeof (linreg));
c->depvar = depvar;
c->indep_vars = indep_vars;
c->indep_means = gsl_vector_alloc (p);
*/
c->n_obs = n;
c->n_indeps = p;
- c->n_coeffs = 0;
- for (i = 0; i < p; i++)
- {
- if (var_is_numeric (indep_vars[i]))
- {
- c->n_coeffs++;
- }
- else
- {
- c->n_coeffs += cat_get_n_categories (indep_vars[i]) - 1;
- }
- }
-
+ c->n_coeffs = p;
+ c->coeff = xnmalloc (p, sizeof (*c->coeff));
c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1);
+ c->dft = n - 1;
+ c->dfm = p;
+ c->dfe = c->dft - c->dfm;
+ c->intercept = 0.0;
/*
Default settings.
*/
- c->method = PSPP_LINREG_SWEEP;
- c->predict = pspp_linreg_predict;
- c->residual = pspp_linreg_residual; /* The procedure to compute my
- residuals. */
- c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
- pointers to model
- variables. */
+ c->method = LINREG_SWEEP;
c->resid = NULL; /* The variable storing my residuals. */
c->pred = NULL; /* The variable storing my predicted values. */
}
bool
-pspp_linreg_cache_free (void *m)
+linreg_free (void *m)
{
- int i;
-
- pspp_linreg_cache *c = m;
+ linreg *c = m;
if (c != NULL)
{
gsl_vector_free (c->indep_means);
gsl_vector_free (c->indep_std);
- gsl_vector_free (c->ss_indeps);
gsl_matrix_free (c->cov);
gsl_vector_free (c->ssx);
- for (i = 0; i < c->n_coeffs; i++)
- {
- pspp_coeff_free (c->coeff[i]);
- }
free (c->coeff);
free (c);
}
return true;
}
-static void
-cache_init (pspp_linreg_cache *cache)
-{
- assert (cache != NULL);
- cache->dft = cache->n_obs - 1;
- cache->dfm = cache->n_indeps;
- cache->dfe = cache->dft - cache->dfm;
- cache->intercept = 0.0;
-}
static void
-post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *dm,
- gsl_matrix *sw)
+post_sweep_computations (linreg *l, gsl_matrix *sw)
{
gsl_matrix *xm;
gsl_matrix_view xtx;
int rc;
assert (sw != NULL);
- assert (cache != NULL);
+ assert (l != NULL);
- cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
- cache->mse = cache->sse / cache->dfe;
+ l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
+ l->mse = l->sse / l->dfe;
/*
Get the intercept.
*/
- m = cache->depvar_mean;
- for (i = 0; i < cache->n_indeps; i++)
+ m = l->depvar_mean;
+ for (i = 0; i < l->n_indeps; i++)
{
- tmp = gsl_matrix_get (sw, i, cache->n_indeps);
- cache->coeff[i]->estimate = tmp;
- m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i));
+ tmp = gsl_matrix_get (sw, i, l->n_indeps);
+ l->coeff[i] = tmp;
+ m -= tmp * linreg_get_indep_variable_mean (l, i);
}
/*
Get the covariance matrix of the parameter estimates.
The loops below do not compute the entries related
to the estimated intercept.
*/
- for (i = 0; i < cache->n_indeps; i++)
- for (j = i; j < cache->n_indeps; j++)
+ for (i = 0; i < l->n_indeps; i++)
+ for (j = i; j < l->n_indeps; j++)
{
- tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
- gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
+ tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
+ gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
}
/*
Get the covariances related to the intercept.
*/
- xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
- xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
- xm = gsl_matrix_calloc (1, cache->n_indeps);
+ xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
+ xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
+ xm = gsl_matrix_calloc (1, l->n_indeps);
for (i = 0; i < xm->size2; i++)
{
gsl_matrix_set (xm, 0, i,
- pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)));
+ linreg_get_indep_variable_mean (l, i));
}
- rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
+ rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
&xtx.matrix, xm, 0.0, &xmxtx.matrix);
gsl_matrix_free (xm);
if (rc == GSL_SUCCESS)
{
- tmp = cache->mse / cache->n_obs;
- for (i = 1; i < 1 + cache->n_indeps; i++)
+ tmp = l->mse / l->n_obs;
+ for (i = 1; i < 1 + l->n_indeps; i++)
{
- tmp -= gsl_matrix_get (cache->cov, 0, i)
- * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1));
+ tmp -= gsl_matrix_get (l->cov, 0, i)
+ * linreg_get_indep_variable_mean (l, i - 1);
}
- gsl_matrix_set (cache->cov, 0, 0, tmp);
+ gsl_matrix_set (l->cov, 0, 0, tmp);
- cache->intercept = m;
+ l->intercept = m;
}
else
{
exit (rc);
}
}
-
-/*
- Fit the linear model via least squares. All pointers passed to pspp_linreg
- are assumed to be allocated to the correct size and initialized to the
- values as indicated by opts.
- */
-int
-pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
- const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
-{
- int rc;
- gsl_matrix *design = NULL;
- gsl_matrix_view xtx;
- gsl_vector_view xty;
- gsl_vector_view xi;
- gsl_vector_view xj;
- gsl_vector *param_estimates;
- struct pspp_coeff *coef;
- const struct variable *v;
- const union value *val;
-
- size_t i;
- size_t j;
- double tmp;
- double m;
- double s;
- double ss;
-
- if (cache == NULL)
- {
- return GSL_EFAULT;
- }
- if (opts->get_depvar_mean_std)
- {
- linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
- &m, &s, &ss);
- cache->depvar_mean = m;
- cache->depvar_std = s;
- cache->sst = ss;
- }
- cache_init (cache);
- cache->n_coeffs = dm->m->size2;
- for (i = 0; i < dm->m->size2; i++)
- {
- if (opts->get_indep_mean_std[i])
- {
- linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
- v = design_matrix_col_to_var (dm, i);
- val = NULL;
- if (var_is_alpha (v))
- {
- j = i - design_matrix_var_to_column (dm, v);
- val = cat_subscript_to_value (j, v);
- }
- coef = pspp_linreg_get_coeff (cache, v, val);
- pspp_coeff_set_mean (coef, m);
- pspp_coeff_set_sd (coef, s);
- gsl_vector_set (cache->ssx, i, ss);
-
- }
- }
-
- if (cache->method == PSPP_LINREG_SWEEP)
- {
- gsl_matrix *sw;
- /*
- Subtract the means to improve the condition of the design
- matrix. This requires copying dm->m and Y. We do not divide by the
- standard deviations of the independent variables here since doing
- so would cause a miscalculation of the residual sums of
- squares. Dividing by the standard deviation is done GSL's linear
- regression functions, so if the design matrix has a poor
- condition, use QR decomposition.
-
- The design matrix here does not include a column for the intercept
- (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
- so design is allocated here when sweeping, or below if using QR.
- */
- design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
- for (i = 0; i < dm->m->size2; i++)
- {
- v = design_matrix_col_to_var (dm, i);
- m = pspp_linreg_get_indep_variable_mean (cache, v);
- for (j = 0; j < dm->m->size1; j++)
- {
- tmp = (gsl_matrix_get (dm->m, j, i) - m);
- gsl_matrix_set (design, j, i, tmp);
- }
- }
- sw = gsl_matrix_calloc (cache->n_coeffs + 1, cache->n_coeffs + 1);
- xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_coeffs, cache->n_coeffs);
-
- for (i = 0; i < xtx.matrix.size1; i++)
- {
- tmp = gsl_vector_get (cache->ssx, i);
- gsl_matrix_set (&(xtx.matrix), i, i, tmp);
- xi = gsl_matrix_column (design, i);
- for (j = (i + 1); j < xtx.matrix.size2; j++)
- {
- xj = gsl_matrix_column (design, j);
- gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
- gsl_matrix_set (&(xtx.matrix), i, j, tmp);
- }
- }
-
- gsl_matrix_set (sw, cache->n_coeffs, cache->n_coeffs, cache->sst);
- xty = gsl_matrix_column (sw, cache->n_coeffs);
- /*
- This loop starts at 1, with i=0 outside the loop, so we can get
- the model sum of squares due to the first independent variable.
- */
- xi = gsl_matrix_column (design, 0);
- gsl_blas_ddot (&(xi.vector), Y, &tmp);
- gsl_vector_set (&(xty.vector), 0, tmp);
- tmp *= tmp / gsl_vector_get (cache->ssx, 0);
- gsl_vector_set (cache->ss_indeps, 0, tmp);
- for (i = 1; i < cache->n_coeffs; i++)
- {
- xi = gsl_matrix_column (design, i);
- gsl_blas_ddot (&(xi.vector), Y, &tmp);
- gsl_vector_set (&(xty.vector), i, tmp);
- }
-
- /*
- Sweep on the matrix sw, which contains XtX, XtY and YtY.
- */
- reg_sweep (sw);
- post_sweep_computations (cache, dm, sw);
- gsl_matrix_free (sw);
- }
- else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
- {
- /*
- Use the SVD of X^T X to find a conditional inverse of X^TX. If
- the SVD is X^T X = U D V^T, then set the conditional inverse
- to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
- (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
- sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
- equations by setting the estimated parameter vector to
- (X^TX)^c X^T Y.
- */
- }
- else
- {
- gsl_multifit_linear_workspace *wk;
- /*
- Use QR decomposition via GSL.
- */
-
- param_estimates = gsl_vector_alloc (1 + dm->m->size2);
- design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
-
- for (j = 0; j < dm->m->size1; j++)
- {
- gsl_matrix_set (design, j, 0, 1.0);
- for (i = 0; i < dm->m->size2; i++)
- {
- tmp = gsl_matrix_get (dm->m, j, i);
- gsl_matrix_set (design, j, i + 1, tmp);
- }
- }
-
- wk = gsl_multifit_linear_alloc (design->size1, design->size2);
- rc = gsl_multifit_linear (design, Y, param_estimates,
- cache->cov, &(cache->sse), wk);
- for (i = 0; i < cache->n_coeffs; i++)
- {
- cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
- }
- cache->intercept = gsl_vector_get (param_estimates, 0);
- if (rc == GSL_SUCCESS)
- {
- gsl_multifit_linear_free (wk);
- gsl_vector_free (param_estimates);
- }
- else
- {
- fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
- __FILE__, __LINE__, rc);
- }
- }
-
-
- cache->ssm = cache->sst - cache->sse;
- /*
- Get the remaining sums of squares for the independent
- variables.
- */
- m = 0;
- for (i = 1; i < cache->n_indeps; i++)
- {
- j = i - 1;
- m += gsl_vector_get (cache->ss_indeps, j);
- tmp = cache->ssm - m;
- gsl_vector_set (cache->ss_indeps, i, tmp);
- }
-
- gsl_matrix_free (design);
- return GSL_SUCCESS;
-}
/*
- Is the coefficient COEF contained in the list of coefficients
- COEF_LIST?
- */
-static int
-has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
- size_t n)
-{
- size_t i = 0;
-
- while (i < n)
- {
- if (coef_list[i] == coef)
- {
- return 1;
- }
- i++;
- }
- return 0;
-}
-/*
- Predict the value of the dependent variable with the
- new set of predictors. PREDICTORS must point to a list
- of variables, each of whose values are stored in VALS,
- in the same order.
+ Predict the value of the dependent variable with the new set of
+ predictors. VALS are assumed to be in the order corresponding to the
+ order of the coefficients in the linreg struct.
*/
double
-pspp_linreg_predict (const struct variable **predictors,
- const union value **vals, const void *c_, int n_vals)
+linreg_predict (const linreg *c, const double *vals, size_t n_vals)
{
- const pspp_linreg_cache *c = c_;
- int j;
- size_t next_coef = 0;
- const struct pspp_coeff **coef_list;
- const struct pspp_coeff *coe;
+ size_t j;
double result;
- double tmp;
- if (predictors == NULL || vals == NULL || c == NULL)
+ assert (n_vals = c->n_coeffs);
+ if (vals == NULL || c == NULL)
{
return GSL_NAN;
}
/* The stupid model: just guess the mean. */
return c->depvar_mean;
}
- coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list));
result = c->intercept;
- /*
- The loops guard against the possibility that the caller passed us
- inadequate information, such as too few or too many values, or
- a redundant list of variable names.
- */
for (j = 0; j < n_vals; j++)
{
- coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]);
- if (!has_coefficient (coef_list, coe, next_coef))
- {
- tmp = pspp_coeff_get_est (coe);
- if (var_is_numeric (predictors[j]))
- {
- tmp *= vals[j]->f;
- }
- result += tmp;
- coef_list[next_coef++] = coe;
- }
+ result += linreg_coeff (c, j) * vals[j];
}
- free (coef_list);
return result;
}
double
-pspp_linreg_residual (const struct variable **predictors,
- const union value **vals,
- const union value *obs, const void *c, int n_vals)
+linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
{
- double pred;
- double result;
-
- if (predictors == NULL || vals == NULL || c == NULL || obs == NULL)
+ if (vals == NULL || c == NULL)
{
return GSL_NAN;
}
- pred = pspp_linreg_predict (predictors, vals, c, n_vals);
-
- result = isnan (pred) ? GSL_NAN : (obs->f - pred);
- return result;
+ return (obs - linreg_predict (c, vals, n_vals));
}
-/*
- Which coefficient is associated with V? The VAL argument is relevant
- only to categorical variables.
- */
-struct pspp_coeff *
-pspp_linreg_get_coeff (const pspp_linreg_cache * c,
- const struct variable *v, const union value *val)
-{
- if (c == NULL)
- {
- return NULL;
- }
- if (c->coeff == NULL || c->n_indeps == 0 || v == NULL)
- {
- return NULL;
- }
- return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val);
-}
-/*
- Return the standard deviation of the independent variable.
- */
-double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v)
+double linreg_get_indep_variable_sd (linreg *c, size_t j)
{
- if (var_is_numeric (v))
- {
- const struct pspp_coeff *coef;
- coef = pspp_linreg_get_coeff (c, v, NULL);
- return pspp_coeff_get_sd (coef);
- }
- return GSL_NAN;
+ assert (c != NULL);
+ return gsl_vector_get (c->indep_std, j);
}
-void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v,
- double s)
+void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
{
- if (var_is_numeric (v))
- {
- struct pspp_coeff *coef;
- coef = pspp_linreg_get_coeff (c, v, NULL);
- pspp_coeff_set_sd (coef, s);
- }
+ assert (c != NULL);
+ gsl_vector_set (c->indep_std, j, s);
}
/*
Mean of the independent variable.
*/
-double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v)
+double linreg_get_indep_variable_mean (linreg *c, size_t j)
{
- if (var_is_numeric (v))
- {
- struct pspp_coeff *coef;
- coef = pspp_linreg_get_coeff (c, v, NULL);
- return pspp_coeff_get_mean (coef);
- }
- return 0.0;
+ assert (c != NULL);
+ return gsl_vector_get (c->indep_means, j);
}
-void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v,
- double m)
+void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
{
- if (var_is_numeric (v))
- {
- struct pspp_coeff *coef;
- coef = pspp_linreg_get_coeff (c, v, NULL);
- pspp_coeff_set_mean (coef, m);
- }
+ assert (c != NULL);
+ gsl_vector_set (c->indep_means, j, m);
}
-/*
- Make sure the dependent variable is at the last column, and that
- only variables in the model are in the covariance matrix.
- */
-static struct design_matrix *
-rearrange_covariance_matrix (const struct covariance_matrix *cm, pspp_linreg_cache *c)
+static void
+linreg_fit_qr (const gsl_matrix *cov, linreg *l)
{
- const struct variable **model_vars;
- struct design_matrix *cov;
- struct design_matrix *result;
- size_t *permutation;
+ gsl_matrix *xtx;
+ gsl_matrix *q;
+ gsl_matrix *r;
+ gsl_vector *xty;
+ gsl_vector *tau;
+ gsl_vector *params;
+ double tmp = 0.0;
size_t i;
size_t j;
- size_t k;
- size_t n_coeffs = 0;
- assert (cm != NULL);
- cov = covariance_to_design (cm);
- assert (cov != NULL);
- assert (c != NULL);
- assert (cov->m->size1 > 0);
- assert (cov->m->size2 == cov->m->size1);
- model_vars = xnmalloc (1 + c->n_indeps, sizeof (*model_vars));
+ xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
+ xty = gsl_vector_alloc (cov->size1 - 1);
+ tau = gsl_vector_alloc (cov->size1 - 1);
+ params = gsl_vector_alloc (cov->size1 - 1);
- /*
- Put the model variables in the right order in MODEL_VARS.
- Count the number of coefficients.
- */
- for (i = 0; i < c->n_indeps; i++)
+ for (i = 0; i < xtx->size1; i++)
{
- model_vars[i] = c->indep_vars[i];
+ gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
+ for (j = 0; j < xtx->size2; j++)
+ {
+ gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
+ }
+ }
+ gsl_linalg_QR_decomp (xtx, tau);
+ q = gsl_matrix_alloc (xtx->size1, xtx->size2);
+ r = gsl_matrix_alloc (xtx->size1, xtx->size2);
+ gsl_linalg_QR_unpack (xtx, tau, q, r);
+ gsl_linalg_QR_solve (xtx, tau, xty, params);
+ for (i = 0; i < params->size; i++)
+ {
+ l->coeff[i] = gsl_vector_get (params, i);
+ }
+
+ l->intercept = l->depvar_mean;
+ for (i = 0; i < l->n_indeps; i++)
+ {
+ l->intercept -= l->coeff[i] * linreg_get_indep_variable_mean (l, i);
}
- model_vars[i] = c->depvar;
- result = covariance_matrix_create (1 + c->n_indeps, model_vars);
- permutation = xnmalloc (design_matrix_get_n_cols (result), sizeof (*permutation));
- for (j = 0; j < cov->m->size2; j++)
+ l->sse = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
+ for (i = 0; i < l->n_indeps; i++)
+ {
+ tmp += gsl_vector_get (xty, i) * l->coeff[i];
+ }
+ l->sse -= 2.0 * tmp;
+ for (i = 0; i < xtx->size1; i++)
{
- k = 0;
- while (k < result->m->size2)
+ tmp = 0.0;
+ for (j = i; j < xtx->size2; j++)
{
- if (design_matrix_col_to_var (cov, j) == design_matrix_col_to_var (result, k))
- {
- permutation[k] = j;
- }
- k++;
+ tmp += gsl_matrix_get (xtx, i, j) * l->coeff[j];
}
+ l->sse += tmp * tmp;
}
- for (i = 0; i < result->m->size1; i++)
- for (j = 0; j < result->m->size2; j++)
- {
- gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, permutation[i], permutation[j]));
- }
- free (permutation);
- free (model_vars);
- return result;
+
+#if 0
+ p = l->hat->size1 - 1;
+ for (i = 0; i < l->cov->size1 - 1; i++)
+ {
+ gsl_matrix_set (l->cov, p - i, p - i, 1.0 / gsl_matrix_get (r, p - i, p - i));
+ for (j = 0; j < i; j++)
+ {
+ tmp = -1.0 * gsl_matrix_get (r, p - i, p - j);
+ tmp /= gsl_matrix_get (r, p - i, p - i) * gsl_matrix_get (r, p - j, p - j);
+ gsl_matrix_set (l->cov, p - i, p - j, tmp);
+ }
+ }
+#endif
+ gsl_matrix_transpose_memcpy (l->cov, q);
+ gsl_blas_dtrsm (CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 1.0,
+ r, l->cov);
+
+ gsl_matrix_free (q);
+ gsl_matrix_free (r);
+ gsl_vector_free (xty);
+ gsl_vector_free (tau);
+ gsl_matrix_free (xtx);
+ gsl_vector_free (params);
}
+
/*
- Estimate the model parameters from the covariance matrix only. This
- method uses less memory than PSPP_LINREG, which requires the entire
- data set to be stored in memory.
-
- The function assumes FULL_COV may contain columns corresponding to
- variables that are not in the model. It fixes this in
- REARRANG_COVARIANCE_MATRIX. This allows the caller to compute a
- large covariance matrix once before, then pass it to this without
- having to alter it. The problem is that this means the caller must
- set CACHE->N_COEFFS.
+ Estimate the model parameters from the covariance matrix. This
+ function assumes the covariance entries corresponding to the
+ dependent variable are in the final row and column of the covariance
+ matrix.
*/
void
-pspp_linreg_with_cov (const struct covariance_matrix *full_cov,
- pspp_linreg_cache * cache)
+linreg_fit (const gsl_matrix *cov, linreg *l)
{
- struct design_matrix *cov;
+ gsl_matrix *params;
+ assert (l != NULL);
+ assert (cov != NULL);
- assert (full_cov != NULL);
- assert (cache != NULL);
+ params = gsl_matrix_calloc (cov->size1, cov->size2);
+ gsl_matrix_memcpy (params, cov);
- cov = rearrange_covariance_matrix (full_cov, cache);
- cache_init (cache);
- reg_sweep (cov->m);
- post_sweep_computations (cache, cov, cov->m);
- design_matrix_destroy (cov);
+ if (l->method == LINREG_SWEEP)
+ {
+ reg_sweep (params);
+ post_sweep_computations (l, params);
+ }
+ else if (l->method == LINREG_QR)
+ {
+ linreg_fit_qr (params, l);
+ }
+ gsl_matrix_free (params);
}
-double pspp_linreg_mse (const pspp_linreg_cache *c)
+double linreg_mse (const linreg *c)
{
assert (c != NULL);
return (c->sse / c->dfe);
}
+
+double linreg_intercept (const linreg *c)
+{
+ return c->intercept;
+}
+
+gsl_matrix *
+linreg_cov (const linreg *c)
+{
+ return c->cov;
+}
+
+double
+linreg_coeff (const linreg *c, size_t i)
+{
+ return (c->coeff[i]);
+}
+
+const struct variable *
+linreg_indep_var (const linreg *c, size_t i)
+{
+ return (c->indep_vars[i]);
+}
+
+size_t
+linreg_n_coeffs (const linreg *c)
+{
+ return c->n_coeffs;
+}
+
+size_t
+linreg_n_obs (const linreg *c)
+{
+ return c->n_obs;
+}
+
+double
+linreg_ssreg (const linreg *c)
+{
+ return c->ssm;
+}
+
+double
+linreg_dfmodel ( const linreg *c)
+{
+ return c->dfm;
+}
#include <gsl/gsl_math.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
-#include <src/math/coefficient.h>
-#include <math/covariance-matrix.h>
enum
{
- PSPP_LINREG_CONDITIONAL_INVERSE,
- PSPP_LINREG_QR,
- PSPP_LINREG_SWEEP,
+ LINREG_CONDITIONAL_INVERSE,
+ LINREG_QR,
+ LINREG_SWEEP,
};
*/
-struct pspp_linreg_cache_struct
+struct linreg_struct
{
- int n_obs; /* Number of observations. */
+ double n_obs; /* Number of observations. */
int n_indeps; /* Number of independent variables. */
int n_coeffs; /* The intercept is not considered a
coefficient here. */
const struct variable *depvar;
const struct variable **indep_vars;
- gsl_vector *residuals;
- struct pspp_coeff **coeff;
+ double *coeff;
double intercept;
int method; /* Method to use to estimate parameters. */
/*
*/
gsl_matrix *hat;
- double (*predict) (const struct variable **, const union value **,
- const void *, int);
- double (*residual) (const struct variable **,
- const union value **,
- const union value *, const void *, int);
- /*
- Returns pointers to the variables used in the model.
- */
- int (*get_vars) (const void *, const struct variable **);
struct variable *resid;
struct variable *pred;
};
-typedef struct pspp_linreg_cache_struct pspp_linreg_cache;
+typedef struct linreg_struct linreg;
-/*
- Allocate a pspp_linreg_cache and return a pointer
- to it. n is the number of cases, p is the number of
- independent variables.
- */
-pspp_linreg_cache *pspp_linreg_cache_alloc (const struct variable *, const struct variable **,
- size_t, size_t);
+linreg *linreg_alloc (const struct variable *, const struct variable **,
+ double, size_t);
-bool pspp_linreg_cache_free (void *);
+bool linreg_free (void *);
/*
Fit the linear model via least squares. All pointers passed to pspp_linreg
are assumed to be allocated to the correct size and initialized to the
values as indicated by opts.
*/
-int
-pspp_linreg (const gsl_vector *, const struct design_matrix *,
- const pspp_linreg_opts *, pspp_linreg_cache *);
+void
+linreg_fit (const gsl_matrix *, linreg *);
double
-pspp_linreg_predict (const struct variable **, const union value **,
- const void *, int);
+linreg_predict (const linreg *, const double *, size_t);
double
-pspp_linreg_residual (const struct variable **, const union value **,
- const union value *, const void *, int);
-/*
- All variables used in the model.
- */
-int pspp_linreg_get_vars (const void *, const struct variable **);
+linreg_residual (const linreg *, double, const double *, size_t);
+const struct variable ** linreg_get_vars (const linreg *);
-struct pspp_coeff *pspp_linreg_get_coeff (const pspp_linreg_cache
- *,
- const struct variable
- *,
- const union value *);
/*
Return or set the standard deviation of the independent variable.
*/
-double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *, const struct variable *);
-void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *, const struct variable *, double);
+double linreg_get_indep_variable_sd (linreg *, size_t);
+void linreg_set_indep_variable_sd (linreg *, size_t, double);
/*
Mean of the independent variable.
*/
-double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *, const struct variable *);
-void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *, const struct variable *, double);
+double linreg_get_indep_variable_mean (linreg *, size_t);
+void linreg_set_indep_variable_mean (linreg *, size_t, double);
-/*
- Regression using only the covariance matrix.
- */
-void pspp_linreg_with_cov (const struct covariance_matrix *, pspp_linreg_cache *);
-double pspp_linreg_mse (const pspp_linreg_cache *);
+double linreg_mse (const linreg *);
+
+double linreg_intercept (const linreg *);
+
+gsl_matrix * linreg_cov (const linreg *);
+double linreg_coeff (const linreg *, size_t);
+const struct variable * linreg_indep_var (const linreg *, size_t);
+size_t linreg_n_coeffs (const linreg *);
+size_t linreg_n_obs (const linreg *);
+double linreg_ssreg (const linreg *);
+double linreg_dfmodel (const linreg *);
#endif