-
-/*
-  Fit the linear model via least squares. All pointers passed to pspp_linreg
-  are assumed to be allocated to the correct size and initialized to the
-  values as indicated by opts.
-
-  Y is the vector of observations of the dependent variable, DM the
-  design matrix (without an intercept column), OPTS selects which means
-  and standard deviations are computed, and CACHE receives the results.
-  The fitting method is chosen by cache->method: PSPP_LINREG_SWEEP uses
-  the sweep operator, PSPP_LINREG_CONDITIONAL_INVERSE is not implemented
-  (that branch does nothing), and anything else uses GSL's
-  gsl_multifit_linear.
-
-  Returns GSL_EFAULT if CACHE is null, the status reported by
-  gsl_multifit_linear if that call fails, and GSL_SUCCESS otherwise.
- */
-int
-pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
-             const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
-{
-  int rc = GSL_SUCCESS;
-  gsl_matrix *design = NULL;
-  gsl_matrix_view xtx;
-  gsl_vector_view xty;
-  gsl_vector_view xi;
-  gsl_vector_view xj;
-  gsl_vector *param_estimates;
-  struct pspp_coeff *coef;
-  const struct variable *v;
-  const union value *val;
-
-  size_t i;
-  size_t j;
-  double tmp;
-  double m;
-  double s;
-  double ss;
-
-  if (cache == NULL)
-    {
-      return GSL_EFAULT;
-    }
-  if (opts->get_depvar_mean_std)
-    {
-      linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
-                       &m, &s, &ss);
-      cache->depvar_mean = m;
-      cache->depvar_std = s;
-      cache->sst = ss;
-    }
-  cache_init (cache, dm);
-  for (i = 0; i < dm->m->size2; i++)
-    {
-      if (opts->get_indep_mean_std[i])
-        {
-          linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
-          v = design_matrix_col_to_var (dm, i);
-          val = NULL;
-          if (var_is_alpha (v))
-            {
-              /* For a string variable, look up the value from the offset
-                 of column i relative to the variable's column in DM. */
-              j = i - design_matrix_var_to_column (dm, v);
-              val = cat_subscript_to_value (j, v);
-            }
-          coef = pspp_linreg_get_coeff (cache, v, val);
-          pspp_coeff_set_mean (coef, m);
-          pspp_coeff_set_sd (coef, s);
-          gsl_vector_set (cache->ssx, i, ss);
-        }
-    }
-
-  if (cache->method == PSPP_LINREG_SWEEP)
-    {
-      gsl_matrix *sw;
-      /*
-         Subtract the means to improve the condition of the design
-         matrix. This requires copying dm->m and Y. We do not divide by the
-         standard deviations of the independent variables here since doing
-         so would cause a miscalculation of the residual sums of
-         squares. Dividing by the standard deviation is done by GSL's linear
-         regression functions, so if the design matrix has a poor
-         condition, use QR decomposition.
-
-         The design matrix here does not include a column for the intercept
-         (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
-         so design is allocated here when sweeping, or below if using QR.
-       */
-      design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
-      for (i = 0; i < dm->m->size2; i++)
-        {
-          v = design_matrix_col_to_var (dm, i);
-          m = pspp_linreg_get_indep_variable_mean (cache, v);
-          for (j = 0; j < dm->m->size1; j++)
-            {
-              tmp = (gsl_matrix_get (dm->m, j, i) - m);
-              gsl_matrix_set (design, j, i, tmp);
-            }
-        }
-      sw = gsl_matrix_calloc (cache->n_coeffs + 1, cache->n_coeffs + 1);
-      xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_coeffs, cache->n_coeffs);
-
-      /* Fill the diagonal and the upper triangle of X^T X only. */
-      for (i = 0; i < xtx.matrix.size1; i++)
-        {
-          tmp = gsl_vector_get (cache->ssx, i);
-          gsl_matrix_set (&(xtx.matrix), i, i, tmp);
-          xi = gsl_matrix_column (design, i);
-          for (j = (i + 1); j < xtx.matrix.size2; j++)
-            {
-              xj = gsl_matrix_column (design, j);
-              gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
-              gsl_matrix_set (&(xtx.matrix), i, j, tmp);
-            }
-        }
-
-      gsl_matrix_set (sw, cache->n_coeffs, cache->n_coeffs, cache->sst);
-      xty = gsl_matrix_column (sw, cache->n_coeffs);
-      /*
-         This loop starts at 1, with i=0 outside the loop, so we can get
-         the model sum of squares due to the first independent variable.
-       */
-      xi = gsl_matrix_column (design, 0);
-      gsl_blas_ddot (&(xi.vector), Y, &tmp);
-      gsl_vector_set (&(xty.vector), 0, tmp);
-      /* tmp becomes (x_0 . Y)^2 / ssx_0. */
-      tmp *= tmp / gsl_vector_get (cache->ssx, 0);
-      gsl_vector_set (cache->ss_indeps, 0, tmp);
-      for (i = 1; i < cache->n_coeffs; i++)
-        {
-          xi = gsl_matrix_column (design, i);
-          gsl_blas_ddot (&(xi.vector), Y, &tmp);
-          gsl_vector_set (&(xty.vector), i, tmp);
-        }
-
-      /*
-         Sweep on the matrix sw, which contains XtX, XtY and YtY.
-       */
-      reg_sweep (sw);
-      post_sweep_computations (cache, dm, sw);
-      gsl_matrix_free (sw);
-    }
-  else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
-    {
-      /*
-         Use the SVD of X^T X to find a conditional inverse of X^TX. If
-         the SVD is X^T X = U D V^T, then set the conditional inverse
-         to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
-         (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
-         sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
-         equations by setting the estimated parameter vector to
-         (X^TX)^c X^T Y.
-
-         Not implemented: nothing is computed in this branch.
-       */
-    }
-  else
-    {
-      gsl_multifit_linear_workspace *wk;
-      /*
-         Use QR decomposition via GSL.
-       */
-
-      param_estimates = gsl_vector_alloc (1 + dm->m->size2);
-      design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
-
-      /* Column 0 is the intercept column of 1's; the columns of dm->m
-         follow, shifted right by one. */
-      for (j = 0; j < dm->m->size1; j++)
-        {
-          gsl_matrix_set (design, j, 0, 1.0);
-          for (i = 0; i < dm->m->size2; i++)
-            {
-              tmp = gsl_matrix_get (dm->m, j, i);
-              gsl_matrix_set (design, j, i + 1, tmp);
-            }
-        }
-
-      wk = gsl_multifit_linear_alloc (design->size1, design->size2);
-      rc = gsl_multifit_linear (design, Y, param_estimates,
-                                cache->cov, &(cache->sse), wk);
-      if (rc == GSL_SUCCESS)
-        {
-          /* Only a successful fit leaves meaningful values in
-             param_estimates, so copy them into the cache only then. */
-          for (i = 0; i < cache->n_coeffs; i++)
-            {
-              cache->coeff[i]->estimate
-                = gsl_vector_get (param_estimates, i + 1);
-            }
-          cache->intercept = gsl_vector_get (param_estimates, 0);
-        }
-      else
-        {
-          fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
-                   __FILE__, __LINE__, rc);
-        }
-      /* Release the workspace and the estimates whether or not the fit
-         succeeded; neither is referenced after this point. */
-      gsl_multifit_linear_free (wk);
-      gsl_vector_free (param_estimates);
-    }
-
-
-  cache->ssm = cache->sst - cache->sse;
-  /*
-     Get the remaining sums of squares for the independent
-     variables.
-   */
-  m = 0;
-  for (i = 1; i < cache->n_indeps; i++)
-    {
-      j = i - 1;
-      m += gsl_vector_get (cache->ss_indeps, j);
-      tmp = cache->ssm - m;
-      gsl_vector_set (cache->ss_indeps, i, tmp);
-    }
-
-  /* design is still NULL when the conditional-inverse branch was taken.
-     NOTE(review): not every GSL release is believed to tolerate NULL in
-     gsl_matrix_free, hence the guard. */
-  if (design != NULL)
-    {
-      gsl_matrix_free (design);
-    }
-  return rc;
-}
-
-/*
- Is the coefficient COEF contained in the list of coefficients
- COEF_LIST?
- */
-static int
-has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
- size_t n)
-{
- size_t i = 0;