X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.q;h=80f9c0293c0386ae51372ae3bf0ac29c666a989f;hb=871f4456a207925fdce3df3150af3f3b263b2776;hp=7068d7f67108b5af00431712d78dea051e08b966;hpb=888d0f91d57e0c3c5a4206c30ac71eb87bf44227;p=pspp-builds.git diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 7068d7f6..80f9c029 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -1,6 +1,5 @@ /* PSPP - linear regression. Copyright (C) 2005 Free Software Foundation, Inc. - Written by Jason H Stover . This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -46,6 +45,7 @@ #include #include #include +#include #include #include "gettext.h" @@ -83,6 +83,15 @@ /* (functions) */ static struct cmd_regression cmd; +/* + Moments for each of the variables used. + */ +struct moments_var +{ + struct moments1 *m; + struct variable *v; +}; + /* Linear regression models. */ static pspp_linreg_cache **models = NULL; @@ -239,7 +248,7 @@ reg_stats_coeff (pspp_linreg_cache * c) */ val = pspp_coeff_get_value (c->coeff[j], v); - val_s = value_to_string (val, v); + val_s = var_get_value_name (v, val); strncat (tmp, val_s, MAX_STRING); } @@ -270,7 +279,7 @@ reg_stats_coeff (pspp_linreg_cache * c) /* P values for the test statistic above. */ - pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0); + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (c->n_obs - c->n_coeffs)); tab_float (t, 6, j + 1, 0, pval, 10, 2); } tab_title (t, _("Coefficients")); @@ -566,12 +575,12 @@ regression_trns_pred_proc (void *t_, struct ccase *c, n_vals = (*model->get_vars) (model, vars); vals = xnmalloc (n_vals, sizeof (*vals)); - output = case_data_rw (c, model->pred->fv); + output = case_data_rw (c, model->pred); assert (output != NULL); for (i = 0; i < n_vals; i++) { - vals[i] = case_data (c, vars[i]->fv); + vals[i] = case_data (c, vars[i]); } output->f = (*model->predict) ((const struct variable **) vars, vals, model, n_vals); @@ -606,14 +615,14 @@ regression_trns_resid_proc (void *t_, struct ccase *c, n_vals = (*model->get_vars) (model, vars); vals = xnmalloc (n_vals, sizeof (*vals)); - output = case_data_rw (c, model->resid->fv); + output = case_data_rw (c, model->resid); assert (output != NULL); for (i = 0; i < n_vals; i++) { - vals[i] = case_data (c, vars[i]->fv); + vals[i] = case_data (c, vars[i]); } - obs = case_data (c, model->depvar->fv); + obs = case_data (c, model->depvar); output->f = (*model->residual) ((const struct variable **) vars, vals, obs, model, n_vals); free (vals); @@ -722,7 +731,7 @@ reg_inserted (const struct variable *v, struct variable **varlist, int n_vars) for (i = 0; i < n_vars; i++) { - if (v->index == varlist[i]->index) + if (v == varlist[i]) { return 1; } @@ -734,20 +743,16 @@ static void reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c) { int i; - size_t j; int n_vars = 0; struct variable **varlist; - struct pspp_coeff *coeff; - const struct variable *v; - union value *val; fprintf (fp, "%s", reg_export_categorical_encode_1); varlist = xnmalloc (c->n_indeps, sizeof (*varlist)); for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */ { - coeff = c->coeff[i]; - v = pspp_coeff_get_var (coeff, 0); + struct pspp_coeff *coeff = c->coeff[i]; + const struct variable *v = pspp_coeff_get_var (coeff, 0); if (var_is_alpha (v)) { if (!reg_inserted (v, varlist, n_vars)) @@ -770,20 +775,22 @@ reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c) for (i = 0; i < n_vars; i++) { - coeff = c->coeff[i]; + int n_categories = cat_get_n_categories (varlist[i]); + int j; + fprintf (fp, "%s.name = \"%s\";\n\t", var_get_name (varlist[i]), var_get_name (varlist[i])); fprintf (fp, "%s.n_vals = %d;\n\t", var_get_name (varlist[i]), - varlist[i]->obs_vals->n_categories); + n_categories); - for (j = 0; j < varlist[i]->obs_vals->n_categories; j++) + for (j = 0; j < n_categories; j++) { - val = cat_subscript_to_value ((const size_t) j, varlist[i]); + union value *val = cat_subscript_to_value (j, varlist[i]); fprintf (fp, "%s.values[%d] = \"%s\";\n\t", var_get_name (varlist[i]), j, - value_to_string (val, varlist[i])); + var_get_value_name (varlist[i], val)); } } fprintf (fp, "%s", reg_export_categorical_encode_2); @@ -959,36 +966,35 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) static bool is_depvar (size_t k, const struct variable *v) { - /* - compare_var_names returns 0 if the variable - names match. - */ - if (!compare_var_names (v, v_variables[k], NULL)) - return true; - - return false; + return v == v_variables[k]; } /* Mark missing cases. Return the number of non-missing cases. + Compute the first two moments. */ static size_t mark_missing_cases (const struct casefile *cf, struct variable *v, - int *is_missing_case, double n_data) + int *is_missing_case, double n_data, struct moments_var *mom) { struct casereader *r; struct ccase c; size_t row; const union value *val; + double w = 1.0; for (r = casefile_get_reader (cf, NULL); casereader_read (r, &c); case_destroy (&c)) { row = casereader_cnum (r) - 1; - val = case_data (&c, v->fv); + val = case_data (&c, v); + if (mom != NULL) + { + moments1_add (mom->m, val->f, w); + } cat_value_update (v, val); - if (var_is_value_missing (v, val)) + if (var_is_value_missing (v, val, MV_ANY)) { if (!is_missing_case[row]) { @@ -1059,7 +1065,7 @@ get_n_indep (const struct variable *v) static int prepare_data (int n_data, int is_missing_case[], struct variable **indep_vars, - struct variable *depvar, const struct casefile *cf) + struct variable *depvar, const struct casefile *cf, struct moments_var *mom) { int i; int j; @@ -1079,13 +1085,13 @@ prepare_data (int n_data, int is_missing_case[], cat_stored_values_create (v_variables[i]); } n_data = - mark_missing_cases (cf, v_variables[i], is_missing_case, n_data); + mark_missing_cases (cf, v_variables[i], is_missing_case, n_data, mom + i); } } /* Mark missing cases for the dependent variable. */ - n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data); + n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data, NULL); return n_data; } @@ -1098,6 +1104,37 @@ coeff_init (pspp_linreg_cache * c, struct design_matrix *dm) pspp_coeff_init (c->coeff + 1, dm); } +/* + Put the moments in the linreg cache. + */ +static void +compute_moments (pspp_linreg_cache *c, struct moments_var *mom, struct design_matrix *dm, size_t n) +{ + size_t i; + size_t j; + double weight; + double mean; + double variance; + double skewness; + double kurtosis; + /* + Scan the variable names in the columns of the design matrix. + When we find the variable we need, insert its mean in the cache. + */ + for (i = 0; i < dm->m->size2; i++) + { + for (j = 0; j < n; j++) + { + if (design_matrix_col_to_var (dm, i) == (mom + j)->v) + { + moments1_calculate ((mom + j)->m, &weight, &mean, &variance, + &skewness, &kurtosis); + gsl_vector_set (c->indep_means, i, mean); + gsl_vector_set (c->indep_std, i, sqrt (variance)); + } + } + } +} static bool run_regression (const struct ccase *first, const struct casefile *cf, void *cmd_ UNUSED, const struct dataset *ds) @@ -1118,6 +1155,7 @@ run_regression (const struct ccase *first, struct ccase c; struct variable **indep_vars; struct design_matrix *X; + struct moments_var *mom; gsl_vector *Y; pspp_linreg_opts lopts; @@ -1145,7 +1183,12 @@ run_regression (const struct ccase *first, } is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case)); - + mom = xnmalloc (n_variables, sizeof (*mom)); + for (i = 0; i < n_variables; i++) + { + (mom + i)->m = moments1_create (MOMENT_VARIANCE); + (mom + i)->v = v_variables[i]; + } lopts.get_depvar_mean_std = 1; for (k = 0; k < cmd.n_dependent; k++) @@ -1161,7 +1204,7 @@ run_regression (const struct ccase *first, } n_data = prepare_data (n_cases, is_missing_case, indep_vars, cmd.v_dependent[k], - (const struct casefile *) cf); + (const struct casefile *) cf, mom); Y = gsl_vector_alloc (n_data); X = @@ -1199,7 +1242,7 @@ run_regression (const struct ccase *first, current case. */ { - val = case_data (&c, v_variables[i]->fv); + val = case_data (&c, v_variables[i]); /* Independent/dependent variable separation. The 'variables' subcommand specifies a varlist which contains @@ -1222,7 +1265,7 @@ run_regression (const struct ccase *first, } } } - val = case_data (&c, cmd.v_dependent[k]->fv); + val = case_data (&c, cmd.v_dependent[k]); gsl_vector_set (Y, row, val->f); row++; } @@ -1238,6 +1281,7 @@ run_regression (const struct ccase *first, Find the least-squares estimates and other statistics. */ pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]); + compute_moments (models[k], mom, X, n_variables); subcommand_statistics (cmd.a_statistics, models[k]); subcommand_export (cmd.sbc_export, models[k]); @@ -1247,7 +1291,11 @@ run_regression (const struct ccase *first, free (lopts.get_indep_mean_std); casereader_destroy (r); } - + for (i = 0; i < n_variables; i++) + { + moments1_destroy ((mom + i)->m); + } + free (mom); free (is_missing_case); return true;