From 3cd65292e3cc6bd6532214dcc8c8ddc65bdc2972 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sun, 11 Nov 2012 14:19:05 +0100 Subject: [PATCH] Logistic Regression: Added categorical variable support With this commit Logistic Regression now supports the /CATEGORICALS subcommand, allowing a dependent variable to be regressed against one or more categorical predictor variables. --- doc/statistics.texi | 8 +- src/language/stats/logistic.c | 483 +++++++++++++++++---- src/math/categoricals.c | 23 +- tests/language/stats/logistic.at | 722 ++++++++++++++++++++++++++++++- 4 files changed, 1139 insertions(+), 97 deletions(-) diff --git a/doc/statistics.texi b/doc/statistics.texi index 6e8b5c67a4..5723323337 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -732,7 +732,9 @@ The default is @subcmd{LISTWISE}. @cindex bivariate logistic regression @display -LOGISTIC REGRESSION [VARIABLES =] @var{dependent_var} WITH @var{var_list} +LOGISTIC REGRESSION [VARIABLES =] @var{dependent_var} WITH @var{predictors} + + [/CATEGORICAL = @var{categorical_predictors}] [@{/NOCONST | /ORIGIN | /NOORIGIN @}] @@ -763,6 +765,10 @@ Hence, the full model is + \dots + b_n {\bf x_n} } + +Predictor variables which are categorical in nature should be listed on the @subcmd{/CATEGORICAL} subcommand. +Simple variables as well as interactions between variables may be listed here. + If you want a model without the constant term @math{b_0}, use the keyword @subcmd{/ORIGIN}. @subcmd{/NOCONST} is a synonym for @subcmd{/ORIGIN}. diff --git a/src/language/stats/logistic.c b/src/language/stats/logistic.c index 93472e0294..91a16488bd 100644 --- a/src/language/stats/logistic.c +++ b/src/language/stats/logistic.c @@ -64,6 +64,8 @@ #include "libpspp/message.h" #include "libpspp/misc.h" #include "math/categoricals.h" +#include "math/interaction.h" + #include "output/tab.h" #include "gettext.h" @@ -91,8 +93,13 @@ struct lr_spec /* The dependent variable */ const struct variable *dep_var; - size_t n_predictor_vars; + /* The predictor variables (excluding categorical ones) */ const struct variable **predictor_vars; + size_t n_predictor_vars; + + /* The categorical predictors */ + struct interaction **cat_predictors; + size_t n_cat_predictors; /* Which classes of missing vars are to be excluded */ enum mv_class exclude; @@ -100,6 +107,7 @@ struct lr_spec /* The weight variable */ const struct variable *wv; + /* The dictionary of the dataset */ const struct dictionary *dict; /* True iff the constant (intercept) is to be included in the model */ @@ -122,15 +130,17 @@ struct lr_spec double cut_point; }; + /* The results and intermediate result of the procedure. These are mutated as the procedure runs. Used for temporary variables etc. */ struct lr_result { + /* Used to indicate if a pass should flag a warning when + invalid (ie negative or missing) weight values are encountered */ bool warn_bad_weight; - /* The two values of the dependent variable. */ union value y0; union value y1; @@ -139,36 +149,49 @@ struct lr_result /* The sum of caseweights */ double cc; + /* The number of missing and nonmissing cases */ casenumber n_missing; casenumber n_nonmissing; + + + gsl_matrix *hessian; + + /* The categoricals and their payload. Null if the analysis has no + categorical predictors */ + struct categoricals *cats; + struct payload cp; }; /* - Convert INPUT into a dichotomous scalar. For simple cases, this is a 1:1 mapping + Convert INPUT into a dichotomous scalar, according to how the dependent variable's + values are mapped. 
+ For simple cases, this is a 1:1 mapping The return value is always either 0 or 1 */ static double map_dependent_var (const struct lr_spec *cmd, const struct lr_result *res, const union value *input) { - int width = var_get_width (cmd->dep_var); + const int width = var_get_width (cmd->dep_var); if (value_equal (input, &res->y0, width)) return 0; if (value_equal (input, &res->y1, width)) return 1; - + + /* This should never happen. If it does, then y0 and/or y1 have probably not been set */ NOT_REACHED (); return SYSMIS; } +static void output_categories (const struct lr_spec *cmd, const struct lr_result *res); static void output_depvarmap (const struct lr_spec *cmd, const struct lr_result *); static void output_variables (const struct lr_spec *cmd, - const gsl_vector *, + const struct lr_result *, const gsl_vector *); static void output_model_summary (const struct lr_result *, @@ -177,27 +200,57 @@ static void output_model_summary (const struct lr_result *, static void case_processing_summary (const struct lr_result *); +/* Return the value of case C corresponding to the INDEX'th entry in the + model */ +static double +predictor_value (const struct ccase *c, + const struct variable **x, size_t n_x, + const struct categoricals *cats, + size_t index) +{ + /* Values of the scalar predictor variables */ + if (index < n_x) + return case_data (c, x[index])->f; + + /* Coded values of categorical predictor variables (or interactions) */ + if (cats && index - n_x < categoricals_df_total (cats)) + { + double x = categoricals_get_dummy_code_for_case (cats, index - n_x, c); + return x; + } + + /* The constant term */ + return 1.0; +} + + /* Return the probability estimator (that is the estimator of logit(y) ) corresponding to the coefficient estimator beta_hat for case C */ static double pi_hat (const struct lr_spec *cmd, + struct lr_result *res, const gsl_vector *beta_hat, const struct variable **x, size_t n_x, const struct ccase *c) { int v0; double pi = 0; - for (v0 = 0; v0 < n_x; ++v0) + size_t n_coeffs = beta_hat->size; + + if (cmd->constant) + { + pi += gsl_vector_get (beta_hat, beta_hat->size - 1); + n_coeffs--; + } + + for (v0 = 0; v0 < n_coeffs; ++v0) { pi += gsl_vector_get (beta_hat, v0) * - case_data (c, x[v0])->f; + predictor_value (c, x, n_x, res->cats, v0); } - if (cmd->constant) - pi += gsl_vector_get (beta_hat, beta_hat->size - 1); - pi = 1.0 / (1.0 + exp(-pi)); return pi; @@ -213,26 +266,26 @@ pi_hat (const struct lr_spec *cmd, If ALL predicted values derivatives are close to zero or one, then CONVERGED will be set to true. */ -static gsl_matrix * +static void hessian (const struct lr_spec *cmd, struct lr_result *res, struct casereader *input, const struct variable **x, size_t n_x, const gsl_vector *beta_hat, - bool *converged - ) + bool *converged) { struct casereader *reader; struct ccase *c; - gsl_matrix *output = gsl_matrix_calloc (beta_hat->size, beta_hat->size); double max_w = -DBL_MAX; + gsl_matrix_set_zero (res->hessian); + for (reader = casereader_clone (input); (c = casereader_read (reader)) != NULL; case_unref (c)) { int v0, v1; - double pi = pi_hat (cmd, beta_hat, x, n_x, c); + double pi = pi_hat (cmd, res, beta_hat, x, n_x, c); double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight); double w = pi * (1 - pi); @@ -242,25 +295,22 @@ hessian (const struct lr_spec *cmd, for (v0 = 0; v0 < beta_hat->size; ++v0) { - double in0 = v0 < n_x ? 
case_data (c, x[v0])->f : 1.0;
+	  double in0 = predictor_value (c, x, n_x, res->cats, v0);
 
 	  for (v1 = 0; v1 < beta_hat->size; ++v1)
 	    {
-	      double in1 = v1 < n_x ? case_data (c, x[v1])->f : 1.0 ;
-	      double *o = gsl_matrix_ptr (output, v0, v1);
+	      double in1 = predictor_value (c, x, n_x, res->cats, v1);
+	      double *o = gsl_matrix_ptr (res->hessian, v0, v1);
 	      *o += in0 * w * in1;
 	    }
 	}
     }
   casereader_destroy (reader);
-
   if ( max_w < cmd->min_epsilon)
     {
       *converged = true;
       msg (MN, _("All predicted values are either 1 or 0"));
     }
-
-  return output;
 }
 
@@ -289,7 +339,7 @@ xt_times_y_pi (const struct lr_spec *cmd,
        (c = casereader_read (reader)) != NULL; case_unref (c))
     {
       int v0;
-      double pi = pi_hat (cmd, beta_hat, x, n_x, c);
+      double pi = pi_hat (cmd, res, beta_hat, x, n_x, c);
 
       double weight = dict_get_case_weight (cmd->dict, c,
 					    &res->warn_bad_weight);
@@ -299,7 +349,7 @@ xt_times_y_pi (const struct lr_spec *cmd,
 
       for (v0 = 0; v0 < beta_hat->size; ++v0)
 	{
-	  double in0 = v0 < n_x ? case_data (c, x[v0])->f : 1.0;
+	  double in0 = predictor_value (c, x, n_x, res->cats, v0);
 	  double *o = gsl_vector_ptr (output, v0);
 	  *o += in0 * (y - pi) * weight;
 	}
@@ -310,10 +360,42 @@ xt_times_y_pi (const struct lr_spec *cmd,
 
   return output;
 }
 
+
+
+/* "payload" functions for the categoricals.
+   Their only job is to accumulate the frequency of each
+   category.
+ */
+
+static void *
+frq_create (const void *aux1 UNUSED, void *aux2 UNUSED)
+{
+  return xzalloc (sizeof (double));
+}
+
+static void
+frq_update (const void *aux1 UNUSED, void *aux2 UNUSED,
+	    void *ud, const struct ccase *c UNUSED, double weight)
+{
+  double *freq = ud;
+  *freq += weight;
+}
+
+static void
+frq_destroy (const void *aux1 UNUSED, void *aux2 UNUSED, void *user_data UNUSED)
+{
+  free (user_data);
+}
+
+
 /*
-   Makes an initial pass though the data, checks that the dependent variable is
-   dichotomous, and calculates necessary initial values.
+   Makes an initial pass through the data, doing the following:
+
+   * Checks that the dependent variable is dichotomous,
+   * Creates and initialises the categoricals,
+   * Accumulates summary results,
+   * Calculates necessary initial values.
 
   Returns an initial value for \hat\beta the vector of estimators of \beta
 */
@@ -336,7 +418,19 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
   if (cmd->constant)
     n_coefficients++;
 
-  b0 = gsl_vector_calloc (n_coefficients);
+  /* Create categoricals if appropriate */
+  if (cmd->n_cat_predictors > 0)
+    {
+      res->cp.create = frq_create;
+      res->cp.update = frq_update;
+      res->cp.calculate = NULL;
+      res->cp.destroy = frq_destroy;
+
+      res->cats = categoricals_create (cmd->cat_predictors, cmd->n_cat_predictors,
+				       cmd->wv, cmd->exclude, MV_ANY);
+
+      categoricals_set_payload (res->cats, &res->cp, cmd, res);
+    }
 
   res->cc = 0;
   for (reader = casereader_clone (input);
@@ -357,14 +451,15 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
 	    }
 	}
 
+      /* Accumulate the missing and non-missing counts */
       if (missing)
 	{
 	  res->n_missing++;
 	  continue;
 	}
-
       res->n_nonmissing++;
 
+      /* Find the values of the dependent variable */
       if (!v0set)
 	{
 	  value_clone (&res->y0, depval, width);
@@ -398,9 +493,13 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
 
       res->cc += weight;
 
+
+      categoricals_update (res->cats, c);
     }
   casereader_destroy (reader);
 
+  categoricals_done (res->cats);
+
   sum = sumB;
 
   /* Ensure that Y0 is less than Y1. 
Otherwise the mapping gets @@ -415,6 +514,9 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser sum = sumA; } + n_coefficients += categoricals_df_total (res->cats); + b0 = gsl_vector_calloc (n_coefficients); + if ( cmd->constant) { double mean = sum / res->cc; @@ -430,16 +532,18 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser +/* Start of the logistic regression routine proper */ static bool run_lr (const struct lr_spec *cmd, struct casereader *input, const struct dataset *ds UNUSED) { - int i,j; + int i; gsl_vector *beta_hat; - gsl_vector *se ; bool converged = false; + + /* Set the likelihoods to a negative sentinel value */ double likelihood = -1; double prev_likelihood = -1; double initial_likelihood = -1; @@ -448,6 +552,7 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, work.n_missing = 0; work.n_nonmissing = 0; work.warn_bad_weight = true; + work.cats = NULL; /* Get the initial estimates of \beta and their standard errors */ @@ -457,8 +562,6 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, output_depvarmap (cmd, &work); - se = gsl_vector_alloc (beta_hat->size); - case_processing_summary (&work); @@ -470,20 +573,22 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, NULL); + work.hessian = gsl_matrix_calloc (beta_hat->size, beta_hat->size); + /* Start the Newton Raphson iteration process... */ for( i = 0 ; i < cmd->max_iter ; ++i) { double min, max; - gsl_matrix *m ; gsl_vector *v ; - m = hessian (cmd, &work, input, + + hessian (cmd, &work, input, cmd->predictor_vars, cmd->n_predictor_vars, beta_hat, &converged); - gsl_linalg_cholesky_decomp (m); - gsl_linalg_cholesky_invert (m); + gsl_linalg_cholesky_decomp (work.hessian); + gsl_linalg_cholesky_invert (work.hessian); v = xt_times_y_pi (cmd, &work, input, cmd->predictor_vars, cmd->n_predictor_vars, @@ -494,16 +599,9 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, { /* delta = M.v */ gsl_vector *delta = gsl_vector_alloc (v->size); - gsl_blas_dgemv (CblasNoTrans, 1.0, m, v, 0, delta); + gsl_blas_dgemv (CblasNoTrans, 1.0, work.hessian, v, 0, delta); gsl_vector_free (v); - for (j = 0; j < se->size; ++j) - { - double *ptr = gsl_vector_ptr (se, j); - *ptr = gsl_matrix_get (m, j, j); - } - - gsl_matrix_free (m); gsl_vector_add (beta_hat, delta); @@ -537,17 +635,21 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, casereader_destroy (input); assert (initial_likelihood >= 0); - for (i = 0; i < se->size; ++i) - { - double *ptr = gsl_vector_ptr (se, i); - *ptr = sqrt (*ptr); - } + if ( ! converged) + msg (MW, _("Estimation terminated at iteration number %d because maximum iterations has been reached"), i ); + output_model_summary (&work, initial_likelihood, likelihood); - output_variables (cmd, beta_hat, se); + if (work.cats) + output_categories (cmd, &work); + + output_variables (cmd, &work, beta_hat); + + gsl_matrix_free (work.hessian); gsl_vector_free (beta_hat); - gsl_vector_free (se); + + categoricals_destroy (work.cats); return true; } @@ -556,6 +658,12 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, int cmd_logistic (struct lexer *lexer, struct dataset *ds) { + /* Temporary location for the predictor variables. 
+ These may or may not include the categorical predictors */ + const struct variable **pred_vars; + size_t n_pred_vars; + + int v, x; struct lr_spec lr; lr.dict = dataset_dict (ds); lr.n_predictor_vars = 0; @@ -570,6 +678,9 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) lr.constant = true; lr.confidence = 95; lr.print = PRINT_DEFAULT; + lr.cat_predictors = NULL; + lr.n_cat_predictors = 0; + if (lex_match_id (lexer, "VARIABLES")) @@ -581,8 +692,8 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) lex_force_match (lexer, T_WITH); if (!parse_variables_const (lexer, lr.dict, - &lr.predictor_vars, &lr.n_predictor_vars, - PV_NO_DUPLICATE | PV_NUMERIC)) + &pred_vars, &n_pred_vars, + PV_NO_DUPLICATE)) goto error; @@ -627,6 +738,19 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) { /* This is for compatibility. It does nothing */ } + else if (lex_match_id (lexer, "CATEGORICAL")) + { + lex_match (lexer, T_EQUALS); + do + { + lr.cat_predictors = xrealloc (lr.cat_predictors, + sizeof (*lr.cat_predictors) * ++lr.n_cat_predictors); + lr.cat_predictors[lr.n_cat_predictors - 1] = 0; + } + while (parse_design_interaction (lexer, lr.dict, + lr.cat_predictors + lr.n_cat_predictors - 1)); + lr.n_cat_predictors--; + } else if (lex_match_id (lexer, "PRINT")) { lex_match (lexer, T_EQUALS); @@ -775,8 +899,41 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) } } + /* Copy the predictor variables from the temporary location into the + final one, dropping any categorical variables which appear there. + FIXME: This is O(NxM). + */ + for (v = x = 0; v < n_pred_vars; ++v) + { + bool drop = false; + const struct variable *var = pred_vars[v]; + int cv = 0; + for (cv = 0; cv < lr.n_cat_predictors ; ++cv) + { + int iv; + const struct interaction *iact = lr.cat_predictors[cv]; + for (iv = 0 ; iv < iact->n_vars ; ++iv) + { + if (var == iact->vars[iv]) + { + drop = true; + goto dropped; + } + } + } + + dropped: + if (drop) + continue; + + lr.predictor_vars = xrealloc (lr.predictor_vars, sizeof *lr.predictor_vars * (x + 1)); + lr.predictor_vars[x++] = var; + lr.n_predictor_vars++; + } + free (pred_vars); + /* Run logistical regression for each split group */ { struct casegrouper *grouper; struct casereader *group; @@ -790,11 +947,13 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) } free (lr.predictor_vars); + free (lr.cat_predictors); return CMD_SUCCESS; error: free (lr.predictor_vars); + free (lr.cat_predictors); return CMD_FAILURE; } @@ -851,19 +1010,20 @@ output_depvarmap (const struct lr_spec *cmd, const struct lr_result *res) /* Show the Variables in the Equation box */ static void output_variables (const struct lr_spec *cmd, - const gsl_vector *beta, - const gsl_vector *se) + const struct lr_result *res, + const gsl_vector *beta) { int row = 0; const int heading_columns = 1; int heading_rows = 1; struct tab_table *t; - int idx; - int n_rows = cmd->n_predictor_vars; - int nc = 8; int nr ; + int i = 0; + int ivar = 0; + int idx_correction = 0; + if (cmd->print & PRINT_CI) { nc += 2; @@ -874,6 +1034,9 @@ output_variables (const struct lr_spec *cmd, if (cmd->constant) nr++; + if (res->cats) + nr += categoricals_df_total (res->cats) + cmd->n_cat_predictors; + t = tab_create (nc, nr); tab_title (t, _("Variables in the Equation")); @@ -902,45 +1065,103 @@ output_variables (const struct lr_spec *cmd, tab_text (t, 9, row, TAB_CENTER | TAT_TITLE, _("Upper")); } - if (cmd->constant) - n_rows++; - - for (idx = 0 ; idx < n_rows; ++idx) + for (row = heading_rows ; row < nr; ++row) { - const 
int r = idx + heading_rows;
+      const int idx = row - heading_rows - idx_correction;
       const double b = gsl_vector_get (beta, idx);
-      const double sigma = gsl_vector_get (se, idx);
-      const double wald = pow2 (b / sigma);
+      const double sigma2 = gsl_matrix_get (res->hessian, idx, idx);
+      const double wald = pow2 (b) / sigma2;
       const double df = 1;
 
       if (idx < cmd->n_predictor_vars)
-	tab_text (t, 1, r, TAB_LEFT | TAT_TITLE,
-		  var_to_string (cmd->predictor_vars[idx]));
+	{
+	  tab_text (t, 1, row, TAB_LEFT | TAT_TITLE,
+		    var_to_string (cmd->predictor_vars[idx]));
+	}
+      else if (i < cmd->n_cat_predictors)
+	{
+	  double wald;
+	  bool summary = false;
+	  struct string str;
+	  const struct interaction *cat_predictors = cmd->cat_predictors[i];
+	  const int df = categoricals_df (res->cats, i);
+
+	  ds_init_empty (&str);
+	  interaction_to_string (cat_predictors, &str);
+
+	  if (ivar == 0)
+	    {
+	      /* Calculate the Wald statistic,
+		 which is \beta' C^-1 \beta,
+		 where \beta is the vector of the coefficient estimates comprising this
+		 categorical variable, and C is the corresponding submatrix of the
+		 hessian matrix.
+	      */
+	      gsl_matrix_const_view mv =
+		gsl_matrix_const_submatrix (res->hessian, idx, idx, df, df);
+	      gsl_matrix *subhessian = gsl_matrix_alloc (mv.matrix.size1, mv.matrix.size2);
+	      gsl_vector_const_view vv = gsl_vector_const_subvector (beta, idx, df);
+	      gsl_vector *temp = gsl_vector_alloc (df);
+
+	      gsl_matrix_memcpy (subhessian, &mv.matrix);
+	      gsl_linalg_cholesky_decomp (subhessian);
+	      gsl_linalg_cholesky_invert (subhessian);
+
+	      gsl_blas_dgemv (CblasTrans, 1.0, subhessian, &vv.vector, 0, temp);
+	      gsl_blas_ddot (temp, &vv.vector, &wald);
+
+	      tab_double (t, 4, row, 0, wald, 0);
+	      tab_double (t, 5, row, 0, df, &F_8_0);
+	      tab_double (t, 6, row, 0, gsl_cdf_chisq_Q (wald, df), 0);
+
+	      idx_correction++;
+	      summary = true;
+	      gsl_matrix_free (subhessian);
+	      gsl_vector_free (temp);
+	    }
+	  else
+	    {
+	      ds_put_format (&str, "(%d)", ivar);
+	    }
+
+	  tab_text (t, 1, row, TAB_LEFT | TAT_TITLE, ds_cstr (&str));
+	  if (ivar++ == df)
+	    {
+	      ++i; /* next interaction */
+	      ivar = 0;
+	    }
+
+	  ds_destroy (&str);
+
+	  if (summary)
+	    continue;
+	}
+      else
+	{
+	  tab_text (t, 1, row, TAB_LEFT | TAT_TITLE, _("Constant"));
+	}
 
-      tab_double (t, 2, r, 0, b, 0);
-      tab_double (t, 3, r, 0, sigma, 0);
-      tab_double (t, 4, r, 0, wald, 0);
-      tab_double (t, 5, r, 0, df, &F_8_0);
-      tab_double (t, 6, r, 0, gsl_cdf_chisq_Q (wald, df), 0);
-      tab_double (t, 7, r, 0, exp (b), 0);
+      tab_double (t, 2, row, 0, b, 0);
+      tab_double (t, 3, row, 0, sqrt (sigma2), 0);
+      tab_double (t, 4, row, 0, wald, 0);
+      tab_double (t, 5, row, 0, df, &F_8_0);
+      tab_double (t, 6, row, 0, gsl_cdf_chisq_Q (wald, df), 0);
+      tab_double (t, 7, row, 0, exp (b), 0);
 
       if (cmd->print & PRINT_CI)
 	{
 	  double wc = gsl_cdf_ugaussian_Pinv (0.5 + cmd->confidence / 200.0);
-	  wc *= sigma;
+	  wc *= sqrt (sigma2);
 
 	  if (idx < cmd->n_predictor_vars)
 	    {
-	      tab_double (t, 8, r, 0, exp (b - wc), 0);
-	      tab_double (t, 9, r, 0, exp (b + wc), 0);
+	      tab_double (t, 8, row, 0, exp (b - wc), 0);
+	      tab_double (t, 9, row, 0, exp (b + wc), 0);
 	    }
 	}
     }
 
-  if ( cmd->constant)
-    tab_text (t, 1, nr - 1, TAB_LEFT | TAT_TITLE, _("Constant"));
-
   tab_submit (t);
 }
 
@@ -1028,3 +1249,107 @@ case_processing_summary (const struct lr_result *res)
 
   tab_submit (t);
 }
+
+static void
+output_categories (const struct lr_spec *cmd, const struct lr_result *res)
+{
+  const struct fmt_spec *wfmt =
+    cmd->wv ? 
var_get_print_format (cmd->wv) : &F_8_0; + + int cumulative_df; + int i = 0; + const int heading_columns = 2; + const int heading_rows = 2; + struct tab_table *t; + + int nc ; + int nr ; + + int v; + int r = 0; + + int max_df = 0; + int total_cats = 0; + for (i = 0; i < cmd->n_cat_predictors; ++i) + { + size_t n = categoricals_n_count (res->cats, i); + size_t df = categoricals_df (res->cats, i); + if (max_df < df) + max_df = df; + total_cats += n; + } + + nc = heading_columns + 1 + max_df; + nr = heading_rows + total_cats; + + t = tab_create (nc, nr); + tab_title (t, _("Categorical Variables' Codings")); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1); + + tab_hline (t, TAL_2, 0, nc - 1, heading_rows); + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + + + tab_text (t, heading_columns, 1, TAB_CENTER | TAT_TITLE, _("Frequency")); + + tab_joint_text_format (t, heading_columns + 1, 0, nc - 1, 0, + TAB_CENTER | TAT_TITLE, _("Parameter coding")); + + + for (i = 0; i < max_df; ++i) + { + int c = heading_columns + 1 + i; + tab_text_format (t, c, 1, TAB_CENTER | TAT_TITLE, _("(%d)"), i + 1); + } + + cumulative_df = 0; + for (v = 0; v < cmd->n_cat_predictors; ++v) + { + int cat; + const struct interaction *cat_predictors = cmd->cat_predictors[v]; + int df = categoricals_df (res->cats, v); + struct string str; + ds_init_empty (&str); + + interaction_to_string (cat_predictors, &str); + + tab_text (t, 0, heading_rows + r, TAB_LEFT | TAT_TITLE, ds_cstr (&str) ); + + ds_destroy (&str); + + for (cat = 0; cat < categoricals_n_count (res->cats, v) ; ++cat) + { + struct string str; + const struct ccase *c = categoricals_get_case_by_category_real (res->cats, v, cat); + const double *freq = categoricals_get_user_data_by_category_real (res->cats, v, cat); + + int x; + ds_init_empty (&str); + + for (x = 0; x < cat_predictors->n_vars; ++x) + { + const union value *val = case_data (c, cat_predictors->vars[x]); + var_append_value_name (cat_predictors->vars[x], val, &str); + + if (x < cat_predictors->n_vars - 1) + ds_put_cstr (&str, " "); + } + + tab_text (t, 1, heading_rows + r, 0, ds_cstr (&str)); + ds_destroy (&str); + tab_double (t, 2, heading_rows + r, 0, *freq, wfmt); + + for (x = 0; x < df; ++x) + { + tab_double (t, heading_columns + 1 + x, heading_rows + r, 0, (cat == x), &F_8_0); + } + ++r; + } + cumulative_df += df; + } + + tab_submit (t); + +} diff --git a/src/math/categoricals.c b/src/math/categoricals.c index 71ffe20ad7..d0247f6d38 100644 --- a/src/math/categoricals.c +++ b/src/math/categoricals.c @@ -76,15 +76,6 @@ struct variable_node struct hmap valmap; /* A map of value nodes */ int n_vals; /* Number of values for this variable */ - - int *indirection; /* An array (of size n_vals) of integers, which serve to - permute the index members of the values in valmap. - - Doing this, means that categories are considered in the order - of their values. Mathematically the order is irrelevant. 
- However certain procedures (eg logistic regression) want to report - statisitics for particular categories */ - }; @@ -412,7 +403,8 @@ categoricals_update (struct categoricals *cat, const struct ccase *c) if (valn == NULL) { valn = pool_malloc (cat->pool, sizeof *valn); - valn->index = vn->n_vals++; + valn->index = -1; + vn->n_vals++; value_init (&valn->val, width); value_copy (&valn->val, val, width); hmap_insert (&vn->valmap, &valn->node, hash); @@ -553,15 +545,14 @@ categoricals_done (const struct categoricals *cat_) return; } - vn->indirection = pool_calloc (cat->pool, vn->n_vals, sizeof *vn->indirection); - /* Sort the VALMAP here */ array = xcalloc (sizeof *array, vn->n_vals); + x = 0; HMAP_FOR_EACH (valnd, struct value_node, node, &vn->valmap) { /* Note: This loop is probably superfluous, it could be done in the update stage (at the expense of a realloc) */ - array[valnd->index] = valnd; + array[x++] = valnd; } sort (array, vn->n_vals, sizeof (*array), @@ -570,7 +561,7 @@ categoricals_done (const struct categoricals *cat_) for (x = 0; x < vn->n_vals; ++x) { struct value_node *vvv = array[x]; - vn->indirection[vn->n_vals - x - 1] = vvv->index; + vvv->index = x; } free (array); @@ -753,9 +744,9 @@ categoricals_get_code_for_case (const struct categoricals *cat, int subscript, const int index = ((subscript - base_index) % iap->df_prod[v] ) / dfp; dfp = iap->df_prod [v]; - if (effects_coding && vn->indirection [valn->index] == df ) + if (effects_coding && valn->index == df ) bin = -1.0; - else if ( vn->indirection [valn->index] != index ) + else if ( valn->index != index ) bin = 0; result *= bin; diff --git a/tests/language/stats/logistic.at b/tests/language/stats/logistic.at index 8903c2069d..7db121338b 100644 --- a/tests/language/stats/logistic.at +++ b/tests/language/stats/logistic.at @@ -1,3 +1,4 @@ + AT_BANNER([LOGISTIC REGRESSION]) dnl These examples are adapted from @@ -288,4 +289,723 @@ AT_CHECK([pspp -O format=csv non-dich.sps], [1], error: Dependent variable's values are not dichotomous. ]) -AT_CLEANUP \ No newline at end of file +AT_CLEANUP + + + +dnl An example to check the behaviour of LOGISTIC REGRESSION with a categorical +dnl variable. 
This example was inspired by the one at:
+dnl http://www.ats.ucla.edu/stat/spss/dae/logit.htm
+AT_SETUP([LOGISTIC REGRESSION with categorical])
+
+AT_DATA([lr-cat.data], [dnl
+ 620 3.07 2 4
+ 800 4.00 3 9
+ 580 3.40 2 4
+ 600 3.13 2 4
+ 540 2.70 2 4
+ 660 3.31 4 4
+ 480 3.58 1 9
+ 620 4.00 1 9
+ 680 3.98 2 9
+ 580 3.40 4 4
+ 760 3.35 3 4
+ 700 3.72 2 4
+ 460 3.64 1 9
+ 540 3.28 3 4
+ 680 3.48 3 4
+ 740 3.31 1 4
+ 460 3.77 3 4
+ 740 3.54 1 4
+ 600 3.63 3 4
+ 620 3.05 2 4
+ 560 3.04 3 4
+ 520 2.70 3 4
+ 640 3.35 3 4
+ 620 3.58 2 4
+ 660 3.70 4 9
+ 500 2.86 4 4
+ 640 3.50 2 4
+ 720 4.00 3 4
+ 720 3.94 3 4
+ 400 3.65 2 4
+ 800 2.90 2 4
+ 520 2.90 3 4
+ 440 3.24 4 4
+ 580 3.51 2 4
+ 500 3.31 3 4
+ 440 3.22 1 4
+ 540 3.17 1 9
+ 420 3.02 1 4
+ 780 3.22 2 9
+ 440 3.13 4 4
+ 800 3.66 1 9
+ 580 3.32 2 9
+ 480 2.67 2 9
+ 700 4.00 1 9
+ 740 2.97 2 9
+ 700 3.83 2 4
+ 640 3.93 2 4
+ 800 3.90 2 4
+ 400 3.38 2 4
+ 700 3.52 2 4
+ 680 3.00 4 9
+ 540 3.20 1 4
+ 580 4.00 2 4
+ 780 4.00 2 9
+ 220 2.83 3 4
+ 580 3.20 2 9
+ 580 3.50 2 4
+ 620 3.30 1 4
+ 520 3.65 4 9
+ 600 3.38 3 9
+ 660 3.77 3 4
+ 580 2.86 4 9
+ 580 3.46 2 9
+ 560 3.36 3 4
+ 740 4.00 3 9
+ 480 3.44 3 4
+ 640 3.19 4 9
+ 600 3.54 1 9
+ 540 3.38 4 4
+ 500 2.81 3 4
+ 360 2.56 3 4
+ 460 3.15 4 4
+ 460 2.63 2 4
+ 440 2.76 2 4
+ 740 3.62 4 4
+ 380 3.38 2 4
+ 640 3.63 1 9
+ 800 3.73 1 4
+ 660 3.67 2 4
+ 760 3.00 2 9
+ 420 2.96 1 4
+ 740 3.74 4 4
+ 800 3.75 2 4
+ 620 3.40 2 4
+ 660 3.67 3 9
+ 400 3.35 3 4
+ 680 3.14 2 4
+ 660 3.47 3 9
+ 660 3.63 2 9
+ 420 3.41 4 4
+ 660 4.00 1 4
+ 680 3.70 2 4
+ 620 3.23 3 9
+ 520 3.35 3 4
+ 500 4.00 3 4
+ 400 3.36 2 4
+ 700 3.56 1 9
+ 540 3.81 1 9
+ 520 2.68 3 9
+ 540 3.50 2 4
+ 700 4.00 2 4
+ 600 3.64 3 9
+ 800 3.31 3 4
+ 520 3.29 1 4
+ 580 3.69 1 4
+ 380 3.43 3 4
+ 560 3.19 3 4
+ 760 2.81 1 9
+ 540 3.13 2 4
+ 660 3.14 2 9
+ 520 3.81 1 9
+ 680 3.19 4 4
+ 540 3.78 4 4
+ 500 3.57 3 4
+ 660 3.49 2 4
+ 340 3.00 2 9
+ 400 3.15 2 9
+ 420 3.92 4 4
+ 760 3.35 2 9
+ 700 2.94 2 4
+ 540 3.04 1 4
+ 780 3.87 4 4
+ 560 3.78 2 4
+ 700 3.82 3 4
+ 400 2.93 3 4
+ 440 3.45 2 9
+ 800 3.47 3 4
+ 340 3.15 3 4
+ 520 4.00 1 9
+ 520 3.15 3 4
+ 600 2.98 2 9
+ 420 2.69 2 4
+ 460 3.44 2 4
+ 620 3.71 1 9
+ 480 3.13 2 4
+ 580 3.40 3 4
+ 540 3.39 3 9
+ 540 3.94 3 4
+ 440 2.98 3 4
+ 380 3.59 4 4
+ 500 2.97 4 4
+ 340 2.92 3 4
+ 440 3.15 2 4
+ 600 3.48 2 4
+ 420 2.67 3 4
+ 460 3.07 2 4
+ 460 3.45 3 9
+ 480 3.39 4 4
+ 480 2.78 3 4
+ 720 3.42 2 9
+ 680 3.67 2 9
+ 800 3.89 2 4
+ 360 3.00 3 4
+ 620 3.17 2 9
+ 700 3.52 4 9
+ 540 3.19 2 4
+ 580 3.30 2 4
+ 800 4.00 3 9
+ 660 3.33 2 4
+ 380 3.34 3 4
+ 720 3.84 3 4
+ 600 3.59 2 4
+ 500 3.03 3 4
+ 640 3.81 2 4
+ 540 3.49 1 9
+ 680 3.85 3 9
+ 540 3.84 2 9
+ 460 2.93 3 4
+ 380 2.94 3 4
+ 620 3.22 2 4
+ 740 3.37 4 4
+ 620 4.00 2 4
+ 800 3.74 1 9
+ 400 3.31 3 4
+ 540 3.46 4 4
+ 620 3.18 2 9
+ 480 2.91 1 9
+ 300 2.84 2 9
+ 440 2.48 4 4
+ 640 2.79 2 4
+ 400 3.23 4 9
+ 680 3.46 2 9
+ 620 3.37 1 9
+ 700 3.92 2 4
+ 620 3.37 2 9
+ 620 3.63 2 4
+ 620 3.95 3 9
+ 560 2.52 2 4
+ 520 2.62 2 4
+ 600 3.35 2 4
+ 700 4.00 1 4
+ 640 3.67 3 4
+ 640 4.00 3 4
+ 520 2.93 4 4
+ 620 3.21 4 4
+ 680 3.99 3 4
+ 660 3.34 3 4
+ 700 3.45 3 4
+ 560 3.36 1 9
+ 800 2.78 2 4
+ 500 3.88 4 4
+ 700 3.65 2 4
+ 680 3.76 3 9
+ 660 3.07 3 4
+ 580 3.46 4 4
+ 460 2.87 2 4
+ 600 3.31 4 4
+ 620 3.94 4 4
+ 400 3.05 2 4
+ 800 3.43 2 9
+ 600 3.58 1 9
+ 580 3.36 2 4
+ 540 3.16 3 4
+ 500 2.71 2 4
+ 600 3.28 3 4
+ 600 2.82 4 4
+ 460 3.58 2 4
+ 520 2.85 3 4
+ 740 3.52 4 9
+ 500 3.95 4 4
+ 560 3.61 3 4
+ 620 3.45 2 9
+ 640 3.51 2 4
+ 660 3.44 2 9
+ 660 2.91 3 9
+ 540 
3.28 1 4 + 560 2.98 1 9 + 800 3.97 1 4 + 720 3.77 3 4 + 720 3.64 1 9 + 480 3.71 4 9 + 680 3.34 2 4 + 680 3.11 2 4 + 540 2.81 3 4 + 620 3.75 2 9 + 540 3.12 1 4 + 560 3.48 2 9 + 720 3.40 3 4 + 680 3.90 1 4 + 640 3.76 3 4 + 560 3.16 1 4 + 520 3.30 2 9 + 640 3.12 3 4 + 580 3.57 3 4 + 540 3.55 4 9 + 780 3.63 4 9 + 600 3.89 1 9 + 800 4.00 1 9 + 580 3.29 4 4 + 360 3.27 3 4 + 800 4.00 2 9 + 640 3.52 4 4 + 720 3.45 4 4 + 580 3.06 2 4 + 580 3.02 2 4 + 500 3.60 3 9 + 580 3.12 3 9 + 600 2.82 4 4 + 620 3.99 3 4 + 700 4.00 3 4 + 480 4.00 2 4 + 560 2.95 2 4 + 560 4.00 3 4 + 560 2.65 3 9 + 400 3.08 2 4 + 480 2.62 2 9 + 640 3.86 3 4 + 480 3.57 2 4 + 540 3.51 2 4 + 380 3.33 4 4 + 680 3.64 3 4 + 400 3.51 3 4 + 340 2.90 1 4 + 700 3.08 2 4 + 480 3.02 1 9 + 600 3.15 2 9 + 780 3.80 3 9 + 520 3.74 2 9 + 520 3.51 2 4 + 640 3.73 3 4 + 560 3.32 4 4 + 620 2.85 2 4 + 700 3.28 1 4 + 760 4.00 1 9 + 800 3.60 2 4 + 580 3.34 2 4 + 540 3.77 2 9 + 640 3.17 2 4 + 540 3.02 4 4 + 680 3.08 4 4 + 680 3.31 2 4 + 680 2.96 3 9 + 700 2.88 2 4 + 580 3.77 4 4 + 540 3.49 2 9 + 700 3.56 2 9 + 600 3.56 2 9 + 560 3.59 2 4 + 640 2.94 2 9 + 560 3.33 4 4 + 620 3.69 3 4 + 680 3.27 2 9 + 460 3.14 3 4 + 500 3.53 4 4 + 620 3.33 3 4 + 600 3.62 3 4 + 500 3.01 4 4 + 740 3.34 4 4 + 560 3.69 3 9 + 620 3.95 3 9 + 740 3.86 2 9 + 800 3.53 1 9 + 620 3.78 3 4 + 700 3.27 2 4 + 540 3.78 2 9 + 700 3.65 2 4 + 800 3.22 1 9 + 560 3.59 2 9 + 800 3.15 4 4 + 520 3.90 3 9 + 520 3.74 4 9 + 480 2.55 1 4 + 800 4.00 4 4 + 620 3.09 4 4 + 560 3.49 4 4 + 500 3.17 3 4 + 480 3.40 2 4 + 460 2.98 1 4 + 580 3.58 1 9 + 640 3.30 2 4 + 480 3.45 2 4 + 440 3.17 2 4 + 660 3.32 1 4 + 500 3.08 3 4 + 660 3.94 2 4 + 720 3.31 1 4 + 460 3.64 3 9 + 500 2.93 4 4 + 800 3.54 3 4 + 580 2.93 2 4 + 620 3.61 1 9 + 500 2.98 3 4 + 660 4.00 2 9 + 560 3.24 4 4 + 560 2.42 2 4 + 580 3.80 2 4 + 500 3.23 4 4 + 680 2.42 1 9 + 580 3.46 3 4 + 800 3.91 3 4 + 700 2.90 4 4 + 520 3.12 2 4 + 300 2.92 4 4 + 560 3.43 3 4 + 620 3.63 3 4 + 500 2.79 4 4 + 360 3.14 1 4 + 640 3.94 2 9 + 460 3.99 3 9 + 300 3.01 3 4 + 520 2.73 2 4 + 600 3.47 2 9 + 580 3.25 1 4 + 520 3.10 4 4 + 620 3.43 3 4 + 380 2.91 4 4 + 660 3.59 3 4 + 660 3.95 2 9 + 540 3.33 3 4 + 740 4.00 3 4 + 640 3.38 3 4 + 600 3.89 3 4 + 720 3.88 3 4 + 580 4.00 3 4 + 420 2.26 4 4 + 520 4.00 2 9 + 800 3.70 1 9 + 700 4.00 1 9 + 480 3.43 2 4 + 660 3.45 4 4 + 520 3.25 3 4 + 560 2.71 3 4 + 600 3.32 2 4 + 580 2.88 2 4 + 660 3.88 2 9 + 600 3.22 1 4 + 580 4.00 1 4 + 660 3.60 3 9 + 500 3.35 2 4 + 520 2.98 2 4 + 660 3.49 2 9 + 560 3.07 2 4 + 500 3.13 2 9 + 720 3.50 3 9 + 440 3.39 2 9 + 640 3.95 2 9 + 380 3.61 3 4 + 800 3.05 2 9 + 520 3.19 3 9 + 600 3.40 3 4 +]) + +AT_DATA([lr-cat.sps], [dnl +set format=F20.3. + +data list notable list file='lr-cat.data' /b1 b2 bcat y. + +logistic regression + y with b1 b2 bcat + /categorical = bcat + . 
+]) + +AT_CHECK([pspp -O format=csv lr-cat.sps], [0], + [dnl +Table: Dependent Variable Encoding +Original Value,Internal Value +4.000,0 +9.000,1 + +Table: Case Processing Summary +Unweighted Cases,N,Percent +Included in Analysis,400,100.000 +Missing Cases,0,.000 +Total,400,100.000 + +note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001 + +Table: Model Summary +Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square +,458.517,.098,.138 + +Table: Categorical Variables' Codings +,,,Parameter coding,, +,,Frequency,(1),(2),(3) +bcat,1.000,61,1,0,0 +,2.000,151,0,1,0 +,3.000,121,0,0,1 +,4.000,67,0,0,0 + +Table: Variables in the Equation +,,B,S.E.,Wald,df,Sig.,Exp(B) +Step 1,b1,.002,.001,4.284,1,.038,1.002 +,b2,.804,.332,5.872,1,.015,2.235 +,bcat,,,20.895,3,.000, +,bcat(1),1.551,.418,13.788,1,.000,4.718 +,bcat(2),.876,.367,5.706,1,.017,2.401 +,bcat(3),.211,.393,.289,1,.591,1.235 +,Constant,-5.541,1.138,23.709,1,.000,.004 +]) + +AT_CLEANUP + + + +dnl This example is inspired by http://www.ats.ucla.edu/stat/spss/output/logistic.htm +AT_SETUP([LOGISTIC REGRESSION with cat var 2]) + +AT_DATA([lr-cat2.data], [dnl + 60.00 1.00 8.00 50.00 + 47.00 .00 9.00 42.00 + 57.00 1.00 7.00 53.00 + 60.00 .00 8.00 53.00 + 68.00 .00 8.00 66.00 + 63.00 .00 8.00 55.00 + 65.00 .00 8.00 63.00 + 52.00 .00 8.00 61.00 + 34.00 .00 9.00 42.00 + 37.00 .00 8.00 39.00 + 68.00 1.00 9.00 69.00 + 60.00 .00 9.00 61.00 + 44.00 .00 9.00 58.00 + 42.00 .00 8.00 47.00 + 57.00 1.00 7.00 61.00 + 55.00 1.00 8.00 50.00 + 55.00 .00 9.00 58.00 + 44.00 .00 8.00 63.00 + 50.00 1.00 9.00 66.00 + 44.00 .00 8.00 39.00 + 55.00 .00 8.00 58.00 + 44.00 .00 8.00 50.00 + 47.00 1.00 7.00 34.00 + 48.00 .00 8.00 44.00 + 45.00 .00 7.00 31.00 + 43.00 .00 8.00 50.00 + 39.00 .00 8.00 42.00 + 63.00 .00 9.00 50.00 + 47.00 .00 8.00 58.00 + 42.00 .00 7.00 50.00 + 50.00 .00 9.00 36.00 + 47.00 .00 7.00 33.00 + 60.00 .00 9.00 61.00 + 47.00 .00 7.00 42.00 + 68.00 1.00 9.00 69.00 + 52.00 .00 8.00 54.00 + 63.00 1.00 9.00 61.00 + 65.00 1.00 9.00 61.00 + 63.00 1.00 9.00 53.00 + 57.00 .00 8.00 51.00 + 34.00 .00 8.00 36.00 + 50.00 .00 8.00 39.00 + 52.00 1.00 7.00 56.00 + 45.00 .00 7.00 34.00 + 47.00 1.00 7.00 53.00 + 34.00 .00 7.00 39.00 + 50.00 1.00 8.00 55.00 + 60.00 .00 9.00 58.00 + 63.00 .00 8.00 58.00 + 35.00 .00 7.00 51.00 + 50.00 .00 8.00 58.00 + 68.00 .00 8.00 63.00 + 41.00 .00 9.00 34.00 + 47.00 .00 8.00 47.00 + 76.00 .00 9.00 64.00 + 44.00 .00 8.00 44.00 + 36.00 .00 9.00 50.00 + 68.00 1.00 9.00 55.00 + 47.00 1.00 8.00 50.00 + 50.00 .00 7.00 53.00 + 68.00 .00 8.00 74.00 + 39.00 .00 7.00 44.00 + 50.00 .00 8.00 55.00 + 52.00 .00 9.00 61.00 + 47.00 .00 8.00 53.00 + 39.00 .00 7.00 47.00 + 55.00 1.00 9.00 49.00 + 68.00 1.00 8.00 50.00 + 52.00 1.00 9.00 63.00 + 55.00 .00 8.00 58.00 + 57.00 .00 8.00 55.00 + 66.00 1.00 9.00 61.00 + 65.00 1.00 7.00 58.00 + 42.00 .00 7.00 42.00 + 68.00 1.00 7.00 59.00 + 60.00 1.00 9.00 61.00 + 52.00 .00 8.00 55.00 + 57.00 1.00 7.00 54.00 + 42.00 .00 9.00 50.00 + 42.00 .00 8.00 47.00 + 57.00 .00 8.00 50.00 + 47.00 .00 7.00 45.00 + 44.00 .00 7.00 40.00 + 43.00 .00 9.00 55.00 + 31.00 .00 8.00 39.00 + 37.00 .00 7.00 33.00 + 63.00 1.00 7.00 63.00 + 47.00 .00 8.00 39.00 + 57.00 1.00 8.00 63.00 + 52.00 .00 8.00 44.00 + 44.00 .00 7.00 35.00 + 52.00 .00 7.00 55.00 + 55.00 .00 7.00 69.00 + 52.00 .00 8.00 53.00 + 55.00 .00 9.00 61.00 + 65.00 1.00 9.00 63.00 + 55.00 .00 8.00 44.00 + 63.00 .00 7.00 65.00 + 44.00 .00 7.00 39.00 + 47.00 .00 7.00 36.00 + 63.00 1.00 9.00 55.00 + 68.00 .00 8.00 66.00 
+ 34.00 .00 8.00 39.00 + 47.00 .00 9.00 50.00 + 50.00 .00 9.00 58.00 + 63.00 .00 8.00 66.00 + 44.00 .00 7.00 34.00 + 44.00 .00 8.00 50.00 + 50.00 .00 8.00 53.00 + 47.00 1.00 9.00 69.00 + 65.00 .00 9.00 58.00 + 57.00 .00 8.00 47.00 + 39.00 .00 8.00 39.00 + 47.00 .00 8.00 53.00 + 50.00 1.00 7.00 63.00 + 50.00 .00 8.00 50.00 + 63.00 .00 9.00 53.00 + 73.00 1.00 9.00 61.00 + 44.00 .00 7.00 47.00 + 47.00 .00 8.00 42.00 + 47.00 .00 8.00 58.00 + 36.00 .00 7.00 61.00 + 57.00 1.00 8.00 55.00 + 53.00 1.00 8.00 57.00 + 63.00 .00 7.00 66.00 + 50.00 .00 8.00 34.00 + 47.00 .00 9.00 48.00 + 57.00 1.00 8.00 58.00 + 39.00 .00 8.00 53.00 + 42.00 .00 8.00 42.00 + 42.00 .00 9.00 31.00 + 42.00 .00 8.00 72.00 + 46.00 .00 8.00 44.00 + 55.00 .00 8.00 42.00 + 42.00 .00 8.00 47.00 + 50.00 .00 8.00 44.00 + 44.00 .00 9.00 39.00 + 73.00 1.00 8.00 69.00 + 71.00 1.00 9.00 58.00 + 50.00 .00 9.00 49.00 + 63.00 1.00 7.00 54.00 + 42.00 .00 8.00 36.00 + 47.00 .00 7.00 42.00 + 39.00 .00 9.00 26.00 + 63.00 .00 8.00 58.00 + 50.00 .00 8.00 55.00 + 65.00 1.00 8.00 55.00 + 76.00 1.00 9.00 67.00 + 71.00 1.00 8.00 66.00 + 39.00 .00 9.00 47.00 + 47.00 1.00 9.00 63.00 + 60.00 .00 7.00 50.00 + 63.00 .00 9.00 55.00 + 54.00 1.00 9.00 55.00 + 55.00 1.00 8.00 58.00 + 57.00 .00 8.00 61.00 + 55.00 1.00 9.00 63.00 + 42.00 .00 7.00 50.00 + 50.00 .00 8.00 44.00 + 55.00 .00 8.00 42.00 + 42.00 .00 7.00 50.00 + 34.00 .00 8.00 39.00 + 65.00 .00 9.00 46.00 + 52.00 .00 7.00 58.00 + 44.00 .00 8.00 39.00 + 65.00 1.00 9.00 66.00 + 47.00 .00 8.00 42.00 + 41.00 .00 7.00 39.00 + 68.00 .00 9.00 63.00 + 63.00 1.00 8.00 72.00 + 52.00 .00 8.00 53.00 + 57.00 .00 8.00 50.00 + 68.00 .00 8.00 55.00 + 42.00 .00 8.00 56.00 + 47.00 .00 8.00 48.00 + 73.00 1.00 9.00 58.00 + 39.00 .00 8.00 50.00 + 63.00 1.00 9.00 69.00 + 60.00 .00 8.00 55.00 + 65.00 1.00 9.00 66.00 + 73.00 1.00 8.00 63.00 + 52.00 .00 8.00 55.00 + 36.00 .00 8.00 42.00 + 28.00 .00 7.00 44.00 + 47.00 .00 8.00 44.00 + 57.00 .00 7.00 47.00 + 34.00 .00 7.00 29.00 + 47.00 .00 9.00 66.00 + 57.00 .00 8.00 58.00 + 60.00 1.00 9.00 50.00 + 50.00 .00 9.00 47.00 + 73.00 1.00 9.00 55.00 + 52.00 1.00 8.00 47.00 + 55.00 .00 8.00 53.00 + 47.00 .00 8.00 53.00 + 50.00 .00 8.00 61.00 + 61.00 .00 7.00 44.00 + 52.00 .00 9.00 53.00 + 47.00 .00 7.00 40.00 + 47.00 .00 7.00 50.00 +]) + +AT_DATA([stringcat.sps], [dnl +set format=F20.3. +data list notable file='lr-cat2.data' list /read honcomp wiz science *. + +string ses(a1). +recode wiz (7 = "a") (8 = "b") (9 = "c") into ses. + +logistic regression honcomp with read science ses + /categorical = ses. + +]) + +AT_CHECK([pspp -O format=csv stringcat.sps], [0], + [dnl +Table: Dependent Variable Encoding +Original Value,Internal Value +.000,0 +1.000,1 + +Table: Case Processing Summary +Unweighted Cases,N,Percent +Included in Analysis,200,100.000 +Missing Cases,0,.000 +Total,200,100.000 + +note: Estimation terminated at iteration number 5 because parameter estimates changed by less than 0.001 + +Table: Model Summary +Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square +,165.701,.280,.408 + +Table: Categorical Variables' Codings +,,,Parameter coding, +,,Frequency,(1),(2) +ses,a,47,1,0 +,b,95,0,1 +,c,58,0,0 + +Table: Variables in the Equation +,,B,S.E.,Wald,df,Sig.,Exp(B) +Step 1,read,.098,.025,15.199,1,.000,1.103 +,science,.066,.027,5.867,1,.015,1.068 +,ses,,,6.690,2,.035, +,ses(1),.058,.532,.012,1,.913,1.060 +,ses(2),-1.013,.444,5.212,1,.022,.363 +,Constant,-9.561,1.662,33.113,1,.000,.000 +]) + +AT_CLEANUP -- 2.30.2
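A few supplementary notes on the computations introduced by this patch.

Coding of categorical predictors.  Each categorical predictor (or
interaction) with N distinct categories contributes N - 1 columns to the
design matrix.  The categories are sorted by value; each of the first
N - 1 categories receives an indicator column, and the highest-valued
category becomes the reference level, coded as all zeros.  The sketch
below is illustrative only; it is not the PSPP implementation, which
routes this through struct categoricals and
categoricals_get_dummy_code_for_case:

    /* Indicator (dummy) coding: value of coded column COLUMN
       (0-based, COLUMN < N - 1) for a case whose category has
       0-based sorted index CATEGORY.  The last category matches no
       column, so it serves as the reference level.  */
    static double
    dummy_code (int category, int column)
    {
      return category == column ? 1.0 : 0.0;
    }

With the four-valued bcat in lr-cat.sps, a case with bcat = 2 (sorted
index 1) is coded (0, 1, 0) and a case with bcat = 4 (index 3) is coded
(0, 0, 0), exactly as the "Categorical Variables' Codings" table in the
expected output shows.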
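Layout of the coefficient vector.  Following predictor_value(), the
entries of beta_hat are ordered: scalar predictors first, then the dummy
codes of each categorical predictor, then (unless /ORIGIN is given) the
constant in the last position.  The self-contained sketch below, using
illustrative names rather than PSPP APIs, shows how pi_hat() combines a
coded design row with the estimates:

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Inverse logit of the linear predictor x.beta, with the constant
       stored in the last position of BETA, as in pi_hat above.  */
    static double
    pi_hat_sketch (const double *x, const double *beta, size_t n)
    {
      size_t i;
      double eta = beta[n - 1];     /* the constant term */
      for (i = 0; i < n - 1; ++i)
        eta += beta[i] * x[i];
      return 1.0 / (1.0 + exp (-eta));
    }

    int
    main (void)
    {
      /* Estimates from the lr-cat.sps expected output:
         b1, b2, bcat(1), bcat(2), bcat(3), constant.  */
      const double beta[] = { .002, .804, 1.551, .876, .211, -5.541 };
      /* First case of lr-cat.data: b1 = 620, b2 = 3.07, bcat = 2,
         which is coded (0, 1, 0).  */
      const double x[] = { 620, 3.07, 0, 1, 0 };
      printf ("%g\n", pi_hat_sketch (x, beta, 6));
      return 0;
    }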
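The iteration in run_lr().  Each Newton-Raphson step is the standard
iteratively reweighted least squares update for logistic regression:

    \hat\beta_{k+1} = \hat\beta_k + (X^T W_k X)^{-1} X^T (y - \pi_k)

where X is the design matrix (scalar predictors, dummy codes, and the
constant column), \pi_k is the vector of fitted probabilities under
\hat\beta_k, and W_k is diagonal with entries weight * \pi_i (1 - \pi_i).
hessian() accumulates X^T W X, xt_times_y_pi() accumulates X^T (y - \pi),
and gsl_blas_dgemv applies the Cholesky-inverted Hessian to form the
update.  Since work.hessian holds (X^T W X)^{-1} after the final
iteration, the standard error of coefficient i is the square root of its
i-th diagonal element, which is why output_variables() now reads
sqrt (sigma2) from the matrix instead of keeping a separate se vector.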
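Wald statistics.  For a single coefficient the familiar scalar form is
used, W = b^2 / \sigma_b^2, on 1 degree of freedom.  For a categorical
predictor spanning df coefficients, the summary row instead reports

    W = \hat\beta_c^T C^{-1} \hat\beta_c

where \hat\beta_c is the sub-vector of estimates for that predictor's
dummy codes and C is the corresponding df x df submatrix of the estimated
covariance matrix (the inverted Hessian).  Under the null hypothesis that
all df coefficients are zero, W is asymptotically chi-squared with df
degrees of freedom; that is the test tabulated on the predictor's summary
row, e.g. the bcat row with Wald 20.895 on 3 degrees of freedom in the
expected output above.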