--- /dev/null
+/* pspp - a program for statistical analysis.
+ Copyright (C) 2012 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+
+/*
+ References:
+ 1. "Coding Logistic Regression with Newton-Raphson", James McCaffrey
+ http://msdn.microsoft.com/en-us/magazine/jj618304.aspx
+
+ 2. "SPSS Statistical Algorithms" Chapter LOGISTIC REGRESSION Algorithms
+
+
+   The Newton-Raphson method finds successive approximations to $\bf b$ where
+   approximation ${\bf b}_t$ is (hopefully) better than the previous ${\bf b}_{t-1}$.
+
+   $ {\bf b}_t = {\bf b}_{t-1} + ({\bf X}^T{\bf W}_{t-1}{\bf X})^{-1}{\bf X}^T({\bf y} - {\bf \pi}_{t-1})$
+   where:
+
+   $\bf X$ is the $n \times p$ design matrix, $n$ being the number of cases,
+   $p$ the number of parameters, \par
+   ${\bf W}_t$ is the diagonal matrix whose diagonal elements are
+   $\hat{\pi}_0(1 - \hat{\pi}_0), \, \hat{\pi}_1(1 - \hat{\pi}_1), \dots, \hat{\pi}_{n-1}(1 - \hat{\pi}_{n-1})$, \par
+   ${\bf \pi}_t$ is the vector of predicted probabilities, with
+   $\hat{\pi}_i = 1 / (1 + e^{-{\bf x}_i {\bf b}_t})$, and \par
+   $\bf y$ is the vector of observed values (each 0 or 1) of the dependent variable.
+
+   The factor ${\bf X}^T({\bf y} - {\bf \pi})$ is the gradient of the log-likelihood
+   and ${\bf X}^T{\bf W}{\bf X}$ is minus its Hessian, so each iteration is a
+   Newton step towards the maximum likelihood estimates.
+
+*/
+
+#include <config.h>
+
+#include <gsl/gsl_blas.h>
+
+#include <gsl/gsl_linalg.h>
+#include <gsl/gsl_cdf.h>
+#include <gsl/gsl_matrix.h>
+#include <gsl/gsl_vector.h>
+#include <math.h>
+
+#include "data/case.h"
+#include "data/casegrouper.h"
+#include "data/casereader.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/format.h"
+#include "data/value.h"
+#include "language/command.h"
+#include "language/dictionary/split-file.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/value-parser.h"
+#include "language/lexer/variable-parser.h"
+#include "libpspp/assertion.h"
+#include "libpspp/ll.h"
+#include "libpspp/message.h"
+#include "libpspp/misc.h"
+#include "math/categoricals.h"
+#include "output/tab.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+
+
+
+#define PRINT_EACH_STEP 0x01
+#define PRINT_SUMMARY 0x02
+#define PRINT_CORR 0x04
+#define PRINT_ITER 0x08
+#define PRINT_GOODFIT 0x10
+#define PRINT_CI 0x20
+
+
+#define PRINT_DEFAULT (PRINT_SUMMARY | PRINT_EACH_STEP)
+
+/*
+ The constant parameters of the procedure.
+ That is, those which are set by the user.
+*/
+struct lr_spec
+{
+ /* The dependent variable */
+ const struct variable *dep_var;
+
+ size_t n_predictor_vars;
+ const struct variable **predictor_vars;
+
+  /* Which classes of missing values are to be excluded */
+ enum mv_class exclude;
+
+ /* The weight variable */
+ const struct variable *wv;
+
+ const struct dictionary *dict;
+
+ /* True iff the constant (intercept) is to be included in the model */
+ bool constant;
+
+  /* The maximum number of iterations */
+ int max_iter;
+
+ /* Other iteration limiting conditions */
+ double bcon;
+ double min_epsilon;
+ double lcon;
+
+ /* The confidence interval (in percent) */
+ int confidence;
+
+ /* What results should be presented */
+ unsigned int print;
+
+ double cut_point;
+};
+
+/* The results and intermediate results of the procedure.
+   These are mutated as the procedure runs, and also hold
+   temporary working values.
+*/
+struct lr_result
+{
+ bool warn_bad_weight;
+
+
+ /* The two values of the dependent variable. */
+ union value y0;
+ union value y1;
+
+
+ /* The sum of caseweights */
+ double cc;
+
+ casenumber n_missing;
+ casenumber n_nonmissing;
+};
+
+
+/*
+  Convert INPUT into a dichotomous scalar.  For simple cases this is a 1:1 mapping.
+  The return value is always either 0 or 1.
+*/
+static double
+map_dependent_var (const struct lr_spec *cmd, const struct lr_result *res, const union value *input)
+{
+ int width = var_get_width (cmd->dep_var);
+ if (value_equal (input, &res->y0, width))
+ return 0;
+
+ if (value_equal (input, &res->y1, width))
+ return 1;
+
+ NOT_REACHED ();
+
+ return SYSMIS;
+}
+
+
+
+static void output_depvarmap (const struct lr_spec *cmd, const struct lr_result *);
+
+static void output_variables (const struct lr_spec *cmd,
+ const gsl_vector *,
+ const gsl_vector *);
+
+static void output_model_summary (const struct lr_result *,
+ double initial_likelihood, double likelihood);
+
+static void case_processing_summary (const struct lr_result *);
+
+
+/*
+  Return the estimated probability that the dependent variable equals 1
+  (that is, the inverse logit of the linear predictor) corresponding
+  to the coefficient estimates BETA_HAT for case C.
+*/
+static double
+pi_hat (const struct lr_spec *cmd,
+ const gsl_vector *beta_hat,
+ const struct variable **x, size_t n_x,
+ const struct ccase *c)
+{
+ int v0;
+ double pi = 0;
+ for (v0 = 0; v0 < n_x; ++v0)
+ {
+ pi += gsl_vector_get (beta_hat, v0) *
+ case_data (c, x[v0])->f;
+ }
+
+ if (cmd->constant)
+ pi += gsl_vector_get (beta_hat, beta_hat->size - 1);
+
+ pi = 1.0 / (1.0 + exp(-pi));
+
+ return pi;
+}
+
+
+/*
+  Calculates the Hessian matrix X' V X,
+  where: X is the n by p design matrix comprising the n cases in INPUT,
+  V is the diagonal matrix { pi_hat_0 (1 - pi_hat_0), pi_hat_1 (1 - pi_hat_1), ..., pi_hat_{n-1} (1 - pi_hat_{n-1}) }
+  whose entries are the derivatives of the predicted values.
+
+  If ALL the derivatives are close to zero (that is, if every predicted
+  value is close to zero or one), then CONVERGED will be set to true.
+*/
+static gsl_matrix *
+hessian (const struct lr_spec *cmd,
+ struct lr_result *res,
+ struct casereader *input,
+ const struct variable **x, size_t n_x,
+ const gsl_vector *beta_hat,
+ bool *converged
+ )
+{
+ struct casereader *reader;
+ struct ccase *c;
+ gsl_matrix *output = gsl_matrix_calloc (beta_hat->size, beta_hat->size);
+
+ double max_w = -DBL_MAX;
+
+ for (reader = casereader_clone (input);
+ (c = casereader_read (reader)) != NULL; case_unref (c))
+ {
+ int v0, v1;
+ double pi = pi_hat (cmd, beta_hat, x, n_x, c);
+
+ double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
+ double w = pi * (1 - pi);
+ if (w > max_w)
+ max_w = w;
+ w *= weight;
+
+ for (v0 = 0; v0 < beta_hat->size; ++v0)
+ {
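+	  /* For the constant term, the design matrix has an implicit column of 1s. */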
+ double in0 = v0 < n_x ? case_data (c, x[v0])->f : 1.0;
+ for (v1 = 0; v1 < beta_hat->size; ++v1)
+ {
+ double in1 = v1 < n_x ? case_data (c, x[v1])->f : 1.0 ;
+ double *o = gsl_matrix_ptr (output, v0, v1);
+ *o += in0 * w * in1;
+ }
+ }
+ }
+ casereader_destroy (reader);
+
+
+ if ( max_w < cmd->min_epsilon)
+ {
+ *converged = true;
+ msg (MN, _("All predicted values are either 1 or 0"));
+ }
+
+ return output;
+}
+
+
+/* Calculates the value X' (y - pi),
+   where X is the design matrix,
+   y is the vector of observed values of the dependent variable,
+   pi is the vector of estimated probabilities for y.
+
+   As a side effect, the likelihood is stored in *LIKELIHOOD.
+*/
+static gsl_vector *
+xt_times_y_pi (const struct lr_spec *cmd,
+ struct lr_result *res,
+ struct casereader *input,
+ const struct variable **x, size_t n_x,
+ const struct variable *y_var,
+ const gsl_vector *beta_hat,
+ double *likelihood)
+{
+ struct casereader *reader;
+ struct ccase *c;
+ gsl_vector *output = gsl_vector_calloc (beta_hat->size);
+
+ *likelihood = 1.0;
+ for (reader = casereader_clone (input);
+ (c = casereader_read (reader)) != NULL; case_unref (c))
+ {
+ int v0;
+ double pi = pi_hat (cmd, beta_hat, x, n_x, c);
+ double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
+
+
+ double y = map_dependent_var (cmd, res, case_data (c, y_var));
+
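+      /* Each case multiplies the likelihood by pi^y * (1 - pi)^(1 - y),
+         raised to the power of its caseweight. */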
+ *likelihood *= pow (pi, weight * y) * pow (1 - pi, weight * (1 - y));
+
+ for (v0 = 0; v0 < beta_hat->size; ++v0)
+ {
+ double in0 = v0 < n_x ? case_data (c, x[v0])->f : 1.0;
+ double *o = gsl_vector_ptr (output, v0);
+ *o += in0 * (y - pi) * weight;
+ }
+ }
+
+ casereader_destroy (reader);
+
+ return output;
+}
+
+
+/*
+  Makes an initial pass through the data, checks that the dependent variable is
+  dichotomous, and calculates necessary initial values.
+
+  Returns an initial value for \hat\beta, the vector of estimators of \beta.
+*/
+static gsl_vector *
+beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct casereader *input)
+{
+ const int width = var_get_width (cmd->dep_var);
+
+ struct ccase *c;
+ struct casereader *reader;
+ gsl_vector *b0 ;
+ double sum;
+ double sumA = 0.0;
+ double sumB = 0.0;
+
+ bool v0set = false;
+ bool v1set = false;
+
+ size_t n_coefficients = cmd->n_predictor_vars;
+ if (cmd->constant)
+ n_coefficients++;
+
+ b0 = gsl_vector_calloc (n_coefficients);
+
+ res->cc = 0;
+ for (reader = casereader_clone (input);
+ (c = casereader_read (reader)) != NULL; case_unref (c))
+ {
+ int v;
+ bool missing = false;
+ double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
+ const union value *depval = case_data (c, cmd->dep_var);
+
+ for (v = 0; v < cmd->n_predictor_vars; ++v)
+ {
+ const union value *val = case_data (c, cmd->predictor_vars[v]);
+ if (var_is_value_missing (cmd->predictor_vars[v], val, cmd->exclude))
+ {
+ missing = true;
+ break;
+ }
+ }
+
+ if (missing)
+ {
+ res->n_missing++;
+ continue;
+ }
+
+ res->n_nonmissing++;
+
+ if (!v0set)
+ {
+ value_clone (&res->y0, depval, width);
+ v0set = true;
+ }
+ else if (!v1set)
+ {
+ if ( !value_equal (&res->y0, depval, width))
+ {
+ value_clone (&res->y1, depval, width);
+ v1set = true;
+ }
+ }
+ else
+ {
+ if (! value_equal (&res->y0, depval, width)
+ &&
+ ! value_equal (&res->y1, depval, width)
+ )
+ {
+ msg (ME, _("Dependent variable's values are not dichotomous."));
+ goto error;
+ }
+ }
+
+ if (value_equal (&res->y0, depval, width))
+ sumA += weight;
+
+ if (value_equal (&res->y1, depval, width))
+ sumB += weight;
+
+
+ res->cc += weight;
+ }
+ casereader_destroy (reader);
+
+ sum = sumB;
+
+ /* Ensure that Y0 is less than Y1. Otherwise the mapping gets
+ inverted, which is confusing to users */
+ if (var_is_numeric (cmd->dep_var) && value_compare_3way (&res->y0, &res->y1, width) > 0)
+ {
+ union value tmp;
+ value_clone (&tmp, &res->y0, width);
+ value_copy (&res->y0, &res->y1, width);
+ value_copy (&res->y1, &tmp, width);
+ value_destroy (&tmp, width);
+ sum = sumA;
+ }
+
+ if ( cmd->constant)
+ {
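+      /* Start the intercept at the log-odds (logit) of the observed
+         proportion of Y1 cases; the other coefficients start at zero. */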
+ double mean = sum / res->cc;
+ gsl_vector_set (b0, b0->size - 1, log (mean / (1 - mean)));
+ }
+
+ return b0;
+
+ error:
+ casereader_destroy (reader);
+ return NULL;
+}
+
+
+
+static bool
+run_lr (const struct lr_spec *cmd, struct casereader *input,
+ const struct dataset *ds UNUSED)
+{
+ int i,j;
+
+ gsl_vector *beta_hat;
+ gsl_vector *se ;
+
+ bool converged = false;
+ double likelihood;
+ double prev_likelihood = -1;
+ double initial_likelihood ;
+
+ struct lr_result work;
+ work.n_missing = 0;
+ work.n_nonmissing = 0;
+ work.warn_bad_weight = true;
+
+
+  /* Get the initial estimates of \beta */
+ beta_hat = beta_hat_initial (cmd, &work, input);
+ if (NULL == beta_hat)
+ return false;
+
+ output_depvarmap (cmd, &work);
+
+ se = gsl_vector_alloc (beta_hat->size);
+
+ case_processing_summary (&work);
+
+
+ input = casereader_create_filter_missing (input,
+ cmd->predictor_vars,
+ cmd->n_predictor_vars,
+ cmd->exclude,
+ NULL,
+ NULL);
+
+
+  /* Start the Newton-Raphson iteration process... */
+ for( i = 0 ; i < cmd->max_iter ; ++i)
+ {
+ double min, max;
+ gsl_matrix *m ;
+ gsl_vector *v ;
+
+ m = hessian (cmd, &work, input,
+ cmd->predictor_vars, cmd->n_predictor_vars,
+ beta_hat,
+ &converged);
+
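+      /* Invert the Hessian in place via its Cholesky decomposition;
+         X'WX is symmetric and, except in degenerate cases, positive
+         definite. */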
+ gsl_linalg_cholesky_decomp (m);
+ gsl_linalg_cholesky_invert (m);
+
+ v = xt_times_y_pi (cmd, &work, input,
+ cmd->predictor_vars, cmd->n_predictor_vars,
+ cmd->dep_var,
+ beta_hat,
+ &likelihood);
+
+ {
+      /* delta = M.v, i.e. the Newton-Raphson step (X'WX)^{-1} X'(y - pi) */
+ gsl_vector *delta = gsl_vector_alloc (v->size);
+ gsl_blas_dgemv (CblasNoTrans, 1.0, m, v, 0, delta);
+ gsl_vector_free (v);
+
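+	/* The diagonal of the inverted Hessian estimates the variances
+	   of the parameter estimates. */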
+ for (j = 0; j < se->size; ++j)
+ {
+ double *ptr = gsl_vector_ptr (se, j);
+ *ptr = gsl_matrix_get (m, j, j);
+ }
+
+ gsl_matrix_free (m);
+
+ gsl_vector_add (beta_hat, delta);
+
+ gsl_vector_minmax (delta, &min, &max);
+
+ if ( fabs (min) < cmd->bcon && fabs (max) < cmd->bcon)
+ {
+ msg (MN, _("Estimation terminated at iteration number %d because parameter estimates changed by less than %g"),
+ i + 1, cmd->bcon);
+ converged = true;
+ }
+
+ gsl_vector_free (delta);
+ }
+
+ if ( prev_likelihood >= 0)
+ {
+ if (-log (likelihood) > -(1.0 - cmd->lcon) * log (prev_likelihood))
+ {
+ msg (MN, _("Estimation terminated at iteration number %d because Log Likelihood decreased by less than %g%%"), i + 1, 100 * cmd->lcon);
+ converged = true;
+ }
+ }
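+      /* On the first iteration the likelihood is evaluated at the initial
+         estimates, so it serves as the base model likelihood for the
+         R-square statistics. */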
+ if (i == 0)
+ initial_likelihood = likelihood;
+ prev_likelihood = likelihood;
+
+ if (converged)
+ break;
+ }
+ casereader_destroy (input);
+
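+  /* Convert the estimated variances into standard errors. */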
+ for (i = 0; i < se->size; ++i)
+ {
+ double *ptr = gsl_vector_ptr (se, i);
+ *ptr = sqrt (*ptr);
+ }
+
+ output_model_summary (&work, initial_likelihood, likelihood);
+ output_variables (cmd, beta_hat, se);
+
+ gsl_vector_free (beta_hat);
+ gsl_vector_free (se);
+
+ return true;
+}
+
+/* Parse the LOGISTIC REGRESSION command syntax */
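+/* A typical invocation, using the subcommands recognised below, might be:
+
+     LOGISTIC REGRESSION VARIABLES = y WITH x1 x2
+       /CRITERIA = BCON(0.001) ITERATE(20)
+       /PRINT = CI(95).
+*/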
+int
+cmd_logistic (struct lexer *lexer, struct dataset *ds)
+{
+ struct lr_spec lr;
+ lr.dict = dataset_dict (ds);
+ lr.n_predictor_vars = 0;
+ lr.predictor_vars = NULL;
+ lr.exclude = MV_ANY;
+ lr.wv = dict_get_weight (lr.dict);
+ lr.max_iter = 20;
+ lr.lcon = 0.0000;
+ lr.bcon = 0.001;
+ lr.min_epsilon = 0.00000001;
+ lr.cut_point = 0.5;
+ lr.constant = true;
+ lr.confidence = 95;
+ lr.print = PRINT_DEFAULT;
+
+
+ if (lex_match_id (lexer, "VARIABLES"))
+ lex_match (lexer, T_EQUALS);
+
+ if (! (lr.dep_var = parse_variable_const (lexer, lr.dict)))
+ goto error;
+
+  if (! lex_force_match (lexer, T_WITH))
+    goto error;
+
+ if (!parse_variables_const (lexer, lr.dict,
+ &lr.predictor_vars, &lr.n_predictor_vars,
+ PV_NO_DUPLICATE | PV_NUMERIC))
+ goto error;
+
+
+ while (lex_token (lexer) != T_ENDCMD)
+ {
+ lex_match (lexer, T_SLASH);
+
+ if (lex_match_id (lexer, "MISSING"))
+ {
+ lex_match (lexer, T_EQUALS);
+ while (lex_token (lexer) != T_ENDCMD
+ && lex_token (lexer) != T_SLASH)
+ {
+ if (lex_match_id (lexer, "INCLUDE"))
+ {
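+		  /* INCLUDE: treat user-missing values as valid;
+		     only system-missing values are excluded. */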
+ lr.exclude = MV_SYSTEM;
+ }
+ else if (lex_match_id (lexer, "EXCLUDE"))
+ {
+ lr.exclude = MV_ANY;
+ }
+ else
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else if (lex_match_id (lexer, "ORIGIN"))
+ {
+ lr.constant = false;
+ }
+ else if (lex_match_id (lexer, "NOORIGIN"))
+ {
+ lr.constant = true;
+ }
+ else if (lex_match_id (lexer, "NOCONST"))
+ {
+ lr.constant = false;
+ }
+ else if (lex_match_id (lexer, "EXTERNAL"))
+ {
+	  /* This is for compatibility.  It does nothing. */
+ }
+ else if (lex_match_id (lexer, "PRINT"))
+ {
+ lex_match (lexer, T_EQUALS);
+ while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
+ {
+ if (lex_match_id (lexer, "DEFAULT"))
+ {
+ lr.print |= PRINT_DEFAULT;
+ }
+ else if (lex_match_id (lexer, "SUMMARY"))
+ {
+ lr.print |= PRINT_SUMMARY;
+ }
+#if 0
+ else if (lex_match_id (lexer, "CORR"))
+ {
+ lr.print |= PRINT_CORR;
+ }
+ else if (lex_match_id (lexer, "ITER"))
+ {
+ lr.print |= PRINT_ITER;
+ }
+ else if (lex_match_id (lexer, "GOODFIT"))
+ {
+ lr.print |= PRINT_GOODFIT;
+ }
+#endif
+ else if (lex_match_id (lexer, "CI"))
+ {
+ lr.print |= PRINT_CI;
+ if (lex_force_match (lexer, T_LPAREN))
+ {
+ if (! lex_force_int (lexer))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ lr.confidence = lex_integer (lexer);
+ lex_get (lexer);
+ if ( ! lex_force_match (lexer, T_RPAREN))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else if (lex_match_id (lexer, "ALL"))
+ {
+ lr.print = ~0x0000;
+ }
+ else
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else if (lex_match_id (lexer, "CRITERIA"))
+ {
+ lex_match (lexer, T_EQUALS);
+ while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
+ {
+ if (lex_match_id (lexer, "BCON"))
+ {
+ if (lex_force_match (lexer, T_LPAREN))
+ {
+ if (! lex_force_num (lexer))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ lr.bcon = lex_number (lexer);
+ lex_get (lexer);
+ if ( ! lex_force_match (lexer, T_RPAREN))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else if (lex_match_id (lexer, "ITERATE"))
+ {
+ if (lex_force_match (lexer, T_LPAREN))
+ {
+ if (! lex_force_int (lexer))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ lr.max_iter = lex_integer (lexer);
+ lex_get (lexer);
+ if ( ! lex_force_match (lexer, T_RPAREN))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else if (lex_match_id (lexer, "LCON"))
+ {
+ if (lex_force_match (lexer, T_LPAREN))
+ {
+ if (! lex_force_num (lexer))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ lr.lcon = lex_number (lexer);
+ lex_get (lexer);
+ if ( ! lex_force_match (lexer, T_RPAREN))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else if (lex_match_id (lexer, "EPS"))
+ {
+ if (lex_force_match (lexer, T_LPAREN))
+ {
+ if (! lex_force_num (lexer))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ lr.min_epsilon = lex_number (lexer);
+ lex_get (lexer);
+ if ( ! lex_force_match (lexer, T_RPAREN))
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+ }
+ else
+ {
+ lex_error (lexer, NULL);
+ goto error;
+ }
+ }
+
+
+
+ {
+ struct casegrouper *grouper;
+ struct casereader *group;
+ bool ok;
+
+ grouper = casegrouper_create_splits (proc_open (ds), lr.dict);
+ while (casegrouper_get_next_group (grouper, &group))
+ ok = run_lr (&lr, group, ds);
+ ok = casegrouper_destroy (grouper);
+ ok = proc_commit (ds) && ok;
+ }
+
+ free (lr.predictor_vars);
+ return CMD_SUCCESS;
+
+ error:
+
+ free (lr.predictor_vars);
+ return CMD_FAILURE;
+}
+
+
+\f
+
+/* Show the Dependent Variable Encoding box.
+ This indicates how the dependent variable
+ is mapped to the internal zero/one values.
+*/
+static void
+output_depvarmap (const struct lr_spec *cmd, const struct lr_result *res)
+{
+ const int heading_columns = 0;
+ const int heading_rows = 1;
+ struct tab_table *t;
+ struct string str;
+
+ const int nc = 2;
+ int nr = heading_rows + 2;
+
+ t = tab_create (nc, nr);
+ tab_title (t, _("Dependent Variable Encoding"));
+
+ tab_headers (t, heading_columns, 0, heading_rows, 0);
+
+ tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
+
+ tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
+ tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
+
+ tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Original Value"));
+ tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Internal Value"));
+
+
+
+ ds_init_empty (&str);
+ var_append_value_name (cmd->dep_var, &res->y0, &str);
+ tab_text (t, 0, 0 + heading_rows, 0, ds_cstr (&str));
+
+ ds_clear (&str);
+ var_append_value_name (cmd->dep_var, &res->y1, &str);
+ tab_text (t, 0, 1 + heading_rows, 0, ds_cstr (&str));
+
+
+ tab_double (t, 1, 0 + heading_rows, 0, map_dependent_var (cmd, res, &res->y0), &F_8_0);
+ tab_double (t, 1, 1 + heading_rows, 0, map_dependent_var (cmd, res, &res->y1), &F_8_0);
+ ds_destroy (&str);
+
+ tab_submit (t);
+}
+
+
+/* Show the Variables in the Equation box */
+static void
+output_variables (const struct lr_spec *cmd,
+ const gsl_vector *beta,
+ const gsl_vector *se)
+{
+ int row = 0;
+ const int heading_columns = 1;
+ int heading_rows = 1;
+ struct tab_table *t;
+
+ int idx;
+ int n_rows = cmd->n_predictor_vars;
+
+ int nc = 8;
+ int nr ;
+ if (cmd->print & PRINT_CI)
+ {
+ nc += 2;
+ heading_rows += 1;
+ row++;
+ }
+ nr = heading_rows + cmd->n_predictor_vars;
+ if (cmd->constant)
+ nr++;
+
+ t = tab_create (nc, nr);
+ tab_title (t, _("Variables in the Equation"));
+
+ tab_headers (t, heading_columns, 0, heading_rows, 0);
+
+ tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
+
+ tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
+ tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
+
+ tab_text (t, 0, row + 1, TAB_CENTER | TAT_TITLE, _("Step 1"));
+
+ tab_text (t, 2, row, TAB_CENTER | TAT_TITLE, _("B"));
+ tab_text (t, 3, row, TAB_CENTER | TAT_TITLE, _("S.E."));
+ tab_text (t, 4, row, TAB_CENTER | TAT_TITLE, _("Wald"));
+ tab_text (t, 5, row, TAB_CENTER | TAT_TITLE, _("df"));
+ tab_text (t, 6, row, TAB_CENTER | TAT_TITLE, _("Sig."));
+ tab_text (t, 7, row, TAB_CENTER | TAT_TITLE, _("Exp(B)"));
+
+ if (cmd->print & PRINT_CI)
+ {
+ tab_joint_text_format (t, 8, 0, 9, 0,
+ TAB_CENTER | TAT_TITLE, _("%d%% CI for Exp(B)"), cmd->confidence);
+
+ tab_text (t, 8, row, TAB_CENTER | TAT_TITLE, _("Lower"));
+ tab_text (t, 9, row, TAB_CENTER | TAT_TITLE, _("Upper"));
+ }
+
+ if (cmd->constant)
+ n_rows++;
+
+ for (idx = 0 ; idx < n_rows; ++idx)
+ {
+ const int r = idx + heading_rows;
+
+ const double b = gsl_vector_get (beta, idx);
+ const double sigma = gsl_vector_get (se, idx);
+ const double wald = pow2 (b / sigma);
+ const double df = 1;
+
+ if (idx < cmd->n_predictor_vars)
+ tab_text (t, 1, r, TAB_LEFT | TAT_TITLE,
+ var_to_string (cmd->predictor_vars[idx]));
+
+ tab_double (t, 2, r, 0, b, 0);
+ tab_double (t, 3, r, 0, sigma, 0);
+ tab_double (t, 4, r, 0, wald, 0);
+ tab_double (t, 5, r, 0, df, &F_8_0);
+ tab_double (t, 6, r, 0, gsl_cdf_chisq_Q (wald, df), 0);
+ tab_double (t, 7, r, 0, exp (b), 0);
+
+ if (cmd->print & PRINT_CI)
+ {
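+	  /* Half-width of the confidence interval on the log-odds scale:
+	     the standard normal quantile for the requested confidence
+	     level, times the standard error. */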
+ double wc = gsl_cdf_ugaussian_Pinv (0.5 + cmd->confidence / 200.0);
+ wc *= sigma;
+
+ if (idx < cmd->n_predictor_vars)
+ {
+ tab_double (t, 8, r, 0, exp (b - wc), 0);
+ tab_double (t, 9, r, 0, exp (b + wc), 0);
+ }
+ }
+ }
+
+ if ( cmd->constant)
+ tab_text (t, 1, nr - 1, TAB_LEFT | TAT_TITLE, _("Constant"));
+
+ tab_submit (t);
+}
+
+
+/* Show the model summary box */
+static void
+output_model_summary (const struct lr_result *res,
+ double initial_likelihood, double likelihood)
+{
+ const int heading_columns = 0;
+ const int heading_rows = 1;
+ struct tab_table *t;
+
+ const int nc = 4;
+ int nr = heading_rows + 1;
+ double cox;
+
+ t = tab_create (nc, nr);
+ tab_title (t, _("Model Summary"));
+
+ tab_headers (t, heading_columns, 0, heading_rows, 0);
+
+ tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
+
+ tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
+ tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
+
+ tab_text (t, 0, 0, TAB_LEFT | TAT_TITLE, _("Step 1"));
+ tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("-2 Log likelihood"));
+ tab_double (t, 1, 1, 0, -2 * log (likelihood), 0);
+
+
+ tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Cox & Snell R Square"));
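+  /* Cox & Snell R^2 = 1 - (L_0 / L_m)^(2/n), where L_0 is the base model
+     likelihood, L_m the fitted model likelihood and n the sum of caseweights. */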
+  cox = 1.0 - pow (initial_likelihood / likelihood, 2 / res->cc);
+ tab_double (t, 2, 1, 0, cox, 0);
+
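+  /* Nagelkerke R^2 rescales Cox & Snell by its maximum possible value,
+     1 - L_0^(2/n). */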
+ tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Nagelkerke R Square"));
+ tab_double (t, 3, 1, 0, cox / ( 1.0 - pow (initial_likelihood, 2 / res->cc)), 0);
+
+
+ tab_submit (t);
+}
+
+/* Show the case processing summary box */
+static void
+case_processing_summary (const struct lr_result *res)
+{
+ const int heading_columns = 1;
+ const int heading_rows = 1;
+ struct tab_table *t;
+
+ const int nc = 3;
+ const int nr = heading_rows + 3;
+ casenumber total;
+
+ t = tab_create (nc, nr);
+ tab_title (t, _("Case Processing Summary"));
+
+ tab_headers (t, heading_columns, 0, heading_rows, 0);
+
+ tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
+
+ tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
+ tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
+
+ tab_text (t, 0, 0, TAB_LEFT | TAT_TITLE, _("Unweighted Cases"));
+ tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("N"));
+ tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Percent"));
+
+
+ tab_text (t, 0, 1, TAB_LEFT | TAT_TITLE, _("Included in Analysis"));
+ tab_text (t, 0, 2, TAB_LEFT | TAT_TITLE, _("Missing Cases"));
+ tab_text (t, 0, 3, TAB_LEFT | TAT_TITLE, _("Total"));
+
+ tab_double (t, 1, 1, 0, res->n_nonmissing, &F_8_0);
+ tab_double (t, 1, 2, 0, res->n_missing, &F_8_0);
+
+ total = res->n_nonmissing + res->n_missing;
+ tab_double (t, 1, 3, 0, total , &F_8_0);
+
+ tab_double (t, 2, 1, 0, 100 * res->n_nonmissing / (double) total, 0);
+ tab_double (t, 2, 2, 0, 100 * res->n_missing / (double) total, 0);
+ tab_double (t, 2, 3, 0, 100 * total / (double) total, 0);
+
+ tab_submit (t);
+}
+
--- /dev/null
+AT_BANNER([LOGISTIC REGRESSION])
+
+dnl These examples are adapted from
+dnl http://www.uvm.edu/~dhowell/gradstat/psych341/lectures/Logistic%20Regression/LogisticReg1.html
+
+
+
+m4_define([LOGIT_TEST_DATA],
+ [AT_DATA([lr-data.txt], dnl
+ 105.00 1.00 33.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
+ 106.00 1.00 50.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
+ 107.00 1.00 91.00 3.00 2.00 .28 15.00 7.00 .00897 -1.00905 1
+ 108.00 1.00 90.00 3.00 2.00 .20 2.00 2.00 .00972 -1.00982 1
+ 109.00 1.00 70.00 3.00 3.00 .38 23.00 27.00 .04745 -1.04981 1
+ 111.00 2.00 31.00 2.00 2.00 .00 19.00 10.00 .54159 1.84640 1
+ 112.00 1.00 91.00 2.00 3.00 .18 6.00 16.00 .00897 -1.00905 1
+ 113.00 1.00 81.00 3.00 2.00 .00 3.00 9.00 .01998 -1.02039 1
+ 114.00 2.00 15.00 1.00 2.00 .13 19.00 13.00 .81241 1.23090 1
+ 116.00 2.00 1.00 1.00 2.00 .88 15.00 7.00 .93102 1.07410 1
+ 117.00 1.00 93.00 3.00 2.00 .18 9.00 15.00 .00764 -1.00770 1
+ 118.00 2.00 14.00 1.00 3.00 .15 23.00 18.00 .82447 1.21289 1
+ 120.00 1.00 91.00 2.00 2.00 .43 17.00 14.00 .00897 -1.00905 1
+ 121.00 1.00 55.00 3.00 2.00 .69 20.00 14.00 .14409 -1.16834 1
+ 122.00 1.00 70.00 2.00 3.00 .03 .00 6.00 .04745 -1.04981 1
+ 123.00 1.00 25.00 2.00 2.00 .45 4.00 10.00 .65789 -2.92301 1
+ 125.00 1.00 91.00 2.00 2.00 .13 .00 3.00 .00897 -1.00905 1
+ 126.00 1.00 91.00 3.00 3.00 .23 4.00 6.00 .00897 -1.00905 1
+ 127.00 1.00 91.00 3.00 2.00 .00 8.00 8.00 .00897 -1.00905 1
+ 128.00 2.00 13.00 2.00 2.00 .65 16.00 14.00 .83592 1.19629 1
+ 129.00 1.00 50.00 2.00 2.00 .25 20.00 23.00 .20168 -1.25264 1
+ 135.00 1.00 90.00 3.00 3.00 .03 5.00 12.00 .00972 -1.00982 1
+ 138.00 1.00 70.00 3.00 3.00 .10 1.00 6.00 .04745 -1.04981 1
+ 139.00 2.00 19.00 3.00 3.00 .10 11.00 12.00 .75787 1.31949 1
+ 149.00 2.00 50.00 3.00 2.00 .03 .00 .00 .20168 4.95826 1
+ 204.00 1.00 50.00 3.00 1.00 .13 .00 1.00 .20168 -1.25264 1
+ 205.00 1.00 91.00 3.00 3.00 .72 16.00 18.00 .00897 -1.00905 1
+ 206.00 2.00 24.00 1.00 1.00 .10 5.00 21.00 .67592 1.47947 1
+ 207.00 1.00 80.00 3.00 3.00 .13 6.00 7.00 .02164 -1.02212 1
+ 208.00 1.00 87.00 2.00 2.00 .18 9.00 20.00 .01237 -1.01253 1
+ 209.00 1.00 70.00 2.00 2.00 .53 15.00 12.00 .04745 -1.04981 1
+ 211.00 1.00 55.00 2.00 1.00 .33 8.00 5.00 .14409 -1.16834 1
+ 212.00 1.00 56.00 3.00 1.00 .30 6.00 20.00 .13436 -1.15522 1
+ 214.00 1.00 54.00 2.00 2.00 .15 .00 16.00 .15439 -1.18258 1
+ 215.00 1.00 71.00 3.00 3.00 .35 12.00 12.00 .04391 -1.04592 1
+ 217.00 2.00 36.00 1.00 1.00 .10 12.00 8.00 .44049 2.27020 1
+ 218.00 1.00 91.00 2.00 2.00 .05 11.00 25.00 .00897 -1.00905 1
+ 219.00 1.00 91.00 2.00 2.00 1.23 11.00 24.00 .00897 -1.00905 1
+ 220.00 1.00 91.00 2.00 3.00 .08 8.00 11.00 .00897 -1.00905 1
+ 221.00 1.00 91.00 2.00 2.00 .33 5.00 11.00 .00897 -1.00905 1
+ 222.00 2.00 36.00 2.00 1.00 .18 5.00 3.00 .44049 2.27020 1
+ 223.00 1.00 70.00 2.00 3.00 .18 14.00 3.00 .04745 -1.04981 1
+ 224.00 1.00 91.00 2.00 2.00 .43 2.00 10.00 .00897 -1.00905 1
+ 225.00 1.00 55.00 2.00 1.00 .18 6.00 11.00 .14409 -1.16834 1
+ 229.00 2.00 75.00 2.00 2.00 .40 30.00 25.00 .03212 31.12941 1
+ 232.00 1.00 91.00 3.00 2.00 .15 6.00 3.00 .00897 -1.00905 1
+ 233.00 1.00 70.00 2.00 1.00 .00 11.00 8.00 .04745 -1.04981 1
+ 234.00 1.00 54.00 3.00 2.00 .10 .00 .00 .15439 -1.18258 1
+ 237.00 1.00 70.00 3.00 2.00 .18 5.00 25.00 .04745 -1.04981 1
+ 241.00 1.00 19.00 2.00 3.00 .33 13.00 9.00 .75787 -4.12995 1
+ 304.00 2.00 18.00 2.00 2.00 .26 25.00 6.00 .77245 1.29458 1
+ 305.00 1.00 88.00 3.00 2.00 1.35 17.00 29.00 .01142 -1.01155 1
+ 306.00 1.00 70.00 2.00 3.00 .63 14.00 33.00 .04745 -1.04981 1
+ 307.00 1.00 85.00 2.00 2.00 2.65 18.00 14.00 .01452 -1.01474 1
+ 308.00 1.00 13.00 2.00 2.00 .23 5.00 5.00 .83592 -6.09442 1
+ 309.00 2.00 13.00 2.00 2.00 .23 7.00 17.00 .83592 1.19629 1
+ 311.00 2.00 1.00 2.00 2.00 .50 20.00 14.00 .93102 1.07410 1
+ 315.00 1.00 19.00 2.00 3.00 .18 1.00 11.00 .75787 -4.12995 1
+ 316.00 1.00 88.00 2.00 2.00 .38 12.00 11.00 .01142 -1.01155 2
+ 318.00 1.00 88.00 3.00 2.00 .03 5.00 5.00 .01142 -1.01155 3
+ 319.00 2.00 18.00 2.00 3.00 .30 15.00 16.00 .77245 1.29458 1
+ 321.00 2.00 15.00 2.00 2.00 .63 15.00 18.00 .81241 1.23090 1
+ 322.00 1.00 88.00 3.00 2.00 .40 18.00 15.00 .01142 -1.01155 1
+ 325.00 2.00 18.00 2.00 3.00 1.00 28.00 18.00 .77245 1.29458 1
+ 329.00 1.00 88.00 3.00 2.00 .03 7.00 11.00 .01142 -1.01155 4
+ 332.00 2.00 2.00 2.00 2.00 .05 8.00 9.00 .92562 1.08036 1
+)])
+
+dnl Note: In the above data, cases 305, 316, 318 and 329 have identical values
+dnl of the 2nd and 3rd variables.  We use this for weight testing.
+
+AT_SETUP([LOGISTIC REGRESSION basic test])
+
+LOGIT_TEST_DATA
+
+AT_DATA([lr-data.sps], [dnl
+set format = F12.3.
+set decimal dot.
+data list notable file='lr-data.txt'
+ list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
+
+logistic regression
+ variables = outcome with survrate
+ .
+])
+
+AT_CHECK([pspp -O format=csv lr-data.sps], [0],
+ [dnl
+Table: Dependent Variable Encoding
+Original Value,Internal Value
+1.000,0
+2.000,1
+
+Table: Case Processing Summary
+Unweighted Cases,N,Percent
+Included in Analysis,66,100.000
+Missing Cases,0,.000
+Total,66,100.000
+
+note: Estimation terminated at iteration number 6 because parameter estimates changed by less than 0.001
+
+Table: Model Summary
+Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
+,37.323,.455,.659
+
+Table: Variables in the Equation
+,,B,S.E.,Wald,df,Sig.,Exp(B)
+Step 1,survrate,-.081,.019,17.756,1,.000,.922
+,Constant,2.684,.811,10.941,1,.001,14.639
+])
+
+
+AT_CLEANUP
+
+AT_SETUP([LOGISTIC REGRESSION missing values])
+
+LOGIT_TEST_DATA
+
+AT_DATA([lr-data.sps], [dnl
+set format = F12.3.
+set decimal dot.
+data list notable file='lr-data.txt'
+ list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
+
+missing values survrate (999) avoid (44444).
+
+logistic regression
+ variables = outcome with survrate avoid
+ .
+])
+
+AT_CHECK([pspp -O format=csv lr-data.sps > run0], [0], [ignore])
+
+cat >> lr-data.txt << HERE
+ 105.00 1.00 999.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
+ 106.00 1.00 999.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
+ 107.00 1.00 5.00 3.00 2.00 .28 44444 34 .00897 -1.00905 1
+HERE
+
+AT_CHECK([pspp -O format=csv lr-data.sps > run1], [0], [ignore])
+
+dnl Only the summary information should be different
+AT_CHECK([diff run0 run1], [1], [dnl
+8,10c8,10
+< Included in Analysis,66,100.000
+< Missing Cases,0,.000
+< Total,66,100.000
+---
+> Included in Analysis,66,95.652
+> Missing Cases,3,4.348
+> Total,69,100.000
+])
+
+AT_CLEANUP
+
+
+
+dnl Check that a weighted dataset is interpreted correctly
+dnl To do this, the same data set is used, one weighted, one not.
+dnl The weighted dataset omits certain cases which are identical
+AT_SETUP([LOGISTIC REGRESSION weights])
+
+LOGIT_TEST_DATA
+
+AT_DATA([lr-data-unweighted.sps], [dnl
+set format = F12.3.
+set decimal dot.
+data list notable file='lr-data.txt'
+ list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
+
+logistic regression
+ variables = outcome with survrate
+ .
+])
+
+AT_DATA([lr-data-weighted.sps], [dnl
+set format = F12.3.
+set decimal dot.
+data list notable file='lr-data.txt'
+ list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
+
+weight by w.
+
+* Omit duplicate cases.
+select if id <> 305 and id <> 316 and id <> 318.
+
+logistic regression
+ variables = outcome with survrate
+ .
+])
+
+
+AT_CHECK([pspp -O format=csv lr-data-unweighted.sps > unweighted-result], [0], [ignore])
+AT_CHECK([pspp -O format=csv lr-data-weighted.sps > weighted-result], [0], [ignore])
+
+dnl The only difference should be the summary information, since
+dnl this displays the unweighted totals.
+AT_CHECK([diff unweighted-result weighted-result], [1], [dnl
+8c8
+< Included in Analysis,66,100.000
+---
+> Included in Analysis,63,100.000
+10c10
+< Total,66,100.000
+---
+> Total,63,100.000
+])
+
+
+AT_CLEANUP
+
+
+dnl Check that the /NOCONST option works as intended.
+dnl The results this produces are very similar to those
+dnl at the example in http://www.ats.ucla.edu/stat/SPSS/faq/logregconst.htm
+AT_SETUP([LOGISTIC REGRESSION without constant])
+
+AT_DATA([non-const.sps], [dnl
+set format=F20.3.
+
+input program.
+ loop #i = 1 to 200.
+ compute female = (#i > 91).
+ end case.
+ end loop.
+end file.
+end input program.
+
+compute constant = 1.
+
+logistic regression female with constant /noconst.
+])
+
+AT_CHECK([pspp -O format=csv non-const.sps], [0],
+ [dnl
+Table: Dependent Variable Encoding
+Original Value,Internal Value
+.00,0
+1.00,1
+
+Table: Case Processing Summary
+Unweighted Cases,N,Percent
+Included in Analysis,200,100.000
+Missing Cases,0,.000
+Total,200,100.000
+
+note: Estimation terminated at iteration number 2 because parameter estimates changed by less than 0.001
+
+Table: Model Summary
+Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
+,275.637,.008,.011
+
+Table: Variables in the Equation
+,,B,S.E.,Wald,df,Sig.,Exp(B)
+Step 1,constant,.180,.142,1.616,1,.204,1.198
+])
+
+AT_CLEANUP
+
+
+
+dnl Check that if somebody passes a dependent variable which is not dichotomous,
+dnl then an error is raised.
+AT_SETUP([LOGISTIC REGRESSION non-dichotomous dep var])
+
+AT_DATA([non-dich.sps], [dnl
+data list notable list /y x1 x2 x3 x4.
+begin data.
+1 2 3 4 5
+0 2 3 4 8
+2 3 4 5 6
+end data.
+
+logistic regression y with x1 x2 x3 x4.
+])
+
+AT_CHECK([pspp -O format=csv non-dich.sps], [1],
+ [dnl
+error: Dependent variable's values are not dichotomous.
+])
+
+AT_CLEANUP
\ No newline at end of file