Logistic Regression: Added categorical variable support

author John Darrington <john@darrington.wattle.id.au>

Sun, 11 Nov 2012 13:19:05 +0000 (14:19 +0100)

committer John Darrington <john@darrington.wattle.id.au>

Sun, 11 Nov 2012 13:19:05 +0000 (14:19 +0100)
author John Darrington <john@darrington.wattle.id.au>
Sun, 11 Nov 2012 13:19:05 +0000 (14:19 +0100)
committer John Darrington <john@darrington.wattle.id.au>
Sun, 11 Nov 2012 13:19:05 +0000 (14:19 +0100)
diff --git a/doc/statistics.texi b/doc/statistics.texi

index 6e8b5c67a41ed07093d8192aa9da0fe798135545..5723323337559ec4076be50b300d9b8152e1a1c2 100644 (file)
--- a/doc/statistics.texi
+++ b/doc/statistics.texi
@@ -732,7 +732,9 @@ The default is @subcmd{LISTWISE}.
  @cindex bivariate logistic regression
  
  @display
-LOGISTIC REGRESSION [VARIABLES =] @var{dependent_var} WITH @var{var_list}
+LOGISTIC REGRESSION [VARIABLES =] @var{dependent_var} WITH @var{predictors}
+
+     [/CATEGORICAL = @var{categorical_predictors}]
  
       [@{/NOCONST | /ORIGIN | /NOORIGIN @}]
  
@@ -763,6 +765,10 @@ Hence, the full model is
  + \dots
  + b_n {\bf x_n}
  }
+
+Predictor variables which are categorical in nature should be listed on the @subcmd{/CATEGORICAL} subcommand.
+Simple variables as well as interactions between variables may be listed here.
+
  If you want a model without the constant term @math{b_0}, use the keyword @subcmd{/ORIGIN}.
  @subcmd{/NOCONST} is a synonym for @subcmd{/ORIGIN}.
  
diff --git a/src/language/stats/logistic.c b/src/language/stats/logistic.c

index 93472e0294f78243aff4b7c41a2258deaf4e7bee..91a16488bd2d548716487425ecfd804b4e2b5754 100644 (file)
--- a/src/language/stats/logistic.c
+++ b/src/language/stats/logistic.c
@@ -64,6 +64,8 @@
  #include "libpspp/message.h"
  #include "libpspp/misc.h"
  #include "math/categoricals.h"
+#include "math/interaction.h"
+
  #include "output/tab.h"
  
  #include "gettext.h"
@@ -91,8 +93,13 @@ struct lr_spec
    /* The dependent variable */
    const struct variable *dep_var;
  
-  size_t n_predictor_vars;
+  /* The predictor variables (excluding categorical ones) */
    const struct variable **predictor_vars;
+  size_t n_predictor_vars;
+
+  /* The categorical predictors */
+  struct interaction **cat_predictors;
+  size_t n_cat_predictors;
  
    /* Which classes of missing vars are to be excluded */
    enum mv_class exclude;
@@ -100,6 +107,7 @@ struct lr_spec
    /* The weight variable */
    const struct variable *wv;
  
+  /* The dictionary of the dataset */
    const struct dictionary *dict;
  
    /* True iff the constant (intercept) is to be included in the model */
@@ -122,15 +130,17 @@ struct lr_spec
    double cut_point;
  };
  
+
  /* The results and intermediate result of the procedure.
     These are mutated as the procedure runs. Used for
     temporary variables etc.
  */
  struct lr_result
  {
+  /* Used to indicate if a pass should flag a warning when
+     invalid (i.e. negative or missing) weight values are encountered */
    bool warn_bad_weight;
  
-
    /* The two values of the dependent variable. */
    union value y0;
    union value y1;
@@ -139,36 +149,49 @@ struct lr_result
    /* The sum of caseweights */
    double cc;
  
+  /* The number of missing and nonmissing cases */
    casenumber n_missing;
    casenumber n_nonmissing;
+
+
+  gsl_matrix *hessian;
+
+  /* The categoricals and their payload. Null if the analysis has no
+   categorical predictors */
+  struct categoricals *cats;
+  struct payload cp;
  };
  
  
  /*
-  Convert INPUT into a dichotomous scalar.  For simple cases, this is a 1:1 mapping
+  Convert INPUT into a dichotomous scalar, according to how the dependent variable's
+  values are mapped.
+  For simple cases, this is a 1:1 mapping
    The return value is always either 0 or 1
  */
  static double
  map_dependent_var (const struct lr_spec *cmd, const struct lr_result *res, const union value *input)
  {
-  int width = var_get_width (cmd->dep_var);
+  const int width = var_get_width (cmd->dep_var);
    if (value_equal (input, &res->y0, width))
      return 0;
  
    if (value_equal (input, &res->y1, width))
      return 1;
-         
+
+  /* This should never happen.  If it does,  then y0 and/or y1 have probably not been set */
    NOT_REACHED ();
  
    return SYSMIS;
  }
  
  
+static void output_categories (const struct lr_spec *cmd, const struct lr_result *res);
  
  static void output_depvarmap (const struct lr_spec *cmd, const struct lr_result *);
  
  static void output_variables (const struct lr_spec *cmd, 
-                             const gsl_vector *,
+                             const struct lr_result *,
                               const gsl_vector *);
  
  static void output_model_summary (const struct lr_result *,
@@ -177,27 +200,57 @@ static void output_model_summary (const struct lr_result *,
  static void case_processing_summary (const struct lr_result *);
  
  
+/* Return the value of case C corresponding to the INDEX'th entry in the
+   model */
+static double
+predictor_value (const struct ccase *c, 
+                    const struct variable **x, size_t n_x, 
+                    const struct categoricals *cats,
+                    size_t index)
+{
+  /* Values of the scalar predictor variables */
+  if (index < n_x) 
+    return case_data (c, x[index])->f;
+
+  /* Coded values of categorical predictor variables (or interactions) */
+  if (cats && index - n_x  < categoricals_df_total (cats))
+    {
+      double x = categoricals_get_dummy_code_for_case (cats, index - n_x, c);
+      return x;
+    }
+
+  /* The constant term */
+  return 1.0;
+}
+
+
  /*
    Return the probability estimator (that is the estimator of logit(y) )
    corresponding to the coefficient estimator beta_hat for case C
  */
  static double 
  pi_hat (const struct lr_spec *cmd, 
+       struct lr_result *res,
         const gsl_vector *beta_hat,
         const struct variable **x, size_t n_x,
         const struct ccase *c)
  {
    int v0;
    double pi = 0;
-  for (v0 = 0; v0 < n_x; ++v0)
+  size_t n_coeffs = beta_hat->size;
+
+  if (cmd->constant)
+    {
+      pi += gsl_vector_get (beta_hat, beta_hat->size - 1);
+      n_coeffs--;
+    }
+  
+  for (v0 = 0; v0 < n_coeffs; ++v0)
      {
        pi += gsl_vector_get (beta_hat, v0) * 
-       case_data (c, x[v0])->f;
+       predictor_value (c, x, n_x, res->cats, v0);
      }
  
-  if (cmd->constant)
-    pi += gsl_vector_get (beta_hat, beta_hat->size - 1);
-
    pi = 1.0 / (1.0 + exp(-pi));
  
    return pi;
@@ -213,26 +266,26 @@ pi_hat (const struct lr_spec *cmd,
    If ALL predicted values derivatives are close to zero or one, then CONVERGED
    will be set to true.
  */
-static gsl_matrix *
+static void
  hessian (const struct lr_spec *cmd, 
          struct lr_result *res,
          struct casereader *input,
          const struct variable **x, size_t n_x,
          const gsl_vector *beta_hat,
-        bool *converged
-        )
+        bool *converged)
  {
    struct casereader *reader;
    struct ccase *c;
-  gsl_matrix *output = gsl_matrix_calloc (beta_hat->size, beta_hat->size);
  
    double max_w = -DBL_MAX;
  
+  gsl_matrix_set_zero (res->hessian);
+
    for (reader = casereader_clone (input);
         (c = casereader_read (reader)) != NULL; case_unref (c))
      {
        int v0, v1;
-      double pi = pi_hat (cmd, beta_hat, x, n_x, c);
+      double pi = pi_hat (cmd, res, beta_hat, x, n_x, c);
  
        double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
        double w = pi * (1 - pi);
@@ -242,25 +295,22 @@ hessian (const struct lr_spec *cmd,
  
        for (v0 = 0; v0 < beta_hat->size; ++v0)
         {
-         double in0 = v0 < n_x ? case_data (c, x[v0])->f : 1.0;
+         double in0 = predictor_value (c, x, n_x, res->cats, v0);
           for (v1 = 0; v1 < beta_hat->size; ++v1)
             {
-             double in1 = v1 < n_x ? case_data (c, x[v1])->f : 1.0 ;
-             double *o = gsl_matrix_ptr (output, v0, v1);
+             double in1 = predictor_value (c, x, n_x, res->cats, v1);
+             double *o = gsl_matrix_ptr (res->hessian, v0, v1);
               *o += in0 * w * in1;
             }
         }
      }
    casereader_destroy (reader);
  
-
    if ( max_w < cmd->min_epsilon)
      {
        *converged = true;
        msg (MN, _("All predicted values are either 1 or 0"));
      }
-
-  return output;
  }
  
  
@@ -289,7 +339,7 @@ xt_times_y_pi (const struct lr_spec *cmd,
         (c = casereader_read (reader)) != NULL; case_unref (c))
      {
        int v0;
-      double pi = pi_hat (cmd, beta_hat, x, n_x, c);
+      double pi = pi_hat (cmd, res, beta_hat, x, n_x, c);
        double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
  
  
@@ -299,7 +349,7 @@ xt_times_y_pi (const struct lr_spec *cmd,
  
        for (v0 = 0; v0 < beta_hat->size; ++v0)
         {
-         double in0 = v0 < n_x ? case_data (c, x[v0])->f : 1.0;
+         double in0 = predictor_value (c, x, n_x, res->cats, v0);
           double *o = gsl_vector_ptr (output, v0);
           *o += in0 * (y - pi) * weight;
         }
@@ -310,10 +360,42 @@ xt_times_y_pi (const struct lr_spec *cmd,
    return output;
  }
  
+\f
+
+/* "payload" functions for the categoricals.
+   Their only purpose is to accumulate the frequency of each
+   category.
+ */
+
+static void *
+frq_create  (const void *aux1 UNUSED, void *aux2 UNUSED)
+{
+  return xzalloc (sizeof (double));
+}
+
+static void
+frq_update  (const void *aux1 UNUSED, void *aux2 UNUSED,
+            void *ud, const struct ccase *c UNUSED , double weight)
+{
+  double *freq = ud;
+  *freq += weight;
+}
+
+static void 
+frq_destroy (const void *aux1 UNUSED, void *aux2 UNUSED, void *user_data UNUSED)
+{
+  free (user_data);
+}
+
+\f
  
  /* 
-   Makes an initial pass though the data, checks that the dependent variable is
-   dichotomous, and calculates necessary initial values.
+   Makes an initial pass through the data, doing the following:
+
+   * Checks that the dependent variable is dichotomous,
+   * Creates and initialises the categoricals,
+   * Accumulates summary results,
+   * Calculates necessary initial values.
  
     Returns an initial value for \hat\beta the vector of estimators of \beta
  */
@@ -336,7 +418,19 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
    if (cmd->constant)
      n_coefficients++;
  
-  b0 = gsl_vector_calloc (n_coefficients);
+  /* Create categoricals if appropriate */
+  if (cmd->n_cat_predictors > 0)
+    {
+      res->cp.create = frq_create;
+      res->cp.update = frq_update;
+      res->cp.calculate = NULL;
+      res->cp.destroy = frq_destroy;
+
+      res->cats = categoricals_create (cmd->cat_predictors, cmd->n_cat_predictors,
+                                      cmd->wv, cmd->exclude, MV_ANY);
+
+      categoricals_set_payload (res->cats, &res->cp, cmd, res);
+    }
  
    res->cc = 0;
    for (reader = casereader_clone (input);
@@ -357,14 +451,15 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
             }
         }
  
+      /* Accumulate the missing and non-missing counts */
        if (missing)
         {
           res->n_missing++;
           continue;
         }
-
        res->n_nonmissing++;
  
+      /* Find the values of the dependent variable */
        if (!v0set)
         {
           value_clone (&res->y0, depval, width);
@@ -398,9 +493,13 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
  
  
        res->cc += weight;
+
+      categoricals_update (res->cats, c);
      }
    casereader_destroy (reader);
  
+  categoricals_done (res->cats);
+
    sum = sumB;
  
    /* Ensure that Y0 is less than Y1.  Otherwise the mapping gets
@@ -415,6 +514,9 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
        sum = sumA;
      }
  
+  n_coefficients += categoricals_df_total (res->cats);
+  b0 = gsl_vector_calloc (n_coefficients);
+
    if ( cmd->constant)
      {
        double mean = sum / res->cc;
@@ -430,16 +532,18 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser
  
  
  
+/* Start of the logistic regression routine proper */
  static bool
  run_lr (const struct lr_spec *cmd, struct casereader *input,
         const struct dataset *ds UNUSED)
  {
-  int i,j;
+  int i;
  
    gsl_vector *beta_hat;
-  gsl_vector *se ;
  
    bool converged = false;
+
+  /* Set the likelihoods to a negative sentinel value */
    double likelihood = -1;
    double prev_likelihood = -1;
    double initial_likelihood = -1;
@@ -448,6 +552,7 @@ run_lr (const struct lr_spec *cmd, struct casereader *input,
    work.n_missing = 0;
    work.n_nonmissing = 0;
    work.warn_bad_weight = true;
+  work.cats = NULL;
  
  
    /* Get the initial estimates of \beta and their standard errors */
@@ -457,8 +562,6 @@ run_lr (const struct lr_spec *cmd, struct casereader *input,
  
    output_depvarmap (cmd, &work);
  
-  se = gsl_vector_alloc (beta_hat->size);
-
    case_processing_summary (&work);
  
  
@@ -470,20 +573,22 @@ run_lr (const struct lr_spec *cmd, struct casereader *input,
                                             NULL);
  
  
+  work.hessian = gsl_matrix_calloc (beta_hat->size, beta_hat->size);
+
    /* Start the Newton Raphson iteration process... */
    for( i = 0 ; i < cmd->max_iter ; ++i)
      {
        double min, max;
-      gsl_matrix *m ;
        gsl_vector *v ;
  
-      m = hessian (cmd, &work, input,
+      
+      hessian (cmd, &work, input,
                    cmd->predictor_vars, cmd->n_predictor_vars,
                    beta_hat,
                    &converged);
  
-      gsl_linalg_cholesky_decomp (m);
-      gsl_linalg_cholesky_invert (m);
+      gsl_linalg_cholesky_decomp (work.hessian);
+      gsl_linalg_cholesky_invert (work.hessian);
  
        v = xt_times_y_pi (cmd, &work, input,
                          cmd->predictor_vars, cmd->n_predictor_vars,
@@ -494,16 +599,9 @@ run_lr (const struct lr_spec *cmd, struct casereader *input,
        {
         /* delta = M.v */
         gsl_vector *delta = gsl_vector_alloc (v->size);
-       gsl_blas_dgemv (CblasNoTrans, 1.0, m, v, 0, delta);
+       gsl_blas_dgemv (CblasNoTrans, 1.0, work.hessian, v, 0, delta);
         gsl_vector_free (v);
  
-       for (j = 0; j < se->size; ++j)
-         {
-           double *ptr = gsl_vector_ptr (se, j);
-           *ptr = gsl_matrix_get (m, j, j);
-         }
-
-       gsl_matrix_free (m);
  
         gsl_vector_add (beta_hat, delta);
  
@@ -537,17 +635,21 @@ run_lr (const struct lr_spec *cmd, struct casereader *input,
    casereader_destroy (input);
    assert (initial_likelihood >= 0);
  
-  for (i = 0; i < se->size; ++i)
-    {
-      double *ptr = gsl_vector_ptr (se, i);
-      *ptr = sqrt (*ptr);
-    }
+  if ( ! converged) 
+    msg (MW, _("Estimation terminated at iteration number %d because maximum iterations has been reached"), i );
+
  
    output_model_summary (&work, initial_likelihood, likelihood);
-  output_variables (cmd, beta_hat, se);
  
+  if (work.cats)
+    output_categories (cmd, &work);
+
+  output_variables (cmd, &work, beta_hat);
+
+  gsl_matrix_free (work.hessian);
    gsl_vector_free (beta_hat); 
-  gsl_vector_free (se);
+  
+  categoricals_destroy (work.cats);
  
    return true;
  }
@@ -556,6 +658,12 @@ run_lr (const struct lr_spec *cmd, struct casereader *input,
  int
  cmd_logistic (struct lexer *lexer, struct dataset *ds)
  {
+  /* Temporary location for the predictor variables.
+     These may or may not include the categorical predictors */
+  const struct variable **pred_vars;
+  size_t n_pred_vars;
+
+  int v, x;
    struct lr_spec lr;
    lr.dict = dataset_dict (ds);
    lr.n_predictor_vars = 0;
@@ -570,6 +678,9 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds)
    lr.constant = true;
    lr.confidence = 95;
    lr.print = PRINT_DEFAULT;
+  lr.cat_predictors = NULL;
+  lr.n_cat_predictors = 0;
+
  
  
    if (lex_match_id (lexer, "VARIABLES"))
@@ -581,8 +692,8 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds)
    lex_force_match (lexer, T_WITH);
  
    if (!parse_variables_const (lexer, lr.dict,
-                             &lr.predictor_vars, &lr.n_predictor_vars,
-                             PV_NO_DUPLICATE | PV_NUMERIC))
+                             &pred_vars, &n_pred_vars,
+                             PV_NO_DUPLICATE))
      goto error;
  
  
@@ -627,6 +738,19 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds)
         {
           /* This is for compatibility.  It does nothing */
         }
+      else if (lex_match_id (lexer, "CATEGORICAL"))
+       {
+         lex_match (lexer, T_EQUALS);
+         do
+           {
+             lr.cat_predictors = xrealloc (lr.cat_predictors,
+                                 sizeof (*lr.cat_predictors) * ++lr.n_cat_predictors);
+             lr.cat_predictors[lr.n_cat_predictors - 1] = 0;
+           }
+         while (parse_design_interaction (lexer, lr.dict, 
+                                          lr.cat_predictors + lr.n_cat_predictors - 1));
+         lr.n_cat_predictors--;
+       }
        else if (lex_match_id (lexer, "PRINT"))
         {
           lex_match (lexer, T_EQUALS);
@@ -775,8 +899,41 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds)
         }
      }
  
+  /* Copy the predictor variables from the temporary location into the 
+     final one, dropping any categorical variables which appear there.
+     FIXME: This is O(NxM).
+  */
+  for (v = x = 0; v < n_pred_vars; ++v)
+    {
+      bool drop = false;
+      const struct variable *var = pred_vars[v];
+      int cv = 0;
+      for (cv = 0; cv < lr.n_cat_predictors ; ++cv)
+       {
+         int iv;
+         const struct interaction *iact = lr.cat_predictors[cv];
+         for (iv = 0 ; iv < iact->n_vars ; ++iv)
+           {
+             if (var == iact->vars[iv])
+               {
+                 drop = true;
+                 goto dropped;
+               }
+           }
+       }
+
+    dropped:
+      if (drop)
+       continue;
+
+      lr.predictor_vars = xrealloc (lr.predictor_vars, sizeof *lr.predictor_vars * (x + 1));
+      lr.predictor_vars[x++] = var;
+      lr.n_predictor_vars++;
+    }
+  free (pred_vars);
  
  
+  /* Run logistic regression for each split group */
    {
      struct casegrouper *grouper;
      struct casereader *group;
@@ -790,11 +947,13 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds)
    }
  
    free (lr.predictor_vars);
+  free (lr.cat_predictors);
    return CMD_SUCCESS;
  
   error:
  
    free (lr.predictor_vars);
+  free (lr.cat_predictors);
    return CMD_FAILURE;
  }
  
@@ -851,19 +1010,20 @@ output_depvarmap (const struct lr_spec *cmd, const struct lr_result *res)
  /* Show the Variables in the Equation box */
  static void
  output_variables (const struct lr_spec *cmd, 
-                 const gsl_vector *beta, 
-                 const gsl_vector *se)
+                 const struct lr_result *res,
+                 const gsl_vector *beta)
  {
    int row = 0;
    const int heading_columns = 1;
    int heading_rows = 1;
    struct tab_table *t;
  
-  int idx;
-  int n_rows = cmd->n_predictor_vars;
-
    int nc = 8;
    int nr ;
+  int i = 0;
+  int ivar = 0;
+  int idx_correction = 0;
+
    if (cmd->print & PRINT_CI)
      {
        nc += 2;
@@ -874,6 +1034,9 @@ output_variables (const struct lr_spec *cmd,
    if (cmd->constant)
      nr++;
  
+  if (res->cats)
+    nr += categoricals_df_total (res->cats) + cmd->n_cat_predictors;
+
    t = tab_create (nc, nr);
    tab_title (t, _("Variables in the Equation"));
  
@@ -902,45 +1065,103 @@ output_variables (const struct lr_spec *cmd,
        tab_text (t,  9, row, TAB_CENTER | TAT_TITLE, _("Upper"));
      }
   
-  if (cmd->constant)
-    n_rows++;
-
-  for (idx = 0 ; idx < n_rows; ++idx)
+  for (row = heading_rows ; row < nr; ++row)
      {
-      const int r = idx + heading_rows;
+      const int idx = row - heading_rows - idx_correction;
  
        const double b = gsl_vector_get (beta, idx);
-      const double sigma = gsl_vector_get (se, idx);
-      const double wald = pow2 (b / sigma);
+      const double sigma2 = gsl_matrix_get (res->hessian, idx, idx);
+      const double wald = pow2 (b) / sigma2;
        const double df = 1;
  
        if (idx < cmd->n_predictor_vars)
-       tab_text (t, 1, r, TAB_LEFT | TAT_TITLE, 
-                 var_to_string (cmd->predictor_vars[idx]));
+       {
+         tab_text (t, 1, row, TAB_LEFT | TAT_TITLE, 
+                   var_to_string (cmd->predictor_vars[idx]));
+       }
+      else if (i < cmd->n_cat_predictors)
+       {
+         double wald;
+         bool summary = false;
+         struct string str;
+         const struct interaction *cat_predictors = cmd->cat_predictors[i];
+         const int df = categoricals_df (res->cats, i);
+
+         ds_init_empty (&str);
+         interaction_to_string (cat_predictors, &str);
+
+         if (ivar == 0)
+           {
+             /* Calculate the Wald statistic,
+                which is \beta' C^-1 \beta .
+                where \beta is the vector of the coefficient estimates comprising this
+                categorical variable, and C is the corresponding submatrix of the
+                hessian matrix.
+             */
+             gsl_matrix_const_view mv =
+               gsl_matrix_const_submatrix (res->hessian, idx, idx, df, df);
+             gsl_matrix *subhessian = gsl_matrix_alloc (mv.matrix.size1, mv.matrix.size2);
+             gsl_vector_const_view vv = gsl_vector_const_subvector (beta, idx, df);
+             gsl_vector *temp = gsl_vector_alloc (df);
+
+             gsl_matrix_memcpy (subhessian, &mv.matrix);
+             gsl_linalg_cholesky_decomp (subhessian);
+             gsl_linalg_cholesky_invert (subhessian);
+
+             gsl_blas_dgemv (CblasTrans, 1.0, subhessian, &vv.vector, 0, temp);
+             gsl_blas_ddot (temp, &vv.vector, &wald);
+
+             tab_double (t, 4, row, 0, wald, 0);
+             tab_double (t, 5, row, 0, df, &F_8_0);
+             tab_double (t, 6, row, 0, gsl_cdf_chisq_Q (wald, df), 0);
+
+             idx_correction ++;
+             summary = true;
+             gsl_matrix_free (subhessian);
+             gsl_vector_free (temp);
+           }
+         else
+           {
+             ds_put_format (&str, "(%d)", ivar);
+           }
+
+         tab_text (t, 1, row, TAB_LEFT | TAT_TITLE, ds_cstr (&str));
+         if (ivar++ == df)
+           {
+             ++i; /* next interaction */
+             ivar = 0;
+           }
+
+         ds_destroy (&str);
+
+         if (summary)
+           continue;
+       }
+      else
+       {
+         tab_text (t, 1, row, TAB_LEFT | TAT_TITLE, _("Constant"));
+       }
  
-      tab_double (t, 2, r, 0, b, 0);
-      tab_double (t, 3, r, 0, sigma, 0);
-      tab_double (t, 4, r, 0, wald, 0);
-      tab_double (t, 5, r, 0, df, &F_8_0);
-      tab_double (t, 6, r, 0,  gsl_cdf_chisq_Q (wald, df), 0);
-      tab_double (t, 7, r, 0, exp (b), 0);
+      tab_double (t, 2, row, 0, b, 0);
+      tab_double (t, 3, row, 0, sqrt (sigma2), 0);
+      tab_double (t, 4, row, 0, wald, 0);
+      tab_double (t, 5, row, 0, df, &F_8_0);
+      tab_double (t, 6, row, 0, gsl_cdf_chisq_Q (wald, df), 0);
+      tab_double (t, 7, row, 0, exp (b), 0);
  
        if (cmd->print & PRINT_CI)
         {
           double wc = gsl_cdf_ugaussian_Pinv (0.5 + cmd->confidence / 200.0);
-         wc *= sigma;
+         wc *= sqrt (sigma2);
  
           if (idx < cmd->n_predictor_vars)
             {
-             tab_double (t, 8, r, 0, exp (b - wc), 0);
-             tab_double (t, 9, r, 0, exp (b + wc), 0);
+             tab_double (t, 8, row, 0, exp (b - wc), 0);
+             tab_double (t, 9, row, 0, exp (b + wc), 0);
             }
         }
      }
  
-  if ( cmd->constant)
-    tab_text (t, 1, nr - 1, TAB_LEFT | TAT_TITLE, _("Constant"));
-
    tab_submit (t);
  }
  
@@ -1028,3 +1249,107 @@ case_processing_summary (const struct lr_result *res)
    tab_submit (t);
  }
  
+static void
+output_categories (const struct lr_spec *cmd, const struct lr_result *res)
+{
+  const struct fmt_spec *wfmt =
+    cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0;
+
+  int cumulative_df;
+  int i = 0;
+  const int heading_columns = 2;
+  const int heading_rows = 2;
+  struct tab_table *t;
+
+  int nc ;
+  int nr ;
+
+  int v;
+  int r = 0;
+
+  int max_df = 0;
+  int total_cats = 0;
+  for (i = 0; i < cmd->n_cat_predictors; ++i)
+    {
+      size_t n = categoricals_n_count (res->cats, i);
+      size_t df = categoricals_df (res->cats, i);
+      if (max_df < df)
+       max_df = df;
+      total_cats += n;
+    }
+
+  nc = heading_columns + 1 + max_df;
+  nr = heading_rows + total_cats;
+
+  t = tab_create (nc, nr);
+  tab_title (t, _("Categorical Variables' Codings"));
+
+  tab_headers (t, heading_columns, 0, heading_rows, 0);
+
+  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
+
+  tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
+  tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
+
+
+  tab_text (t, heading_columns, 1, TAB_CENTER | TAT_TITLE, _("Frequency"));
+
+  tab_joint_text_format (t, heading_columns + 1, 0, nc - 1, 0,
+                        TAB_CENTER | TAT_TITLE, _("Parameter coding"));
+
+
+  for (i = 0; i < max_df; ++i)
+    {
+      int c = heading_columns + 1 + i;
+      tab_text_format (t,  c, 1, TAB_CENTER | TAT_TITLE, _("(%d)"), i + 1);
+    }
+
+  cumulative_df = 0;
+  for (v = 0; v < cmd->n_cat_predictors; ++v)
+    {
+      int cat;
+      const struct interaction *cat_predictors = cmd->cat_predictors[v];
+      int df =  categoricals_df (res->cats, v);
+      struct string str;
+      ds_init_empty (&str);
+
+      interaction_to_string (cat_predictors, &str);
+
+      tab_text (t, 0, heading_rows + r, TAB_LEFT | TAT_TITLE, ds_cstr (&str) );
+
+      ds_destroy (&str);
+
+      for (cat = 0; cat < categoricals_n_count (res->cats, v) ; ++cat)
+       {
+         struct string str;
+         const struct ccase *c = categoricals_get_case_by_category_real (res->cats, v, cat);
+         const double *freq = categoricals_get_user_data_by_category_real (res->cats, v, cat);
+         
+         int x;
+         ds_init_empty (&str);
+
+         for (x = 0; x < cat_predictors->n_vars; ++x)
+           {
+             const union value *val = case_data (c, cat_predictors->vars[x]);
+             var_append_value_name (cat_predictors->vars[x], val, &str);
+
+             if (x < cat_predictors->n_vars - 1)
+               ds_put_cstr (&str, " ");
+           }
+         
+         tab_text   (t, 1, heading_rows + r, 0, ds_cstr (&str));
+         ds_destroy (&str);
+                 tab_double (t, 2, heading_rows + r, 0, *freq, wfmt);
+
+         for (x = 0; x < df; ++x)
+           {
+             tab_double (t, heading_columns + 1 + x, heading_rows + r, 0, (cat == x), &F_8_0);
+           }
+         ++r;
+       }
+      cumulative_df += df;
+    }
+
+  tab_submit (t);
+
+}
diff --git a/src/math/categoricals.c b/src/math/categoricals.c

index 71ffe20ad7ee66ec646f1684bd524e44f5c043ca..d0247f6d38a453cbbf7220e04f11f879a5140d65 100644 (file)
--- a/src/math/categoricals.c
+++ b/src/math/categoricals.c
@@ -76,15 +76,6 @@ struct variable_node
  
    struct hmap valmap;         /* A map of value nodes */
    int n_vals;                 /* Number of values for this variable */
-
-  int *indirection;           /* An array (of size n_vals) of integers, which serve to
-                                permute the index members of the values in valmap.
-                                
-                                Doing this, means that categories are considered in the order
-                                of their values.  Mathematically the order is irrelevant.
-                                However certain procedures (eg logistic regression)  want to report
-                                statisitics for particular categories */
-
  };
  
  
@@ -412,7 +403,8 @@ categoricals_update (struct categoricals *cat, const struct ccase *c)
        if (valn == NULL)
         {
           valn = pool_malloc (cat->pool, sizeof *valn);
-         valn->index = vn->n_vals++;
+         valn->index = -1; 
+         vn->n_vals++;
           value_init (&valn->val, width);
           value_copy (&valn->val, val, width);
           hmap_insert (&vn->valmap, &valn->node, hash);
@@ -553,15 +545,14 @@ categoricals_done (const struct categoricals *cat_)
               return;
             }
  
-         vn->indirection = pool_calloc (cat->pool, vn->n_vals, sizeof *vn->indirection);
-
           /* Sort the VALMAP here */
           array = xcalloc (sizeof *array, vn->n_vals);
+         x = 0;
           HMAP_FOR_EACH (valnd, struct value_node, node, &vn->valmap)
             {
               /* Note: This loop is probably superfluous, it could be done in the 
                update stage (at the expense of a realloc) */
-             array[valnd->index] = valnd;
+             array[x++] = valnd;
             }
  
           sort (array, vn->n_vals, sizeof (*array), 
@@ -570,7 +561,7 @@ categoricals_done (const struct categoricals *cat_)
           for (x = 0; x <  vn->n_vals; ++x)
             {
               struct value_node *vvv = array[x];
-             vn->indirection[vn->n_vals - x - 1] = vvv->index;
+             vvv->index = x;
             }
           free (array);
  
@@ -753,9 +744,9 @@ categoricals_get_code_for_case (const struct categoricals *cat, int subscript,
        const int index = ((subscript - base_index) % iap->df_prod[v] ) / dfp;
        dfp = iap->df_prod [v];
  
-      if (effects_coding && vn->indirection [valn->index] == df )
+      if (effects_coding && valn->index == df )
         bin = -1.0;
-      else if ( vn->indirection [valn->index] != index )
+      else if ( valn->index != index )
         bin = 0;
      
        result *= bin;
diff --git a/tests/language/stats/logistic.at b/tests/language/stats/logistic.at

index 8903c2069df64acfa895d764a0a6be23756b39a4..7db121338b9c2ac5741b3f892ad38af2d4572610 100644 (file)
--- a/tests/language/stats/logistic.at
+++ b/tests/language/stats/logistic.at
@@ -1,3 +1,4 @@
+
  AT_BANNER([LOGISTIC REGRESSION])
  
  dnl These examples are adapted from
@@ -288,4 +289,723 @@ AT_CHECK([pspp -O format=csv non-dich.sps], [1],
  error: Dependent variable's values are not dichotomous.
  ])
  
-AT_CLEANUP
-\ No newline at end of file
+AT_CLEANUP
+
+
+
+dnl An example to check the behaviour of LOGISTIC REGRESSION with a categorical
+dnl variable.  This example was inspired by that at:
+dnl http://www.ats.ucla.edu/stat/spss/dae/logit.htm 
+AT_SETUP([LOGISTIC REGRESSION with categorical])
+
+AT_DATA([lr-cat.data], [dnl
+ 620 3.07 2 4 
+ 800 4.00 3 9 
+ 580 3.40 2 4 
+ 600 3.13 2 4 
+ 540 2.70 2 4 
+ 660 3.31 4 4 
+ 480 3.58 1 9 
+ 620 4.00 1 9 
+ 680 3.98 2 9 
+ 580 3.40 4 4 
+ 760 3.35 3 4 
+ 700 3.72 2 4 
+ 460 3.64 1 9 
+ 540 3.28 3 4 
+ 680 3.48 3 4 
+ 740 3.31 1 4 
+ 460 3.77 3 4 
+ 740 3.54 1 4 
+ 600 3.63 3 4 
+ 620 3.05 2 4 
+ 560 3.04 3 4 
+ 520 2.70 3 4 
+ 640 3.35 3 4 
+ 620 3.58 2 4 
+ 660 3.70 4 9 
+ 500 2.86 4 4 
+ 640 3.50 2 4 
+ 720 4.00 3 4 
+ 720 3.94 3 4 
+ 400 3.65 2 4 
+ 800 2.90 2 4 
+ 520 2.90 3 4 
+ 440 3.24 4 4 
+ 580 3.51 2 4 
+ 500 3.31 3 4 
+ 440 3.22 1 4 
+ 540 3.17 1 9 
+ 420 3.02 1 4 
+ 780 3.22 2 9 
+ 440 3.13 4 4 
+ 800 3.66 1 9 
+ 580 3.32 2 9 
+ 480 2.67 2 9 
+ 700 4.00 1 9 
+ 740 2.97 2 9 
+ 700 3.83 2 4 
+ 640 3.93 2 4 
+ 800 3.90 2 4 
+ 400 3.38 2 4 
+ 700 3.52 2 4 
+ 680 3.00 4 9 
+ 540 3.20 1 4 
+ 580 4.00 2 4 
+ 780 4.00 2 9 
+ 220 2.83 3 4 
+ 580 3.20 2 9 
+ 580 3.50 2 4 
+ 620 3.30 1 4 
+ 520 3.65 4 9 
+ 600 3.38 3 9 
+ 660 3.77 3 4 
+ 580 2.86 4 9 
+ 580 3.46 2 9 
+ 560 3.36 3 4 
+ 740 4.00 3 9 
+ 480 3.44 3 4 
+ 640 3.19 4 9 
+ 600 3.54 1 9 
+ 540 3.38 4 4 
+ 500 2.81 3 4 
+ 360 2.56 3 4 
+ 460 3.15 4 4 
+ 460 2.63 2 4 
+ 440 2.76 2 4 
+ 740 3.62 4 4 
+ 380 3.38 2 4 
+ 640 3.63 1 9 
+ 800 3.73 1 4 
+ 660 3.67 2 4 
+ 760 3.00 2 9 
+ 420 2.96 1 4 
+ 740 3.74 4 4 
+ 800 3.75 2 4 
+ 620 3.40 2 4 
+ 660 3.67 3 9 
+ 400 3.35 3 4 
+ 680 3.14 2 4 
+ 660 3.47 3 9 
+ 660 3.63 2 9 
+ 420 3.41 4 4 
+ 660 4.00 1 4 
+ 680 3.70 2 4 
+ 620 3.23 3 9 
+ 520 3.35 3 4 
+ 500 4.00 3 4 
+ 400 3.36 2 4 
+ 700 3.56 1 9 
+ 540 3.81 1 9 
+ 520 2.68 3 9 
+ 540 3.50 2 4 
+ 700 4.00 2 4 
+ 600 3.64 3 9 
+ 800 3.31 3 4 
+ 520 3.29 1 4 
+ 580 3.69 1 4 
+ 380 3.43 3 4 
+ 560 3.19 3 4 
+ 760 2.81 1 9 
+ 540 3.13 2 4 
+ 660 3.14 2 9 
+ 520 3.81 1 9 
+ 680 3.19 4 4 
+ 540 3.78 4 4 
+ 500 3.57 3 4 
+ 660 3.49 2 4 
+ 340 3.00 2 9 
+ 400 3.15 2 9 
+ 420 3.92 4 4 
+ 760 3.35 2 9 
+ 700 2.94 2 4 
+ 540 3.04 1 4 
+ 780 3.87 4 4 
+ 560 3.78 2 4 
+ 700 3.82 3 4 
+ 400 2.93 3 4 
+ 440 3.45 2 9 
+ 800 3.47 3 4 
+ 340 3.15 3 4 
+ 520 4.00 1 9 
+ 520 3.15 3 4 
+ 600 2.98 2 9 
+ 420 2.69 2 4 
+ 460 3.44 2 4 
+ 620 3.71 1 9 
+ 480 3.13 2 4 
+ 580 3.40 3 4 
+ 540 3.39 3 9 
+ 540 3.94 3 4 
+ 440 2.98 3 4 
+ 380 3.59 4 4 
+ 500 2.97 4 4 
+ 340 2.92 3 4 
+ 440 3.15 2 4 
+ 600 3.48 2 4 
+ 420 2.67 3 4 
+ 460 3.07 2 4 
+ 460 3.45 3 9 
+ 480 3.39 4 4 
+ 480 2.78 3 4 
+ 720 3.42 2 9 
+ 680 3.67 2 9 
+ 800 3.89 2 4 
+ 360 3.00 3 4 
+ 620 3.17 2 9 
+ 700 3.52 4 9 
+ 540 3.19 2 4 
+ 580 3.30 2 4 
+ 800 4.00 3 9 
+ 660 3.33 2 4 
+ 380 3.34 3 4 
+ 720 3.84 3 4 
+ 600 3.59 2 4 
+ 500 3.03 3 4 
+ 640 3.81 2 4 
+ 540 3.49 1 9 
+ 680 3.85 3 9 
+ 540 3.84 2 9 
+ 460 2.93 3 4 
+ 380 2.94 3 4 
+ 620 3.22 2 4 
+ 740 3.37 4 4 
+ 620 4.00 2 4 
+ 800 3.74 1 9 
+ 400 3.31 3 4 
+ 540 3.46 4 4 
+ 620 3.18 2 9 
+ 480 2.91 1 9 
+ 300 2.84 2 9 
+ 440 2.48 4 4 
+ 640 2.79 2 4 
+ 400 3.23 4 9 
+ 680 3.46 2 9 
+ 620 3.37 1 9 
+ 700 3.92 2 4 
+ 620 3.37 2 9 
+ 620 3.63 2 4 
+ 620 3.95 3 9 
+ 560 2.52 2 4 
+ 520 2.62 2 4 
+ 600 3.35 2 4 
+ 700 4.00 1 4 
+ 640 3.67 3 4 
+ 640 4.00 3 4 
+ 520 2.93 4 4 
+ 620 3.21 4 4 
+ 680 3.99 3 4 
+ 660 3.34 3 4 
+ 700 3.45 3 4 
+ 560 3.36 1 9 
+ 800 2.78 2 4 
+ 500 3.88 4 4 
+ 700 3.65 2 4 
+ 680 3.76 3 9 
+ 660 3.07 3 4 
+ 580 3.46 4 4 
+ 460 2.87 2 4 
+ 600 3.31 4 4 
+ 620 3.94 4 4 
+ 400 3.05 2 4 
+ 800 3.43 2 9 
+ 600 3.58 1 9 
+ 580 3.36 2 4 
+ 540 3.16 3 4 
+ 500 2.71 2 4 
+ 600 3.28 3 4 
+ 600 2.82 4 4 
+ 460 3.58 2 4 
+ 520 2.85 3 4 
+ 740 3.52 4 9 
+ 500 3.95 4 4 
+ 560 3.61 3 4 
+ 620 3.45 2 9 
+ 640 3.51 2 4 
+ 660 3.44 2 9 
+ 660 2.91 3 9 
+ 540 3.28 1 4 
+ 560 2.98 1 9 
+ 800 3.97 1 4 
+ 720 3.77 3 4 
+ 720 3.64 1 9 
+ 480 3.71 4 9 
+ 680 3.34 2 4 
+ 680 3.11 2 4 
+ 540 2.81 3 4 
+ 620 3.75 2 9 
+ 540 3.12 1 4 
+ 560 3.48 2 9 
+ 720 3.40 3 4 
+ 680 3.90 1 4 
+ 640 3.76 3 4 
+ 560 3.16 1 4 
+ 520 3.30 2 9 
+ 640 3.12 3 4 
+ 580 3.57 3 4 
+ 540 3.55 4 9 
+ 780 3.63 4 9 
+ 600 3.89 1 9 
+ 800 4.00 1 9 
+ 580 3.29 4 4 
+ 360 3.27 3 4 
+ 800 4.00 2 9 
+ 640 3.52 4 4 
+ 720 3.45 4 4 
+ 580 3.06 2 4 
+ 580 3.02 2 4 
+ 500 3.60 3 9 
+ 580 3.12 3 9 
+ 600 2.82 4 4 
+ 620 3.99 3 4 
+ 700 4.00 3 4 
+ 480 4.00 2 4 
+ 560 2.95 2 4 
+ 560 4.00 3 4 
+ 560 2.65 3 9 
+ 400 3.08 2 4 
+ 480 2.62 2 9 
+ 640 3.86 3 4 
+ 480 3.57 2 4 
+ 540 3.51 2 4 
+ 380 3.33 4 4 
+ 680 3.64 3 4 
+ 400 3.51 3 4 
+ 340 2.90 1 4 
+ 700 3.08 2 4 
+ 480 3.02 1 9 
+ 600 3.15 2 9 
+ 780 3.80 3 9 
+ 520 3.74 2 9 
+ 520 3.51 2 4 
+ 640 3.73 3 4 
+ 560 3.32 4 4 
+ 620 2.85 2 4 
+ 700 3.28 1 4 
+ 760 4.00 1 9 
+ 800 3.60 2 4 
+ 580 3.34 2 4 
+ 540 3.77 2 9 
+ 640 3.17 2 4 
+ 540 3.02 4 4 
+ 680 3.08 4 4 
+ 680 3.31 2 4 
+ 680 2.96 3 9 
+ 700 2.88 2 4 
+ 580 3.77 4 4 
+ 540 3.49 2 9 
+ 700 3.56 2 9 
+ 600 3.56 2 9 
+ 560 3.59 2 4 
+ 640 2.94 2 9 
+ 560 3.33 4 4 
+ 620 3.69 3 4 
+ 680 3.27 2 9 
+ 460 3.14 3 4 
+ 500 3.53 4 4 
+ 620 3.33 3 4 
+ 600 3.62 3 4 
+ 500 3.01 4 4 
+ 740 3.34 4 4 
+ 560 3.69 3 9 
+ 620 3.95 3 9 
+ 740 3.86 2 9 
+ 800 3.53 1 9 
+ 620 3.78 3 4 
+ 700 3.27 2 4 
+ 540 3.78 2 9 
+ 700 3.65 2 4 
+ 800 3.22 1 9 
+ 560 3.59 2 9 
+ 800 3.15 4 4 
+ 520 3.90 3 9 
+ 520 3.74 4 9 
+ 480 2.55 1 4 
+ 800 4.00 4 4 
+ 620 3.09 4 4 
+ 560 3.49 4 4 
+ 500 3.17 3 4 
+ 480 3.40 2 4 
+ 460 2.98 1 4 
+ 580 3.58 1 9 
+ 640 3.30 2 4 
+ 480 3.45 2 4 
+ 440 3.17 2 4 
+ 660 3.32 1 4 
+ 500 3.08 3 4 
+ 660 3.94 2 4 
+ 720 3.31 1 4 
+ 460 3.64 3 9 
+ 500 2.93 4 4 
+ 800 3.54 3 4 
+ 580 2.93 2 4 
+ 620 3.61 1 9 
+ 500 2.98 3 4 
+ 660 4.00 2 9 
+ 560 3.24 4 4 
+ 560 2.42 2 4 
+ 580 3.80 2 4 
+ 500 3.23 4 4 
+ 680 2.42 1 9 
+ 580 3.46 3 4 
+ 800 3.91 3 4 
+ 700 2.90 4 4 
+ 520 3.12 2 4 
+ 300 2.92 4 4 
+ 560 3.43 3 4 
+ 620 3.63 3 4 
+ 500 2.79 4 4 
+ 360 3.14 1 4 
+ 640 3.94 2 9 
+ 460 3.99 3 9 
+ 300 3.01 3 4 
+ 520 2.73 2 4 
+ 600 3.47 2 9 
+ 580 3.25 1 4 
+ 520 3.10 4 4 
+ 620 3.43 3 4 
+ 380 2.91 4 4 
+ 660 3.59 3 4 
+ 660 3.95 2 9 
+ 540 3.33 3 4 
+ 740 4.00 3 4 
+ 640 3.38 3 4 
+ 600 3.89 3 4 
+ 720 3.88 3 4 
+ 580 4.00 3 4 
+ 420 2.26 4 4 
+ 520 4.00 2 9 
+ 800 3.70 1 9 
+ 700 4.00 1 9 
+ 480 3.43 2 4 
+ 660 3.45 4 4 
+ 520 3.25 3 4 
+ 560 2.71 3 4 
+ 600 3.32 2 4 
+ 580 2.88 2 4 
+ 660 3.88 2 9 
+ 600 3.22 1 4 
+ 580 4.00 1 4 
+ 660 3.60 3 9 
+ 500 3.35 2 4 
+ 520 2.98 2 4 
+ 660 3.49 2 9 
+ 560 3.07 2 4 
+ 500 3.13 2 9 
+ 720 3.50 3 9 
+ 440 3.39 2 9 
+ 640 3.95 2 9 
+ 380 3.61 3 4 
+ 800 3.05 2 9 
+ 520 3.19 3 9 
+ 600 3.40 3 4 
+])
+
+AT_DATA([lr-cat.sps], [dnl
+set format=F20.3.
+
+data list notable list file='lr-cat.data' /b1 b2 bcat y.
+
+logistic regression
+         y with b1 b2 bcat
+          /categorical = bcat
+          .
+])
+
+AT_CHECK([pspp -O format=csv lr-cat.sps], [0],
+ [dnl
+Table: Dependent Variable Encoding
+Original Value,Internal Value
+4.000,0
+9.000,1
+
+Table: Case Processing Summary
+Unweighted Cases,N,Percent
+Included in Analysis,400,100.000
+Missing Cases,0,.000
+Total,400,100.000
+
+note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
+
+Table: Model Summary
+Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
+,458.517,.098,.138
+
+Table: Categorical Variables' Codings
+,,,Parameter coding,,
+,,Frequency,(1),(2),(3)
+bcat,1.000,61,1,0,0
+,2.000,151,0,1,0
+,3.000,121,0,0,1
+,4.000,67,0,0,0
+
+Table: Variables in the Equation
+,,B,S.E.,Wald,df,Sig.,Exp(B)
+Step 1,b1,.002,.001,4.284,1,.038,1.002
+,b2,.804,.332,5.872,1,.015,2.235
+,bcat,,,20.895,3,.000,
+,bcat(1),1.551,.418,13.788,1,.000,4.718
+,bcat(2),.876,.367,5.706,1,.017,2.401
+,bcat(3),.211,.393,.289,1,.591,1.235
+,Constant,-5.541,1.138,23.709,1,.000,.004
+])
+
+AT_CLEANUP
+
+
+
+dnl  This example is inspired by http://www.ats.ucla.edu/stat/spss/output/logistic.htm
+AT_SETUP([LOGISTIC REGRESSION with cat var 2])
+
+AT_DATA([lr-cat2.data], [dnl
+     60.00     1.00      8.00     50.00 
+     47.00      .00      9.00     42.00 
+     57.00     1.00      7.00     53.00 
+     60.00      .00      8.00     53.00 
+     68.00      .00      8.00     66.00 
+     63.00      .00      8.00     55.00 
+     65.00      .00      8.00     63.00 
+     52.00      .00      8.00     61.00 
+     34.00      .00      9.00     42.00 
+     37.00      .00      8.00     39.00 
+     68.00     1.00      9.00     69.00 
+     60.00      .00      9.00     61.00 
+     44.00      .00      9.00     58.00 
+     42.00      .00      8.00     47.00 
+     57.00     1.00      7.00     61.00 
+     55.00     1.00      8.00     50.00 
+     55.00      .00      9.00     58.00 
+     44.00      .00      8.00     63.00 
+     50.00     1.00      9.00     66.00 
+     44.00      .00      8.00     39.00 
+     55.00      .00      8.00     58.00 
+     44.00      .00      8.00     50.00 
+     47.00     1.00      7.00     34.00 
+     48.00      .00      8.00     44.00 
+     45.00      .00      7.00     31.00 
+     43.00      .00      8.00     50.00 
+     39.00      .00      8.00     42.00 
+     63.00      .00      9.00     50.00 
+     47.00      .00      8.00     58.00 
+     42.00      .00      7.00     50.00 
+     50.00      .00      9.00     36.00 
+     47.00      .00      7.00     33.00 
+     60.00      .00      9.00     61.00 
+     47.00      .00      7.00     42.00 
+     68.00     1.00      9.00     69.00 
+     52.00      .00      8.00     54.00 
+     63.00     1.00      9.00     61.00 
+     65.00     1.00      9.00     61.00 
+     63.00     1.00      9.00     53.00 
+     57.00      .00      8.00     51.00 
+     34.00      .00      8.00     36.00 
+     50.00      .00      8.00     39.00 
+     52.00     1.00      7.00     56.00 
+     45.00      .00      7.00     34.00 
+     47.00     1.00      7.00     53.00 
+     34.00      .00      7.00     39.00 
+     50.00     1.00      8.00     55.00 
+     60.00      .00      9.00     58.00 
+     63.00      .00      8.00     58.00 
+     35.00      .00      7.00     51.00 
+     50.00      .00      8.00     58.00 
+     68.00      .00      8.00     63.00 
+     41.00      .00      9.00     34.00 
+     47.00      .00      8.00     47.00 
+     76.00      .00      9.00     64.00 
+     44.00      .00      8.00     44.00 
+     36.00      .00      9.00     50.00 
+     68.00     1.00      9.00     55.00 
+     47.00     1.00      8.00     50.00 
+     50.00      .00      7.00     53.00 
+     68.00      .00      8.00     74.00 
+     39.00      .00      7.00     44.00 
+     50.00      .00      8.00     55.00 
+     52.00      .00      9.00     61.00 
+     47.00      .00      8.00     53.00 
+     39.00      .00      7.00     47.00 
+     55.00     1.00      9.00     49.00 
+     68.00     1.00      8.00     50.00 
+     52.00     1.00      9.00     63.00 
+     55.00      .00      8.00     58.00 
+     57.00      .00      8.00     55.00 
+     66.00     1.00      9.00     61.00 
+     65.00     1.00      7.00     58.00 
+     42.00      .00      7.00     42.00 
+     68.00     1.00      7.00     59.00 
+     60.00     1.00      9.00     61.00 
+     52.00      .00      8.00     55.00 
+     57.00     1.00      7.00     54.00 
+     42.00      .00      9.00     50.00 
+     42.00      .00      8.00     47.00 
+     57.00      .00      8.00     50.00 
+     47.00      .00      7.00     45.00 
+     44.00      .00      7.00     40.00 
+     43.00      .00      9.00     55.00 
+     31.00      .00      8.00     39.00 
+     37.00      .00      7.00     33.00 
+     63.00     1.00      7.00     63.00 
+     47.00      .00      8.00     39.00 
+     57.00     1.00      8.00     63.00 
+     52.00      .00      8.00     44.00 
+     44.00      .00      7.00     35.00 
+     52.00      .00      7.00     55.00 
+     55.00      .00      7.00     69.00 
+     52.00      .00      8.00     53.00 
+     55.00      .00      9.00     61.00 
+     65.00     1.00      9.00     63.00 
+     55.00      .00      8.00     44.00 
+     63.00      .00      7.00     65.00 
+     44.00      .00      7.00     39.00 
+     47.00      .00      7.00     36.00 
+     63.00     1.00      9.00     55.00 
+     68.00      .00      8.00     66.00 
+     34.00      .00      8.00     39.00 
+     47.00      .00      9.00     50.00 
+     50.00      .00      9.00     58.00 
+     63.00      .00      8.00     66.00 
+     44.00      .00      7.00     34.00 
+     44.00      .00      8.00     50.00 
+     50.00      .00      8.00     53.00 
+     47.00     1.00      9.00     69.00 
+     65.00      .00      9.00     58.00 
+     57.00      .00      8.00     47.00 
+     39.00      .00      8.00     39.00 
+     47.00      .00      8.00     53.00 
+     50.00     1.00      7.00     63.00 
+     50.00      .00      8.00     50.00 
+     63.00      .00      9.00     53.00 
+     73.00     1.00      9.00     61.00 
+     44.00      .00      7.00     47.00 
+     47.00      .00      8.00     42.00 
+     47.00      .00      8.00     58.00 
+     36.00      .00      7.00     61.00 
+     57.00     1.00      8.00     55.00 
+     53.00     1.00      8.00     57.00 
+     63.00      .00      7.00     66.00 
+     50.00      .00      8.00     34.00 
+     47.00      .00      9.00     48.00 
+     57.00     1.00      8.00     58.00 
+     39.00      .00      8.00     53.00 
+     42.00      .00      8.00     42.00 
+     42.00      .00      9.00     31.00 
+     42.00      .00      8.00     72.00 
+     46.00      .00      8.00     44.00 
+     55.00      .00      8.00     42.00 
+     42.00      .00      8.00     47.00 
+     50.00      .00      8.00     44.00 
+     44.00      .00      9.00     39.00 
+     73.00     1.00      8.00     69.00 
+     71.00     1.00      9.00     58.00 
+     50.00      .00      9.00     49.00 
+     63.00     1.00      7.00     54.00 
+     42.00      .00      8.00     36.00 
+     47.00      .00      7.00     42.00 
+     39.00      .00      9.00     26.00 
+     63.00      .00      8.00     58.00 
+     50.00      .00      8.00     55.00 
+     65.00     1.00      8.00     55.00 
+     76.00     1.00      9.00     67.00 
+     71.00     1.00      8.00     66.00 
+     39.00      .00      9.00     47.00 
+     47.00     1.00      9.00     63.00 
+     60.00      .00      7.00     50.00 
+     63.00      .00      9.00     55.00 
+     54.00     1.00      9.00     55.00 
+     55.00     1.00      8.00     58.00 
+     57.00      .00      8.00     61.00 
+     55.00     1.00      9.00     63.00 
+     42.00      .00      7.00     50.00 
+     50.00      .00      8.00     44.00 
+     55.00      .00      8.00     42.00 
+     42.00      .00      7.00     50.00 
+     34.00      .00      8.00     39.00 
+     65.00      .00      9.00     46.00 
+     52.00      .00      7.00     58.00 
+     44.00      .00      8.00     39.00 
+     65.00     1.00      9.00     66.00 
+     47.00      .00      8.00     42.00 
+     41.00      .00      7.00     39.00 
+     68.00      .00      9.00     63.00 
+     63.00     1.00      8.00     72.00 
+     52.00      .00      8.00     53.00 
+     57.00      .00      8.00     50.00 
+     68.00      .00      8.00     55.00 
+     42.00      .00      8.00     56.00 
+     47.00      .00      8.00     48.00 
+     73.00     1.00      9.00     58.00 
+     39.00      .00      8.00     50.00 
+     63.00     1.00      9.00     69.00 
+     60.00      .00      8.00     55.00 
+     65.00     1.00      9.00     66.00 
+     73.00     1.00      8.00     63.00 
+     52.00      .00      8.00     55.00 
+     36.00      .00      8.00     42.00 
+     28.00      .00      7.00     44.00 
+     47.00      .00      8.00     44.00 
+     57.00      .00      7.00     47.00 
+     34.00      .00      7.00     29.00 
+     47.00      .00      9.00     66.00 
+     57.00      .00      8.00     58.00 
+     60.00     1.00      9.00     50.00 
+     50.00      .00      9.00     47.00 
+     73.00     1.00      9.00     55.00 
+     52.00     1.00      8.00     47.00 
+     55.00      .00      8.00     53.00 
+     47.00      .00      8.00     53.00 
+     50.00      .00      8.00     61.00 
+     61.00      .00      7.00     44.00 
+     52.00      .00      9.00     53.00 
+     47.00      .00      7.00     40.00 
+     47.00      .00      7.00     50.00 
+])
+
+AT_DATA([stringcat.sps], [dnl
+set format=F20.3.
+data list notable file='lr-cat2.data' list /read honcomp wiz science *.
+
+string ses(a1).
+recode wiz (7 = "a") (8 = "b") (9 = "c") into ses.
+
+logistic regression honcomp with read science ses
+        /categorical = ses.
+
+])
+
+AT_CHECK([pspp -O format=csv stringcat.sps], [0],
+ [dnl
+Table: Dependent Variable Encoding
+Original Value,Internal Value
+.000,0
+1.000,1
+
+Table: Case Processing Summary
+Unweighted Cases,N,Percent
+Included in Analysis,200,100.000
+Missing Cases,0,.000
+Total,200,100.000
+
+note: Estimation terminated at iteration number 5 because parameter estimates changed by less than 0.001
+
+Table: Model Summary
+Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
+,165.701,.280,.408
+
+Table: Categorical Variables' Codings
+,,,Parameter coding,
+,,Frequency,(1),(2)
+ses,a,47,1,0
+,b,95,0,1
+,c,58,0,0
+
+Table: Variables in the Equation
+,,B,S.E.,Wald,df,Sig.,Exp(B)
+Step 1,read,.098,.025,15.199,1,.000,1.103
+,science,.066,.027,5.867,1,.015,1.068
+,ses,,,6.690,2,.035,
+,ses(1),.058,.532,.012,1,.913,1.060
+,ses(2),-1.013,.444,5.212,1,.022,.363
+,Constant,-9.561,1.662,33.113,1,.000,.000
+])
+
+AT_CLEANUP
author	John Darrington <john@darrington.wattle.id.au>
	Sun, 11 Nov 2012 13:19:05 +0000 (14:19 +0100)
committer	John Darrington <john@darrington.wattle.id.au>
	Sun, 11 Nov 2012 13:19:05 +0000 (14:19 +0100)
doc/statistics.texi		patch \| blob \| history
src/language/stats/logistic.c		patch \| blob \| history
src/math/categoricals.c		patch \| blob \| history
tests/language/stats/logistic.at		patch \| blob \| history