#include "libpspp/misc.h"
#include "math/categoricals.h"
#include "math/interaction.h"
+#include "libpspp/hmap.h"
+#include "libpspp/hash-functions.h"
#include "output/tab.h"
struct interaction **cat_predictors;
size_t n_cat_predictors;
+
+ /* The union of the categorical and non-categorical variables */
+ const struct variable **indep_vars;
+ size_t n_indep_vars;
+
+
/* Which classes of missing vars are to be excluded */
enum mv_class exclude;
double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
const union value *depval = case_data (c, cmd->dep_var);
- for (v = 0; v < cmd->n_predictor_vars; ++v)
+ for (v = 0; v < cmd->n_indep_vars; ++v)
{
- const union value *val = case_data (c, cmd->predictor_vars[v]);
- if (var_is_value_missing (cmd->predictor_vars[v], val, cmd->exclude))
+ const union value *val = case_data (c, cmd->indep_vars[v]);
+ if (var_is_value_missing (cmd->indep_vars[v], val, cmd->exclude))
{
missing = true;
break;
input = casereader_create_filter_missing (input,
- cmd->predictor_vars,
- cmd->n_predictor_vars,
+ cmd->indep_vars,
+ cmd->n_indep_vars,
cmd->exclude,
NULL,
NULL);
return true;
}
+struct variable_node
+{
+ struct hmap_node node; /* Node in hash map. */
+ const struct variable *var; /* The variable */
+};
+
+static struct variable_node *
+lookup_variable (const struct hmap *map, const struct variable *var, unsigned int hash)
+{
+ struct variable_node *vn = NULL;
+ HMAP_FOR_EACH_WITH_HASH (vn, struct variable_node, node, hash, map)
+ {
+ if (vn->var == var)
+ break;
+
+ fprintf (stderr, "Warning: Hash table collision\n");
+ }
+
+ return vn;
+}
+
+
/* Parse the LOGISTIC REGRESSION command syntax */
int
cmd_logistic (struct lexer *lexer, struct dataset *ds)
lr.print = PRINT_DEFAULT;
lr.cat_predictors = NULL;
lr.n_cat_predictors = 0;
-
+ lr.indep_vars = NULL;
if (lex_match_id (lexer, "VARIABLES"))
final one, dropping any categorical variables which appear there.
FIXME: This is O(NxM).
*/
+
+ {
+ struct variable_node *vn, *next;
+ struct hmap allvars;
+ hmap_init (&allvars);
for (v = x = 0; v < n_pred_vars; ++v)
{
bool drop = false;
const struct variable *var = pred_vars[v];
int cv = 0;
+
+ unsigned int hash = hash_pointer (var, 0);
+ struct variable_node *vn = lookup_variable (&allvars, var, hash);
+ if (vn == NULL)
+ {
+ vn = xmalloc (sizeof *vn);
+ vn->var = var;
+ hmap_insert (&allvars, &vn->node, hash);
+ }
+
for (cv = 0; cv < lr.n_cat_predictors ; ++cv)
{
int iv;
const struct interaction *iact = lr.cat_predictors[cv];
for (iv = 0 ; iv < iact->n_vars ; ++iv)
{
- if (var == iact->vars[iv])
+ const struct variable *ivar = iact->vars[iv];
+ unsigned int hash = hash_pointer (ivar, 0);
+ struct variable_node *vn = lookup_variable (&allvars, ivar, hash);
+ if (vn == NULL)
+ {
+ vn = xmalloc (sizeof *vn);
+ vn->var = ivar;
+
+ hmap_insert (&allvars, &vn->node, hash);
+ }
+
+ if (var == ivar)
{
drop = true;
- goto dropped;
}
}
}
- dropped:
if (drop)
continue;
}
free (pred_vars);
+ lr.n_indep_vars = hmap_count (&allvars);
+ lr.indep_vars = xmalloc (lr.n_indep_vars * sizeof *lr.indep_vars);
+
+ /* Interate over each variable and push it into the array */
+ x = 0;
+ HMAP_FOR_EACH_SAFE (vn, next, struct variable_node, node, &allvars)
+ {
+ lr.indep_vars[x++] = vn->var;
+ free (vn);
+ }
+ hmap_destroy (&allvars);
+ }
- /* Run logistical regression for each split group */
+
+ /* logistical regression for each split group */
{
struct casegrouper *grouper;
struct casereader *group;
free (lr.predictor_vars);
free (lr.cat_predictors);
+ free (lr.indep_vars);
+
return CMD_SUCCESS;
error:
free (lr.predictor_vars);
free (lr.cat_predictors);
+ free (lr.indep_vars);
+
return CMD_FAILURE;
}
AT_CHECK([pspp -O format=csv crash.sps], [1], [ignore])
AT_CLEANUP
+
+
+dnl Test that missing values on the categorical predictors are treated
+dnl properly.
+AT_SETUP([LOGISTIC REGRESSION missing categoricals])
+
+AT_DATA([data.txt], [dnl
+ .00 3.69 .00
+ .00 1.16 1.00
+ 1.00 -12.99 .00
+ .00 2.97 1.00
+ .00 20.48 .00
+ .00 4.90 .00
+ 1.00 -4.38 .00
+ .00 -1.69 1.00
+ 1.00 -5.71 .00
+ 1.00 -14.28 .00
+ .00 9.00 .00
+ .00 2.89 1.00
+ .00 13.51 1.00
+ .00 23.32 1.00
+ .00 2.31 1.00
+ .00 -2.07 1.00
+ 1.00 -4.52 1.00
+ 1.00 -5.83 .00
+ 1.00 -1.91 .00
+ 1.00 -11.12 1.00
+ .00 -1.51 .00
+ .00 6.59 1.00
+ .00 19.28 1.00
+ .00 5.94 .00
+ .00 8.21 1.00
+ .00 8.11 1.00
+ .00 2.49 .00
+ .00 9.62 .00
+ 1.00 -20.74 1.00
+ .00 -1.41 1.00
+ .00 15.15 1.00
+ .00 9.39 .00
+ 1.00 -15.14 1.00
+ 1.00 -5.86 .00
+ 1.00 -11.64 1.00
+ 1.00 -14.36 .00
+ 1.00 -8.95 1.00
+ 1.00 -16.42 1.00
+ 1.00 -1.04 1.00
+ .00 12.89 1.00
+ .00 -7.08 1.00
+ .00 4.87 1.00
+ .00 11.53 1.00
+ 1.00 -6.24 1.00
+ .00 1.25 1.00
+ .00 4.39 1.00
+ .00 3.17 .00
+ .00 19.39 1.00
+ .00 13.03 1.00
+ .00 2.43 .00
+ 1.00 -14.73 1.00
+ .00 8.25 1.00
+ 1.00 -13.28 1.00
+ .00 5.27 1.00
+ 1.00 -3.46 1.00
+ .00 13.81 1.00
+ .00 1.35 1.00
+ 1.00 -3.94 1.00
+ .00 20.73 1.00
+ 1.00 -15.40 .00
+ 1.00 -11.01 1.00
+ .00 4.56 .00
+ 1.00 -15.35 1.00
+ .00 15.21 .00
+ .00 5.34 1.00
+ 1.00 -21.55 1.00
+ .00 10.12 1.00
+ .00 -.73 1.00
+ .00 15.28 1.00
+ .00 11.08 1.00
+ 1.00 -8.24 .00
+ .00 2.46 .00
+ .00 9.60 .00
+ .00 11.24 .00
+ .00 14.13 1.00
+ .00 19.72 1.00
+ .00 5.58 .00
+ .00 26.23 1.00
+ .00 7.25 .00
+ 1.00 -.79 .00
+ .00 6.24 .00
+ 1.00 1.16 .00
+ 1.00 -7.89 1.00
+ 1.00 -1.86 1.00
+ 1.00 -10.80 1.00
+ 1.00 -5.51 .00
+ .00 7.51 .00
+ .00 11.18 .00
+ .00 8.73 .00
+ 1.00 -11.21 1.00
+ 1.00 -13.24 .00
+ .00 19.34 .00
+ .00 9.32 1.00
+ .00 17.97 1.00
+ 1.00 -1.56 1.00
+ 1.00 -3.13 .00
+ .00 3.98 .00
+ .00 -1.21 1.00
+ .00 2.37 .00
+ 1.00 -18.03 1.00
+])
+
+AT_DATA([miss.sps], [dnl
+data list notable file='data.txt' list /y x1 cat0*.
+
+logistic regression y with x1 cat0
+ /categorical = cat0.
+])
+
+AT_CHECK([pspp -O format=csv miss.sps > file1], [0], [ignore])
+
+dnl Append a case with a missing categorical.
+AT_CHECK([echo '1 34 .' >> data.txt], [0], [ignore])
+
+AT_CHECK([pspp -O format=csv miss.sps > file2], [0], [ignore])
+
+AT_CHECK([diff file1 file2], [1], [dnl
+8,10c8,10
+< Included in Analysis,100,100.00
+< Missing Cases,0,.00
+< Total,100,100.00
+---
+> Included in Analysis,100,99.01
+> Missing Cases,1,.99
+> Total,101,100.00
+])
+
+AT_CLEANUP
+