From e5b7f2cd903c84efa0e73449cbcda6d135da1a23 Mon Sep 17 00:00:00 2001
From: Jason Stover
Date: Fri, 30 Dec 2005 04:59:49 +0000
Subject: [PATCH] Added categorical variable support for model export

---
 src/regression.q        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/regression_export.h | 18 ++++++++++++++
 2 files changed, 81 insertions(+)

diff --git a/src/regression.q b/src/regression.q
index 9d0f6163..74befc2e 100644
--- a/src/regression.q
+++ b/src/regression.q
@@ -472,6 +472,68 @@ subcommand_statistics (int *keywords, pspp_linreg_cache * c)
   statistics_keyword_output (reg_stats_tol, keywords[tol], c);
   statistics_keyword_output (reg_stats_selection, keywords[selection], c);
 }
+
+/* Writes to FP the part of the exported C source that describes the
+   categorical (ALPHA) independent variables of model C: first one
+   struct pspp_reg_categorical_variable declaration per variable,
+   then assignments giving each variable's name, number of observed
+   categories and list of category values.
+
+   NOTE(review): the emitted "NAME.values = {...};" assigns a brace
+   list to a struct member, which a C compiler will reject; confirm
+   the intended form of the exported code.  */
+static void
+reg_print_categorical_encoding (FILE *fp, pspp_linreg_cache *c)
+{
+  int i;
+  size_t j;
+  size_t n_categories;
+  const struct pspp_linreg_coeff *coeff;
+  union value *val;
+
+  fprintf (fp, "%s", reg_export_categorical_encode_1);
+
+  /* First pass: declare one struct per categorical variable.  */
+  for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
+    {
+      coeff = &c->coeff[i];
+      if (coeff->v->type == ALPHA)
+        {
+          fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
+                   coeff->v->name);
+        }
+    }
+
+  /* Second pass: fill in the members of each declared struct.  */
+  for (i = 1; i < c->n_indeps; i++)
+    {
+      coeff = &c->coeff[i];
+      if (coeff->v->type != ALPHA)
+        {
+          continue;
+        }
+      n_categories = coeff->v->obs_vals->n_categories;
+      fprintf (fp, "%s.name = \"%s\";\n\t", coeff->v->name, coeff->v->name);
+      /* The cast makes the argument agree with %lu whatever integer
+         type the count has; it was previously passed to %d.  */
+      fprintf (fp, "%s.n_vals = %lu;\n\t", coeff->v->name,
+               (unsigned long) n_categories);
+      fprintf (fp, "%s.values = {", coeff->v->name);
+      /* Writing the separator before every value but the first avoids
+         n_categories - 1, which wraps when an unsigned count is zero.  */
+      for (j = 0; j < n_categories; j++)
+        {
+          if (j > 0)
+            {
+              fputs (",\n\t\t", fp);
+            }
+          val = cat_subscript_to_value (j, coeff->v);
+          fprintf (fp, "\"%s\"", val->s);
+        }
+      fputs ("};\n\n\t", fp);
+    }
+}
+
 static void
 reg_print_depvars (FILE *fp, pspp_linreg_cache *c)
 {
@@ -517,6 +579,7 @@ subcommand_export (int export, pspp_linreg_cache *c)
       fprintf (fp, "%s", reg_preamble);
       fprintf (fp, "#include \n#include \n\n");
       reg_print_getvar (fp, c);
+      reg_print_categorical_encoding (fp, c);
fprintf (fp, "%s", reg_export_t_quantiles_1);
       increment = 0.5 / (double) increment;
       for (i = 0; i < n_quantiles - 1; i++)
diff --git a/src/regression_export.h b/src/regression_export.h
index e7f880e3..8798027a 100644
--- a/src/regression_export.h
+++ b/src/regression_export.h
@@ -115,4 +115,22 @@ const char reg_export_prediction_interval_3[] = " + pspp_reg_variance (var_vals, 
 "\n\tresult *= pspp_reg_t_quantile ((1.0 + p) / 2.0);\n\t"
 "result += pspp_reg_estimate (var_vals, var_names);\n\treturn result;\n}\n";
 
+/*
+  Source text for the exported routine that changes categorical values
+  to binary vectors.  A categorical variable with n values is mapped to
+  a vector with n-1 entries: value 0 to the zero vector, value 1 to the
+  vector whose first entry is 1 and all others 0, and so on.  For
+  example, if a variable can have 'a', 'b' or 'c' as values, then 'a'
+  is encoded as (0,0), 'b' as (1,0) and 'c' as (0,1).  If the design
+  matrix used to create the model used a different encoding, then
+  pspp_reg_categorical_encode () will return a vector which does not
+  match the encoding of that categorical value in the model.
+ */
+const char reg_export_categorical_encode_1[] = "struct pspp_reg_categorical_variable\n"
+"{\n\tchar * name;\n\tsize_t n_vals;\n\tchar **values;\n};\n\n"
+"static\ndouble * get_value_vector (char *v)\n{\n\tdouble *result;\n\t";
+
+const char reg_export_categorical_encode_2[] = "; i++)\n\t{\n\t\tif (strcmp (v, values[i]) == 0)"
+"\n\t\t{\n\t\t\tresult[i] = 1.0;\n\t\t}\n\t}\n\treturn result;\n}\n";
+
 #endif
-- 
2.30.2