X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fmatrix-reader.c;h=a1ef1e2331efaf2db6525e58083c217c42a73fa2;hb=422944fc7812cf13e616348412fecdbd85670e06;hp=c0cee0474310b5b1ad3e9a47c792d6e316baad34;hpb=1c34ef6b7f0a6af23141cb117429ab46f8c3a6c5;p=pspp diff --git a/src/language/data-io/matrix-reader.c b/src/language/data-io/matrix-reader.c index c0cee04743..a1ef1e2331 100644 --- a/src/language/data-io/matrix-reader.c +++ b/src/language/data-io/matrix-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2017 Free Software Foundation, Inc. + Copyright (C) 2017, 2019 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,18 +19,26 @@ #include "matrix-reader.h" #include - -#include -#include -#include -#include -#include -#include +#include + +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/data-out.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/variable.h" +#include "language/command.h" +#include "libpspp/i18n.h" +#include "libpspp/message.h" +#include "libpspp/str.h" +#include "output/pivot-table.h" #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid +struct lexer; /* This module interprets a "data matrix", typically generated by the command @@ -75,83 +83,85 @@ s_0 ROWTYPE_ VARNAME_ v_0 v_1 v_2 */ -struct matrix_reader +void +matrix_material_uninit (struct matrix_material *mm) { - const struct variable *varname; - const struct variable *rowtype; - struct casegrouper *grouper; - - gsl_matrix *n_vectors; - gsl_matrix *mean_vectors; - gsl_matrix *var_vectors; -}; - -struct matrix_reader * -create_matrix_reader_from_case_reader (const struct dictionary *dict, struct casereader *in_reader, - const struct variable ***vars, size_t *n_vars) + gsl_matrix_free (mm->corr); + gsl_matrix_free (mm->cov); + gsl_matrix_free (mm->n); + gsl_matrix_free (mm->mean_matrix); + gsl_matrix_free (mm->var_matrix); +} + +static const struct variable * +find_matrix_string_var (const struct dictionary *dict, const char *name) { - struct matrix_reader *mr = xzalloc (sizeof *mr); - - mr->varname = dict_lookup_var (dict, "varname_"); - if (mr->varname == NULL) + const struct variable *var = dict_lookup_var (dict, name); + if (var == NULL) { - msg (ME, _("Matrix dataset lacks a variable called %s."), "VARNAME_"); - free (mr); + msg (ME, _("Matrix dataset lacks a variable called %s."), name); return NULL; } - - if (!var_is_alpha (mr->varname)) + if (!var_is_alpha (var)) { - msg (ME, _("Matrix dataset variable %s should be of string type."), - "VARNAME_"); - free (mr); + msg (ME, _("Matrix dataset variable %s should be of string type."), name); return NULL; } + return var; +} - mr->rowtype = dict_lookup_var (dict, "rowtype_"); - if (mr->rowtype == NULL) - { - msg (ME, _("Matrix dataset lacks a variable called %s."), "ROWTYPE_"); - free (mr); - return NULL; - } +struct matrix_reader * +matrix_reader_create (const struct dictionary *dict, + struct casereader *in_reader) +{ + const struct variable *varname = find_matrix_string_var (dict, "VARNAME_"); + const struct variable *rowtype = find_matrix_string_var (dict, "ROWTYPE_"); + if (!varname || !rowtype) + return NULL; - if (!var_is_alpha (mr->rowtype)) + for (size_t i = 0; i < dict_get_var_cnt (dict); i++) { - msg (ME, _("Matrix dataset variable %s should be of string type."), - "ROWTYPE_"); - free (mr); - return NULL; + const struct variable *v = dict_get_var (dict, i); + if (!var_is_numeric (v) && v != rowtype && v != varname) + { + msg (ME, _("Matrix dataset variable %s should be numeric."), + var_get_name (v)); + return NULL; + } } size_t dvarcnt; const struct variable **dvars = NULL; dict_get_vars (dict, &dvars, &dvarcnt, DC_SCRATCH); - if (n_vars) - *n_vars = dvarcnt - var_get_dict_index (mr->varname) - 1; - - if (vars) + /* Continuous variables and split variables. */ + const struct variable **cvars = dvars + var_get_dict_index (varname) + 1; + size_t n_cvars = dvarcnt - var_get_dict_index (varname) - 1; + const struct variable **svars = dvars; + size_t n_svars = var_get_dict_index (rowtype); + if (!n_cvars) { - int i; - *vars = xcalloc (sizeof (struct variable **), *n_vars); - - for (i = 0; i < *n_vars; ++i) - { - (*vars)[i] = dvars[i + var_get_dict_index (mr->varname) + 1]; - } + msg (ME, _("Matrix dataset does not have any continuous variables.")); + free (dvars); + return NULL; } - /* All the variables before ROWTYPE_ (if any) are split variables */ - mr->grouper = casegrouper_create_vars (in_reader, dvars, var_get_dict_index (mr->rowtype)); - + struct matrix_reader *mr = xmalloc (sizeof *mr); + *mr = (struct matrix_reader) { + .n_cvars = n_cvars, + .cvars = xmemdup (cvars, n_cvars * sizeof *cvars), + .rowtype = rowtype, + .varname = varname, + .dict = dict, + .grouper = casegrouper_create_vars (in_reader, svars, n_svars) + }; free (dvars); return mr; } bool -destroy_matrix_reader (struct matrix_reader *mr) +matrix_reader_destroy (struct matrix_reader *mr) { if (mr == NULL) return false; @@ -173,107 +183,243 @@ matrix_fill_row (gsl_matrix **matrix, { int col; if (*matrix == NULL) - *matrix = gsl_matrix_alloc (n_vars, n_vars); + { + *matrix = gsl_matrix_alloc (n_vars, n_vars); + gsl_matrix_set_all (*matrix, SYSMIS); + } for (col = 0; col < n_vars; ++col) { const struct variable *cv = vars [col]; - double x = case_data (c, cv)->f; + double x = case_num (c, cv); assert (col < (*matrix)->size2); assert (mrow < (*matrix)->size1); gsl_matrix_set (*matrix, mrow, col, x); } } -bool -next_matrix_from_reader (struct matrix_material *mm, - struct matrix_reader *mr, - const struct variable **vars, int n_vars) +static int +find_varname (const struct variable **vars, int n_vars, + const char *varname) { - struct casereader *group; + for (int i = 0; i < n_vars; i++) + if (!strcasecmp (var_get_name (vars[i]), varname)) + return i; + return -1; +} - assert (vars); +struct substring +matrix_reader_get_string (const struct ccase *c, const struct variable *var) +{ + struct substring s = case_ss (c, var); + ss_rtrim (&s, ss_cstr (CC_SPACES)); + return s; +} - gsl_matrix_free (mr->n_vectors); - gsl_matrix_free (mr->mean_vectors); - gsl_matrix_free (mr->var_vectors); +void +matrix_reader_set_string (struct ccase *c, const struct variable *var, + struct substring src) +{ + struct substring dst = case_ss (c, var); + for (size_t i = 0; i < dst.length; i++) + dst.string[i] = i < src.length ? src.string[i] : ' '; +} +bool +matrix_reader_next (struct matrix_material *mm, struct matrix_reader *mr, + struct casereader **groupp) +{ + struct casereader *group; if (!casegrouper_get_next_group (mr->grouper, &group)) - return false; + { + *mm = (struct matrix_material) MATRIX_MATERIAL_INIT; + if (groupp) + *groupp = NULL; + return false; + } - mr->n_vectors = gsl_matrix_alloc (n_vars, n_vars); - mr->mean_vectors = gsl_matrix_alloc (n_vars, n_vars); - mr->var_vectors = gsl_matrix_alloc (n_vars, n_vars); + if (groupp) + *groupp = casereader_clone (group); - mm->n = mr->n_vectors; - mm->mean_matrix = mr->mean_vectors; - mm->var_matrix = mr->var_vectors; + const struct variable **vars = mr->cvars; + size_t n_vars = mr->n_cvars; - // FIXME: Make this into a hash table. - unsigned long *table = xmalloc (sizeof (*table) * n_vars); - int i; - for (i = 0; i < n_vars; ++i) + *mm = (struct matrix_material) { + .n = gsl_matrix_calloc (n_vars, n_vars), + .mean_matrix = gsl_matrix_calloc (n_vars, n_vars), + .var_matrix = gsl_matrix_calloc (n_vars, n_vars), + }; + + struct matrix { - const int w = var_get_width (mr->varname); - char s[w]; - memset (s, 0, w); - const char *name = var_get_name (vars[i]); - strncpy (s, name, w); - unsigned long h = hash_bytes (s, w, 0); - table[i] = h; - } + const char *name; + gsl_matrix **m; + size_t good_rows; + size_t bad_rows; + }; + struct matrix matrices[] = { + { .name = "CORR", .m = &mm->corr }, + { .name = "COV", .m = &mm->cov }, + }; + enum { N_MATRICES = 2 }; struct ccase *c; - for ( ; (c = casereader_read (group) ); case_unref (c)) + for (; (c = casereader_read (group)); case_unref (c)) { - const union value *uv = case_data (c, mr->rowtype); - int col, row; - for (col = 0; col < n_vars; ++col) - { - const struct variable *cv = vars[col]; - double x = case_data (c, cv)->f; - if (0 == strncasecmp ((char *)value_str (uv, 8), "N ", 8)) - for (row = 0; row < n_vars; ++row) - gsl_matrix_set (mr->n_vectors, row, col, x); - else if (0 == strncasecmp ((char *) value_str (uv, 8), "MEAN ", 8)) - for (row = 0; row < n_vars; ++row) - gsl_matrix_set (mr->mean_vectors, row, col, x); - else if (0 == strncasecmp ((char *) value_str (uv, 8), "STDDEV ", 8)) - for (row = 0; row < n_vars; ++row) - gsl_matrix_set (mr->var_vectors, row, col, x * x); - } - - const union value *uvv = case_data (c, mr->varname); - const uint8_t *vs = value_str (uvv, var_get_width (mr->varname)); - int w = var_get_width (mr->varname); - unsigned long h = hash_bytes (vs, w, 0); - - int mrow = -1; - for (i = 0; i < n_vars; ++i) - { - if (table[i] == h) - { - mrow = i; - break; - } - } - - if (mrow == -1) - continue; - - if (0 == strncasecmp ((char *) value_str (uv, 8), "CORR ", 8)) - { - matrix_fill_row (&mm->corr, c, mrow, vars, n_vars); - } - else if (0 == strncasecmp ((char *) value_str (uv, 8), "COV ", 8)) - { - matrix_fill_row (&mm->cov, c, mrow, vars, n_vars); - } + struct substring rowtype = matrix_reader_get_string (c, mr->rowtype); + + gsl_matrix *v + = (ss_equals_case (rowtype, ss_cstr ("N")) ? mm->n + : ss_equals_case (rowtype, ss_cstr ("MEAN")) ? mm->mean_matrix + : ss_equals_case (rowtype, ss_cstr ("STDDEV")) ? mm->var_matrix + : NULL); + if (v) + { + for (int x = 0; x < n_vars; ++x) + { + double n = case_num (c, vars[x]); + if (v == mm->var_matrix) + n *= n; + for (int y = 0; y < n_vars; ++y) + gsl_matrix_set (v, y, x, n); + } + continue; + } + + struct matrix *m = NULL; + for (size_t i = 0; i < N_MATRICES; i++) + if (ss_equals_case (rowtype, ss_cstr (matrices[i].name))) + { + m = &matrices[i]; + break; + } + if (m) + { + struct substring varname_raw = case_ss (c, mr->varname); + struct substring varname = ss_cstr ( + recode_string (UTF8, dict_get_encoding (mr->dict), + varname_raw.string, varname_raw.length)); + ss_rtrim (&varname, ss_cstr (CC_SPACES)); + varname.string[varname.length] = '\0'; + + int y = find_varname (vars, n_vars, varname.string); + if (y >= 0) + { + m->good_rows++; + matrix_fill_row (m->m, c, y, vars, n_vars); + } + else + m->bad_rows++; + ss_dealloc (&varname); + } } - casereader_destroy (group); - free (table); + for (size_t i = 0; i < N_MATRICES; i++) + if (matrices[i].good_rows && matrices[i].good_rows != n_vars) + msg (SW, _("%s matrix has %zu columns but %zu rows named variables " + "to be analyzed (and %zu rows named unknown variables)."), + matrices[i].name, n_vars, matrices[i].good_rows, + matrices[i].bad_rows); return true; } + +int +cmd_debug_matrix_read (struct lexer *lexer UNUSED, struct dataset *ds) +{ + struct matrix_reader *mr = matrix_reader_create (dataset_dict (ds), + proc_open (ds)); + if (!mr) + return CMD_FAILURE; + + struct pivot_table *pt = pivot_table_create ("Debug Matrix Reader"); + + enum mm_stat + { + MM_CORR, + MM_COV, + MM_N, + MM_MEAN, + MM_STDDEV, + }; + const char *mm_stat_names[] = { + [MM_CORR] = "Correlation", + [MM_COV] = "Covariance", + [MM_N] = "N", + [MM_MEAN] = "Mean", + [MM_STDDEV] = "Standard Deviation", + }; + enum { N_STATS = sizeof mm_stat_names / sizeof *mm_stat_names }; + for (size_t i = 0; i < 2; i++) + { + struct pivot_dimension *d = pivot_dimension_create ( + pt, + i ? PIVOT_AXIS_COLUMN : PIVOT_AXIS_ROW, + i ? "Column" : "Row"); + if (!i) + pivot_category_create_leaf_rc (d->root, pivot_value_new_text ("Value"), + PIVOT_RC_CORRELATION); + for (size_t j = 0; j < mr->n_cvars; j++) + pivot_category_create_leaf_rc ( + d->root, pivot_value_new_variable (mr->cvars[j]), + PIVOT_RC_CORRELATION); + } + + struct pivot_dimension *stat = pivot_dimension_create (pt, PIVOT_AXIS_ROW, + "Statistic"); + for (size_t i = 0; i < N_STATS; i++) + pivot_category_create_leaf (stat->root, + pivot_value_new_text (mm_stat_names[i])); + + struct pivot_dimension *split = pivot_dimension_create ( + pt, PIVOT_AXIS_ROW, "Split"); + + int split_num = 0; + + struct matrix_material mm = MATRIX_MATERIAL_INIT; + while (matrix_reader_next (&mm, mr, NULL)) + { + pivot_category_create_leaf (split->root, + pivot_value_new_integer (split_num + 1)); + + const gsl_matrix *m[N_STATS] = { + [MM_CORR] = mm.corr, + [MM_COV] = mm.cov, + [MM_N] = mm.n, + [MM_MEAN] = mm.mean_matrix, + [MM_STDDEV] = mm.var_matrix, + }; + + for (size_t i = 0; i < N_STATS; i++) + if (m[i]) + { + if (i == MM_COV || i == MM_CORR) + { + for (size_t y = 0; y < mr->n_cvars; y++) + for (size_t x = 0; x < mr->n_cvars; x++) + pivot_table_put4 ( + pt, y + 1, x, i, split_num, + pivot_value_new_number (gsl_matrix_get (m[i], y, x))); + } + else + for (size_t x = 0; x < mr->n_cvars; x++) + { + double n = gsl_matrix_get (m[i], 0, x); + if (i == MM_STDDEV) + n = sqrt (n); + pivot_table_put4 (pt, 0, x, i, split_num, + pivot_value_new_number (n)); + } + } + + split_num++; + matrix_material_uninit (&mm); + } + pivot_table_submit (pt); + + proc_commit (ds); + + matrix_reader_destroy (mr); + return CMD_SUCCESS; +}