From: Ben Pfaff Date: Mon, 27 Sep 2021 01:51:05 +0000 (-0700) Subject: some basics of mcovnert work X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=422944fc7812cf13e616348412fecdbd85670e06;p=pspp some basics of mcovnert work --- diff --git a/doc/matrices.texi b/doc/matrices.texi index 4dafc02ab4..15d1e914c5 100644 --- a/doc/matrices.texi +++ b/doc/matrices.texi @@ -58,9 +58,9 @@ order. This column is blank for vector data. @cmd{MATRIX DATA} makes variables, but at least 8 bytes. @item -One or more continuous variables. These are the variables whose data -was analyzed to produce the matrices. @cmd{MATRIX DATA} assigns -continuous variables format F10.4. +One or more numeric continuous variables. These are the variables +whose data was analyzed to produce the matrices. @cmd{MATRIX DATA} +assigns continuous variables format F10.4. @end enumerate Case weights are ignored in matrix files. diff --git a/src/language/data-io/matrix-reader.c b/src/language/data-io/matrix-reader.c index cf7b524480..a1ef1e2331 100644 --- a/src/language/data-io/matrix-reader.c +++ b/src/language/data-io/matrix-reader.c @@ -93,81 +93,75 @@ matrix_material_uninit (struct matrix_material *mm) gsl_matrix_free (mm->var_matrix); } -struct matrix_reader +static const struct variable * +find_matrix_string_var (const struct dictionary *dict, const char *name) { - const struct dictionary *dict; - const struct variable *varname; - const struct variable *rowtype; - struct casegrouper *grouper; -}; - -struct matrix_reader * -create_matrix_reader_from_case_reader (const struct dictionary *dict, struct casereader *in_reader, - const struct variable ***vars, size_t *n_vars) -{ - struct matrix_reader *mr = xzalloc (sizeof *mr); - - mr->varname = dict_lookup_var (dict, "varname_"); - mr->dict = dict; - if (mr->varname == NULL) + const struct variable *var = dict_lookup_var (dict, name); + if (var == NULL) { - msg (ME, _("Matrix dataset lacks a variable called %s."), "VARNAME_"); - free (mr); + msg (ME, _("Matrix dataset lacks a variable called %s."), name); return NULL; } - - if (!var_is_alpha (mr->varname)) + if (!var_is_alpha (var)) { - msg (ME, _("Matrix dataset variable %s should be of string type."), - "VARNAME_"); - free (mr); + msg (ME, _("Matrix dataset variable %s should be of string type."), name); return NULL; } + return var; +} - mr->rowtype = dict_lookup_var (dict, "rowtype_"); - if (mr->rowtype == NULL) - { - msg (ME, _("Matrix dataset lacks a variable called %s."), "ROWTYPE_"); - free (mr); - return NULL; - } +struct matrix_reader * +matrix_reader_create (const struct dictionary *dict, + struct casereader *in_reader) +{ + const struct variable *varname = find_matrix_string_var (dict, "VARNAME_"); + const struct variable *rowtype = find_matrix_string_var (dict, "ROWTYPE_"); + if (!varname || !rowtype) + return NULL; - if (!var_is_alpha (mr->rowtype)) + for (size_t i = 0; i < dict_get_var_cnt (dict); i++) { - msg (ME, _("Matrix dataset variable %s should be of string type."), - "ROWTYPE_"); - free (mr); - return NULL; + const struct variable *v = dict_get_var (dict, i); + if (!var_is_numeric (v) && v != rowtype && v != varname) + { + msg (ME, _("Matrix dataset variable %s should be numeric."), + var_get_name (v)); + return NULL; + } } size_t dvarcnt; const struct variable **dvars = NULL; dict_get_vars (dict, &dvars, &dvarcnt, DC_SCRATCH); - if (n_vars) - *n_vars = dvarcnt - var_get_dict_index (mr->varname) - 1; - - if (vars) + /* Continuous variables and split variables. */ + const struct variable **cvars = dvars + var_get_dict_index (varname) + 1; + size_t n_cvars = dvarcnt - var_get_dict_index (varname) - 1; + const struct variable **svars = dvars; + size_t n_svars = var_get_dict_index (rowtype); + if (!n_cvars) { - int i; - *vars = xcalloc (*n_vars, sizeof (struct variable **)); - - for (i = 0; i < *n_vars; ++i) - { - (*vars)[i] = dvars[i + var_get_dict_index (mr->varname) + 1]; - } + msg (ME, _("Matrix dataset does not have any continuous variables.")); + free (dvars); + return NULL; } - /* All the variables before ROWTYPE_ (if any) are split variables */ - mr->grouper = casegrouper_create_vars (in_reader, dvars, var_get_dict_index (mr->rowtype)); - + struct matrix_reader *mr = xmalloc (sizeof *mr); + *mr = (struct matrix_reader) { + .n_cvars = n_cvars, + .cvars = xmemdup (cvars, n_cvars * sizeof *cvars), + .rowtype = rowtype, + .varname = varname, + .dict = dict, + .grouper = casegrouper_create_vars (in_reader, svars, n_svars) + }; free (dvars); return mr; } bool -destroy_matrix_reader (struct matrix_reader *mr) +matrix_reader_destroy (struct matrix_reader *mr) { if (mr == NULL) return false; @@ -214,21 +208,42 @@ find_varname (const struct variable **vars, int n_vars, return -1; } -bool -next_matrix_from_reader (struct matrix_material *mm, - struct matrix_reader *mr, - const struct variable **vars, int n_vars) +struct substring +matrix_reader_get_string (const struct ccase *c, const struct variable *var) { - struct casereader *group; + struct substring s = case_ss (c, var); + ss_rtrim (&s, ss_cstr (CC_SPACES)); + return s; +} - assert (vars); +void +matrix_reader_set_string (struct ccase *c, const struct variable *var, + struct substring src) +{ + struct substring dst = case_ss (c, var); + for (size_t i = 0; i < dst.length; i++) + dst.string[i] = i < src.length ? src.string[i] : ' '; +} +bool +matrix_reader_next (struct matrix_material *mm, struct matrix_reader *mr, + struct casereader **groupp) +{ + struct casereader *group; if (!casegrouper_get_next_group (mr->grouper, &group)) { *mm = (struct matrix_material) MATRIX_MATERIAL_INIT; + if (groupp) + *groupp = NULL; return false; } + if (groupp) + *groupp = casereader_clone (group); + + const struct variable **vars = mr->cvars; + size_t n_vars = mr->n_cvars; + *mm = (struct matrix_material) { .n = gsl_matrix_calloc (n_vars, n_vars), .mean_matrix = gsl_matrix_calloc (n_vars, n_vars), @@ -251,8 +266,7 @@ next_matrix_from_reader (struct matrix_material *mm, struct ccase *c; for (; (c = casereader_read (group)); case_unref (c)) { - struct substring rowtype = case_ss (c, mr->rowtype); - ss_rtrim (&rowtype, ss_cstr (CC_SPACES)); + struct substring rowtype = matrix_reader_get_string (c, mr->rowtype); gsl_matrix *v = (ss_equals_case (rowtype, ss_cstr ("N")) ? mm->n @@ -303,7 +317,7 @@ next_matrix_from_reader (struct matrix_material *mm, for (size_t i = 0; i < N_MATRICES; i++) if (matrices[i].good_rows && matrices[i].good_rows != n_vars) - msg (SW, _("%s matrix has %d columns but %zu rows named variables " + msg (SW, _("%s matrix has %zu columns but %zu rows named variables " "to be analyzed (and %zu rows named unknown variables)."), matrices[i].name, n_vars, matrices[i].good_rows, matrices[i].bad_rows); @@ -314,10 +328,8 @@ next_matrix_from_reader (struct matrix_material *mm, int cmd_debug_matrix_read (struct lexer *lexer UNUSED, struct dataset *ds) { - const struct variable **vars; - size_t n_vars; - struct matrix_reader *mr = create_matrix_reader_from_case_reader ( - dataset_dict (ds), proc_open (ds), &vars, &n_vars); + struct matrix_reader *mr = matrix_reader_create (dataset_dict (ds), + proc_open (ds)); if (!mr) return CMD_FAILURE; @@ -348,9 +360,10 @@ cmd_debug_matrix_read (struct lexer *lexer UNUSED, struct dataset *ds) if (!i) pivot_category_create_leaf_rc (d->root, pivot_value_new_text ("Value"), PIVOT_RC_CORRELATION); - for (size_t j = 0; j < n_vars; j++) + for (size_t j = 0; j < mr->n_cvars; j++) pivot_category_create_leaf_rc ( - d->root, pivot_value_new_variable (vars[j]), PIVOT_RC_CORRELATION); + d->root, pivot_value_new_variable (mr->cvars[j]), + PIVOT_RC_CORRELATION); } struct pivot_dimension *stat = pivot_dimension_create (pt, PIVOT_AXIS_ROW, @@ -365,7 +378,7 @@ cmd_debug_matrix_read (struct lexer *lexer UNUSED, struct dataset *ds) int split_num = 0; struct matrix_material mm = MATRIX_MATERIAL_INIT; - while (next_matrix_from_reader (&mm, mr, vars, n_vars)) + while (matrix_reader_next (&mm, mr, NULL)) { pivot_category_create_leaf (split->root, pivot_value_new_integer (split_num + 1)); @@ -383,14 +396,14 @@ cmd_debug_matrix_read (struct lexer *lexer UNUSED, struct dataset *ds) { if (i == MM_COV || i == MM_CORR) { - for (size_t y = 0; y < n_vars; y++) - for (size_t x = 0; x < n_vars; x++) + for (size_t y = 0; y < mr->n_cvars; y++) + for (size_t x = 0; x < mr->n_cvars; x++) pivot_table_put4 ( pt, y + 1, x, i, split_num, pivot_value_new_number (gsl_matrix_get (m[i], y, x))); } else - for (size_t x = 0; x < n_vars; x++) + for (size_t x = 0; x < mr->n_cvars; x++) { double n = gsl_matrix_get (m[i], 0, x); if (i == MM_STDDEV) @@ -407,7 +420,6 @@ cmd_debug_matrix_read (struct lexer *lexer UNUSED, struct dataset *ds) proc_commit (ds); - destroy_matrix_reader (mr); - free (vars); + matrix_reader_destroy (mr); return CMD_SUCCESS; } diff --git a/src/language/data-io/matrix-reader.h b/src/language/data-io/matrix-reader.h index f3ac7000d5..a0e0bb8ea2 100644 --- a/src/language/data-io/matrix-reader.h +++ b/src/language/data-io/matrix-reader.h @@ -20,6 +20,24 @@ #include #include +struct casereader; +struct ccase; +struct dictionary; +struct matrix_reader; +struct variable; + +struct matrix_reader + { + const struct dictionary *dict; + struct casegrouper *grouper; + + /* Variables in 'dict'. */ + const struct variable **cvars; /* Continuous variables. */ + size_t n_cvars; + const struct variable *rowtype; /* ROWTYPE_. */ + const struct variable *varname; /* VARNAME_. */ + }; + struct matrix_material { gsl_matrix *corr; /* The correlation matrix */ @@ -34,22 +52,18 @@ struct matrix_material #define MATRIX_MATERIAL_INIT { .corr = NULL } void matrix_material_uninit (struct matrix_material *); -struct dictionary; -struct variable; -struct casereader; - - -struct matrix_reader; +struct matrix_reader *matrix_reader_create (const struct dictionary *, + struct casereader *); -struct matrix_reader *create_matrix_reader_from_case_reader (const struct dictionary *dict, - struct casereader *in_reader, - const struct variable ***vars, size_t *n_vars); +bool matrix_reader_destroy (struct matrix_reader *mr); -bool destroy_matrix_reader (struct matrix_reader *mr); +bool matrix_reader_next (struct matrix_material *mm, struct matrix_reader *mr, + struct casereader **groupp); -bool next_matrix_from_reader (struct matrix_material *mm, - struct matrix_reader *mr, - const struct variable **vars, int n_vars); +struct substring matrix_reader_get_string (const struct ccase *, + const struct variable *); +void matrix_reader_set_string (struct ccase *, const struct variable *, + struct substring); #endif diff --git a/src/language/data-io/mconvert.c b/src/language/data-io/mconvert.c index 4498041eeb..78a0d212d9 100644 --- a/src/language/data-io/mconvert.c +++ b/src/language/data-io/mconvert.c @@ -16,8 +16,16 @@ #include +#include + +#include "data/any-reader.h" +#include "data/any-writer.h" +#include "data/casereader.h" +#include "data/casewriter.h" #include "data/dataset.h" +#include "data/dictionary.h" #include "language/data-io/file-handle.h" +#include "language/data-io/matrix-reader.h" #include "language/lexer/lexer.h" #include "language/command.h" @@ -27,7 +35,7 @@ int cmd_mconvert (struct lexer *lexer, struct dataset *ds) { - bool append = false; + bool append UNUSED = false; struct file_handle *in = NULL; struct file_handle *out = NULL; while (lex_token (lexer) != T_ENDCMD) @@ -76,11 +84,124 @@ cmd_mconvert (struct lexer *lexer, struct dataset *ds) goto error; } - /* XXX */ + assert (in); + assert (out); + + struct dictionary *d; + struct casereader *cr = any_reader_open_and_decode (in, NULL, &d, NULL); + if (!cr) + goto error; + + struct matrix_reader *mr = matrix_reader_create (d, cr); + if (!mr) + { + casereader_destroy (cr); + dict_unref (d); + goto error; + } + + struct casewriter *cw = any_writer_open (out, d); + if (!cw) + { + matrix_reader_destroy (mr); + casereader_destroy (cr); + dict_unref (d); + goto error; + } + + for (;;) + { + struct matrix_material mm; + struct casereader *group; + if (!matrix_reader_next (&mm, mr, &group)) + break; + + struct ccase *model = NULL; + for (;;) + { + struct ccase *c = casereader_read (group); + if (!c) + break; + + if (!model) + { + struct substring rowtype + = matrix_reader_get_string (c, mr->rowtype); + if (ss_equals_case (rowtype, ss_cstr ("COV")) + || ss_equals_case (rowtype, ss_cstr ("CORR"))) + model = case_ref (c); + } + casewriter_write (cw, c); + } + + if (!model) + continue; + + if (mm.cov && !mm.corr) + { + assert (mm.cov->size1 == mr->n_cvars); + assert (mm.cov->size2 == mr->n_cvars); + + for (size_t y = 0; y < mr->n_cvars; y++) + { + struct ccase *c = case_clone (model); + for (size_t x = 0; x < mr->n_cvars; x++) + { + double d1 = gsl_matrix_get (mm.cov, x, x); + double d2 = gsl_matrix_get (mm.cov, y, y); + double cov = gsl_matrix_get (mm.cov, y, x); + *case_num_rw (c, mr->cvars[x]) = cov / sqrt (d1 * d2); + } + matrix_reader_set_string (c, mr->rowtype, ss_cstr ("CORR")); + matrix_reader_set_string (c, mr->varname, + ss_cstr (var_get_name (mr->cvars[y]))); + casewriter_write (cw, c); + } + + struct ccase *c = case_clone (model); + for (size_t x = 0; x < mr->n_cvars; x++) + { + double variance = gsl_matrix_get (mm.cov, x, x); + *case_num_rw (c, mr->cvars[x]) = sqrt (variance); + } + matrix_reader_set_string (c, mr->rowtype, ss_cstr ("STDDEV")); + matrix_reader_set_string (c, mr->varname, ss_empty ()); + casewriter_write (cw, c); + } + + if (mm.corr && !mm.cov) + { + assert (mm.corr->size1 == mr->n_cvars); + assert (mm.corr->size2 == mr->n_cvars); + + for (size_t y = 0; y < mr->n_cvars; y++) + { + struct ccase *c = case_clone (model); + for (size_t x = 0; x < mr->n_cvars; x++) + { + double d1 = gsl_matrix_get (mm.var_matrix, x, x); + double d2 = gsl_matrix_get (mm.var_matrix, y, y); + double corr = gsl_matrix_get (mm.corr, y, x); + *case_num_rw (c, mr->cvars[x]) = corr * sqrt (d1 * d2); + } + casewriter_write (cw, c); + } + } + + case_unref (model); + } + + matrix_reader_destroy (mr); + casewriter_destroy (cw); + fh_unref (in); + fh_unref (out); + dict_unref (d); + return CMD_SUCCESS; error: fh_unref (in); fh_unref (out); + dict_unref (d); return CMD_FAILURE; } diff --git a/src/language/stats/factor.c b/src/language/stats/factor.c index 9fa0a8ad25..09de685eed 100644 --- a/src/language/stats/factor.c +++ b/src/language/stats/factor.c @@ -1145,8 +1145,9 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) if (! lex_force_match (lexer, T_RPAREN)) goto error; - mr = create_matrix_reader_from_case_reader (dict, matrix_reader, - &factor.vars, &factor.n_vars); + mr = matrix_reader_create (dict, matrix_reader); + factor.vars = xmemdup (mr->cvars, mr->n_cvars * sizeof *mr->cvars); + factor.n_vars = mr->n_cvars; } else { @@ -1527,8 +1528,7 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) { struct idata *id = idata_alloc (factor.n_vars); - while (next_matrix_from_reader (&id->mm, mr, - factor.vars, factor.n_vars)) + while (matrix_reader_next (&id->mm, mr, NULL)) { do_factor_by_matrix (&factor, id); @@ -1546,13 +1546,12 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) if (! run_factor (ds, &factor)) goto error; - - destroy_matrix_reader (mr); + matrix_reader_destroy (mr); free (factor.vars); return CMD_SUCCESS; error: - destroy_matrix_reader (mr); + matrix_reader_destroy (mr); free (factor.vars); return CMD_FAILURE; } diff --git a/tests/automake.mk b/tests/automake.mk index e85170f75c..802666bfac 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -358,6 +358,7 @@ TESTSUITE_AT = \ tests/language/data-io/list.at \ tests/language/data-io/match-files.at \ tests/language/data-io/matrix-data.at \ + tests/language/data-io/mconvert.at \ tests/language/data-io/print-space.at \ tests/language/data-io/print.at \ tests/language/data-io/save.at \