/* PSPP - a program for statistical analysis.
- Copyright (C) 2017 Free Software Foundation, Inc.
+ Copyright (C) 2017, 2019 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include "matrix-reader.h"
#include <stdbool.h>
-
-#include <libpspp/hash-functions.h>
-#include <libpspp/message.h>
-#include <data/casegrouper.h>
-#include <data/casereader.h>
-#include <data/dictionary.h>
-#include <data/variable.h>
+#include <math.h>
+
+#include "data/casegrouper.h"
+#include "data/casereader.h"
+#include "data/data-out.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/format.h"
+#include "data/variable.h"
+#include "language/command.h"
+#include "libpspp/i18n.h"
+#include "libpspp/message.h"
+#include "libpspp/str.h"
+#include "output/pivot-table.h"
#include "gettext.h"
#define _(msgid) gettext (msgid)
#define N_(msgid) msgid
+struct lexer;
/*
This module interprets a "data matrix", typically generated by the command
*/
+void
+matrix_material_uninit (struct matrix_material *mm)
+{ /* Frees the five GSL matrices that MM points to: corr, cov, n,
+     mean_matrix and var_matrix.  MM itself is not freed and its
+     pointer members are not reset here. */
+ gsl_matrix_free (mm->corr);
+ gsl_matrix_free (mm->cov);
+ gsl_matrix_free (mm->n);
+ gsl_matrix_free (mm->mean_matrix);
+ gsl_matrix_free (mm->var_matrix);
+}
+\f
struct matrix_reader
{
const struct dictionary *dict; /* Dictionary in which "varname_" and "rowtype_" are looked up. */
const struct variable *varname; /* The "varname_" variable found in DICT. */
const struct variable *rowtype; /* The "rowtype_" variable found in DICT. */
struct casegrouper *grouper; /* Source of case groups; one group is fetched per matrix read. */
-
- gsl_matrix *n_vectors;
- gsl_matrix *mean_vectors;
- gsl_matrix *var_vectors;
-
- // gsl_matrix *correlation;
- // gsl_matrix *covariance;
};
struct matrix_reader *
{
struct matrix_reader *mr = xzalloc (sizeof *mr);
- mr->dict = dict;
mr->varname = dict_lookup_var (dict, "varname_");
+ mr->dict = dict;
if (mr->varname == NULL)
{
msg (ME, _("Matrix dataset lacks a variable called %s."), "VARNAME_");
return NULL;
}
+ if (!var_is_alpha (mr->varname))
+ {
+ msg (ME, _("Matrix dataset variable %s should be of string type."),
+ "VARNAME_");
+ free (mr);
+ return NULL;
+ }
+
mr->rowtype = dict_lookup_var (dict, "rowtype_");
if (mr->rowtype == NULL)
{
return NULL;
}
+ if (!var_is_alpha (mr->rowtype))
+ {
+ msg (ME, _("Matrix dataset variable %s should be of string type."),
+ "ROWTYPE_");
+ free (mr);
+ return NULL;
+ }
+
size_t dvarcnt;
const struct variable **dvars = NULL;
dict_get_vars (dict, &dvars, &dvarcnt, DC_SCRATCH);
if (vars)
{
int i;
- *vars = xcalloc (sizeof (struct variable **), *n_vars);
+ *vars = xcalloc (*n_vars, sizeof (struct variable **));
for (i = 0; i < *n_vars; ++i)
{
{
int col;
if (*matrix == NULL)
- *matrix = gsl_matrix_alloc (n_vars, n_vars);
+ {
+ *matrix = gsl_matrix_alloc (n_vars, n_vars);
+ gsl_matrix_set_all (*matrix, SYSMIS);
+ }
for (col = 0; col < n_vars; ++col)
{
const struct variable *cv = vars [col];
- double x = case_data (c, cv)->f;
+ double x = case_num (c, cv);
assert (col < (*matrix)->size2);
assert (mrow < (*matrix)->size1);
gsl_matrix_set (*matrix, mrow, col, x);
}
}
+static int
+find_varname (const struct variable **vars, int n_vars,
+ const char *varname)
+{ /* Linear search of VARS[0 .. N_VARS - 1] for a variable whose name
+     matches VARNAME under strcasecmp (case-insensitive).  Returns the
+     index of the first match, or -1 when no variable matches. */
+ for (int i = 0; i < n_vars; i++)
+ if (!strcasecmp (var_get_name (vars[i]), varname))
+ return i;
+ return -1;
+}
+
bool
next_matrix_from_reader (struct matrix_material *mm,
struct matrix_reader *mr,
assert (vars);
- gsl_matrix_free (mr->n_vectors);
- gsl_matrix_free (mr->mean_vectors);
- gsl_matrix_free (mr->var_vectors);
-
if (!casegrouper_get_next_group (mr->grouper, &group))
- return false;
-
- mr->n_vectors = gsl_matrix_alloc (n_vars, n_vars);
- mr->mean_vectors = gsl_matrix_alloc (n_vars, n_vars);
- mr->var_vectors = gsl_matrix_alloc (n_vars, n_vars);
+ { /* No further group: reset *MM to MATRIX_MATERIAL_INIT and report
+      that nothing was read. */
+ *mm = (struct matrix_material) MATRIX_MATERIAL_INIT;
+ return false;
+ }
- mm->n = mr->n_vectors;
- mm->mean_matrix = mr->mean_vectors;
- mm->var_matrix = mr->var_vectors;
+ *mm = (struct matrix_material) { /* Fresh N_VARS x N_VARS matrices
+     from gsl_matrix_calloc for this group.  Members not named here
+     (corr, cov) are zero-initialized, i.e. null pointers. */
+ .n = gsl_matrix_calloc (n_vars, n_vars),
+ .mean_matrix = gsl_matrix_calloc (n_vars, n_vars),
+ .var_matrix = gsl_matrix_calloc (n_vars, n_vars),
+ };
- // FIXME: Make this into a hash table.
- unsigned long *table = xmalloc (sizeof (*table) * n_vars);
- int i;
- for (i = 0; i < n_vars; ++i)
+ struct matrix /* Bookkeeping for one full-matrix row type (CORR or COV). */
{
- const int w = var_get_width (mr->varname);
- char s[w];
- memset (s, 0, w);
- const char *name = var_get_name (vars[i]);
- strcpy (s, name);
- unsigned long h = hash_bytes (s, w, 0);
- table[i] = h;
- }
+ const char *name; /* Row type keyword compared against the rowtype value. */
+ gsl_matrix **m; /* Address of the matching matrix pointer inside *MM. */
+ size_t good_rows; /* Rows whose varname matched one of VARS. */
+ size_t bad_rows; /* Rows whose varname matched none of VARS. */
+ };
+ struct matrix matrices[] = {
+ { .name = "CORR", .m = &mm->corr },
+ { .name = "COV", .m = &mm->cov },
+ };
+ enum { N_MATRICES = 2 }; /* Number of entries in matrices[]. */
struct ccase *c;
- for ( ; (c = casereader_read (group) ); case_unref (c))
+ for (; (c = casereader_read (group)); case_unref (c))
{
- const union value *uv = case_data (c, mr->rowtype);
- int col, row;
- for (col = 0; col < n_vars; ++col)
- {
- const struct variable *cv
- = vars ? vars[col] : dict_get_var (mr->dict, var_get_dict_index (mr->varname) + 1 + col);
- double x = case_data (c, cv)->f;
- if (0 == strncasecmp ((char *)value_str (uv, 8), "N ", 8))
- for (row = 0; row < n_vars; ++row)
- gsl_matrix_set (mr->n_vectors, row, col, x);
- else if (0 == strncasecmp ((char *) value_str (uv, 8), "MEAN ", 8))
- for (row = 0; row < n_vars; ++row)
- gsl_matrix_set (mr->mean_vectors, row, col, x);
- else if (0 == strncasecmp ((char *) value_str (uv, 8), "STDDEV ", 8))
- for (row = 0; row < n_vars; ++row)
- gsl_matrix_set (mr->var_vectors, row, col, x * x);
- }
+ struct substring rowtype = case_ss (c, mr->rowtype);
+ ss_rtrim (&rowtype, ss_cstr (CC_SPACES)); /* Strip trailing CC_SPACES characters before comparing keywords. */
+
+ gsl_matrix *v /* Destination for the vector row types N, MEAN and STDDEV; NULL for any other row type. */
+ = (ss_equals_case (rowtype, ss_cstr ("N")) ? mm->n
+ : ss_equals_case (rowtype, ss_cstr ("MEAN")) ? mm->mean_matrix
+ : ss_equals_case (rowtype, ss_cstr ("STDDEV")) ? mm->var_matrix
+ : NULL);
+ if (v)
+ {
+ for (int x = 0; x < n_vars; ++x)
+ {
+ double n = case_num (c, vars[x]);
+ if (v == mm->var_matrix) /* STDDEV values are stored squared, as variances. */
+ n *= n;
+ for (int y = 0; y < n_vars; ++y) /* Replicate the value down column X so every row of V is identical. */
+ gsl_matrix_set (v, y, x, n);
+ }
+ continue;
+ }
+
+ struct matrix *m = NULL;
+ for (size_t i = 0; i < N_MATRICES; i++)
+ if (ss_equals_case (rowtype, ss_cstr (matrices[i].name)))
+ {
+ m = &matrices[i];
+ break;
+ }
+ if (m) /* Rows that are neither vector types nor CORR/COV fall through and are ignored. */
+ {
+ struct substring varname_raw = case_ss (c, mr->varname);
+ struct substring varname = ss_cstr (
+ recode_string (UTF8, dict_get_encoding (mr->dict),
+ varname_raw.string, varname_raw.length)); /* Presumably re-encodes the value from the dictionary's encoding into UTF-8 -- confirm recode_string's argument order. */
+ ss_rtrim (&varname, ss_cstr (CC_SPACES));
+ varname.string[varname.length] = '\0'; /* Re-terminate after trimming so it can be passed as a C string. */
+
+ int y = find_varname (vars, n_vars, varname.string);
+ if (y >= 0)
+ {
+ m->good_rows++;
+ matrix_fill_row (m->m, c, y, vars, n_vars);
+ }
+ else
+ m->bad_rows++;
+ ss_dealloc (&varname); /* Presumably releases the buffer returned by recode_string -- confirm ownership. */
+ }
+ }
+ casereader_destroy (group);
- const union value *uvv = case_data (c, mr->varname);
- const uint8_t *vs = value_str (uvv, var_get_width (mr->varname));
- int w = var_get_width (mr->varname);
- unsigned long h = hash_bytes (vs, w, 0);
+ for (size_t i = 0; i < N_MATRICES; i++)
+ if (matrices[i].good_rows && matrices[i].good_rows != n_vars) /* Warn only when some, but not exactly N_VARS, rows matched. */
+ msg (SW, _("%s matrix has %d columns but %zu rows named variables "
+ "to be analyzed (and %zu rows named unknown variables)."),
+ matrices[i].name, n_vars, matrices[i].good_rows,
+ matrices[i].bad_rows); /* NOTE(review): %d assumes n_vars is an int; its declaration is not visible here -- confirm. */
- int mrow = -1;
- for (i = 0; i < n_vars; ++i)
- {
- if (table[i] == h)
- {
- mrow = i;
- break;
- }
- }
+ return true;
+}
- if (mrow == -1)
- continue;
+int
+cmd_debug_matrix_read (struct lexer *lexer UNUSED, struct dataset *ds)
+{ /* Debugging command: reads every matrix group from DS through a
+     matrix_reader and submits a "Debug Matrix Reader" pivot table
+     listing the values found.  LEXER is unused.  Returns CMD_FAILURE
+     when no matrix reader can be created, otherwise CMD_SUCCESS. */
+ const struct variable **vars;
+ size_t n_vars;
+ struct matrix_reader *mr = create_matrix_reader_from_case_reader (
+ dataset_dict (ds), proc_open (ds), &vars, &n_vars);
+ if (!mr) /* NOTE(review): proc_open (ds) has already run and this path never calls proc_commit -- confirm the failure path cleans up the reader. */
+ return CMD_FAILURE;
- if (0 == strncasecmp ((char *) value_str (uv, 8), "CORR ", 8))
- {
- matrix_fill_row (&mm->corr, c, mrow, vars, n_vars);
- }
- else if (0 == strncasecmp ((char *) value_str (uv, 8), "COV ", 8))
- {
- matrix_fill_row (&mm->cov, c, mrow, vars, n_vars);
- }
+ struct pivot_table *pt = pivot_table_create ("Debug Matrix Reader");
+
+ enum mm_stat /* Index of each statistic in mm_stat_names[] and in the per-group m[] array. */
+ {
+ MM_CORR,
+ MM_COV,
+ MM_N,
+ MM_MEAN,
+ MM_STDDEV,
+ };
+ const char *mm_stat_names[] = {
+ [MM_CORR] = "Correlation",
+ [MM_COV] = "Covariance",
+ [MM_N] = "N",
+ [MM_MEAN] = "Mean",
+ [MM_STDDEV] = "Standard Deviation",
+ };
+ enum { N_STATS = sizeof mm_stat_names / sizeof *mm_stat_names };
+ for (size_t i = 0; i < 2; i++) /* i == 0 builds the "Row" dimension on the row axis, i == 1 the "Column" dimension on the column axis. */
+ {
+ struct pivot_dimension *d = pivot_dimension_create (
+ pt,
+ i ? PIVOT_AXIS_COLUMN : PIVOT_AXIS_ROW,
+ i ? "Column" : "Row");
+ if (!i) /* Only "Row" gets a leading "Value" leaf; the vector statistics below are written at row index 0 and matrix rows at y + 1. */
+ pivot_category_create_leaf_rc (d->root, pivot_value_new_text ("Value"),
+ PIVOT_RC_CORRELATION);
+ for (size_t j = 0; j < n_vars; j++)
+ pivot_category_create_leaf_rc (
+ d->root, pivot_value_new_variable (vars[j]), PIVOT_RC_CORRELATION);
}
- casereader_destroy (group);
+ struct pivot_dimension *stat = pivot_dimension_create (pt, PIVOT_AXIS_ROW,
+ "Statistic");
+ for (size_t i = 0; i < N_STATS; i++)
+ pivot_category_create_leaf (stat->root,
+ pivot_value_new_text (mm_stat_names[i]));
- free (table);
+ struct pivot_dimension *split = pivot_dimension_create (
+ pt, PIVOT_AXIS_ROW, "Split");
- return true;
+ int split_num = 0; /* Zero-based index of the group being emitted; its "Split" leaf is labelled split_num + 1. */
+
+ struct matrix_material mm = MATRIX_MATERIAL_INIT;
+ while (next_matrix_from_reader (&mm, mr, vars, n_vars))
+ {
+ pivot_category_create_leaf (split->root,
+ pivot_value_new_integer (split_num + 1));
+
+ const gsl_matrix *m[N_STATS] = {
+ [MM_CORR] = mm.corr,
+ [MM_COV] = mm.cov,
+ [MM_N] = mm.n,
+ [MM_MEAN] = mm.mean_matrix,
+ [MM_STDDEV] = mm.var_matrix,
+ };
+
+ for (size_t i = 0; i < N_STATS; i++)
+ if (m[i]) /* Statistics whose matrix pointer is null are skipped. */
+ {
+ if (i == MM_COV || i == MM_CORR)
+ { /* Full matrices: emit every (y, x) cell, shifted down one row to leave index 0 for the "Value" leaf. */
+ for (size_t y = 0; y < n_vars; y++)
+ for (size_t x = 0; x < n_vars; x++)
+ pivot_table_put4 (
+ pt, y + 1, x, i, split_num,
+ pivot_value_new_number (gsl_matrix_get (m[i], y, x)));
+ }
+ else
+ for (size_t x = 0; x < n_vars; x++)
+ {
+ double n = gsl_matrix_get (m[i], 0, x); /* Vector statistics: only matrix row 0 is read. */
+ if (i == MM_STDDEV) /* The value comes from var_matrix, so take the square root to show a standard deviation. */
+ n = sqrt (n);
+ pivot_table_put4 (pt, 0, x, i, split_num,
+ pivot_value_new_number (n));
+ }
+ }
+
+ split_num++;
+ matrix_material_uninit (&mm); /* Free this group's matrices before the next read overwrites MM. */
+ }
+ pivot_table_submit (pt);
+
+ proc_commit (ds);
+
+ destroy_matrix_reader (mr);
+ free (vars);
+ return CMD_SUCCESS;
}