X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Ffactor.c;h=e67cdbc1d8afaa6ea1ba07ea73f09efc7f9149ab;hb=6e5736d55e61f5dca727428213a835998eeacead;hp=a2a8e5df037a15f0a67361d079002d7f70bc33e7;hpb=5f68c60b8283f6a410de20f927e9b12792ea58b3;p=pspp-builds.git diff --git a/src/language/stats/factor.c b/src/language/stats/factor.c index a2a8e5df..e67cdbc1 100644 --- a/src/language/stats/factor.c +++ b/src/language/stats/factor.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,38 +16,33 @@ #include - #include #include #include #include #include #include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include +#include + +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/casewriter.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/subcase.h" +#include "language/command.h" +#include "language/lexer/lexer.h" +#include "language/lexer/value-parser.h" +#include "language/lexer/variable-parser.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "math/correlation.h" +#include "math/covariance.h" +#include "math/moments.h" +#include "output/chart-item.h" +#include "output/charts/scree.h" +#include "output/tab.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -175,7 +170,7 @@ struct idata /* Intermediate values used in calculation */ const gsl_matrix *corr ; /* The correlation matrix */ - const gsl_matrix *cov ; /* The covariance matrix */ + gsl_matrix *cov ; /* The covariance matrix */ const gsl_matrix *n ; /* Matrix of number of samples */ gsl_vector *eval ; /* The eigenvalues */ @@ -184,6 +179,8 @@ struct idata int n_extractions; gsl_vector *msr ; /* Multiple Squared Regressions */ + + double detR; /* The determinant of the correlation matrix */ }; static struct idata * @@ -206,11 +203,61 @@ idata_free (struct idata *id) gsl_vector_free (id->msr); gsl_vector_free (id->eval); gsl_matrix_free (id->evec); + if (id->cov != NULL) + gsl_matrix_free (id->cov); free (id); } +static gsl_matrix * +anti_image (const gsl_matrix *m) +{ + int i, j; + gsl_matrix *a; + assert (m->size1 == m->size2); + + a = gsl_matrix_alloc (m->size1, m->size2); + + for (i = 0; i < m->size1; ++i) + { + for (j = 0; j < m->size2; ++j) + { + double *p = gsl_matrix_ptr (a, i, j); + *p = gsl_matrix_get (m, i, j); + *p /= gsl_matrix_get (m, i, i); + *p /= gsl_matrix_get (m, j, j); + } + } + + return a; +} + + +/* Return the sum of all the elements excluding row N */ +static double +ssq_od_n (const gsl_matrix *m, int n) +{ + int i, j; + double ss = 0; + assert (m->size1 == m->size2); + + assert (n < m->size1); + + for (i = 0; i < m->size1; ++i) + { + if (i == n ) continue; + for (j = 0; j < m->size2; ++j) + { + ss += pow2 (gsl_matrix_get (m, i, j)); + } + } + + return ss; +} + + + #if 0 static void dump_matrix (const gsl_matrix *m) @@ -225,7 +272,6 @@ dump_matrix (const gsl_matrix *m) } } - static void dump_matrix_permute (const gsl_matrix *m, const gsl_permutation *p) { @@ -788,14 +834,14 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) factor.wv = dict_get_weight (dict); - lex_match (lexer, '/'); + lex_match (lexer, T_SLASH); if (!lex_force_match_id (lexer, "VARIABLES")) { goto error; } - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!parse_variables_const (lexer, dict, &factor.vars, &factor.n_vars, PV_NO_DUPLICATE | PV_NUMERIC)) @@ -804,14 +850,14 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) if (factor.n_vars < 2) msg (MW, _("Factor analysis on a single variable is not useful.")); - while (lex_token (lexer) != '.') + while (lex_token (lexer) != T_ENDCMD) { - lex_match (lexer, '/'); + lex_match (lexer, T_SLASH); if (lex_match_id (lexer, "PLOT")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "EIGEN")) { @@ -831,8 +877,8 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "METHOD")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "COVARIANCE")) { @@ -851,8 +897,8 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "ROTATION")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { /* VARIMAX and DEFAULT are defaults */ if (lex_match_id (lexer, "VARIMAX") || lex_match_id (lexer, "DEFAULT")) @@ -880,57 +926,57 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "CRITERIA")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "FACTORS")) { - if ( lex_force_match (lexer, '(')) + if ( lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); factor.n_factors = lex_integer (lexer); lex_get (lexer); - lex_force_match (lexer, ')'); + lex_force_match (lexer, T_RPAREN); } } else if (lex_match_id (lexer, "MINEIGEN")) { - if ( lex_force_match (lexer, '(')) + if ( lex_force_match (lexer, T_LPAREN)) { lex_force_num (lexer); factor.min_eigen = lex_number (lexer); lex_get (lexer); - lex_force_match (lexer, ')'); + lex_force_match (lexer, T_RPAREN); } } else if (lex_match_id (lexer, "ECONVERGE")) { - if ( lex_force_match (lexer, '(')) + if ( lex_force_match (lexer, T_LPAREN)) { lex_force_num (lexer); factor.econverge = lex_number (lexer); lex_get (lexer); - lex_force_match (lexer, ')'); + lex_force_match (lexer, T_RPAREN); } } else if (lex_match_id (lexer, "RCONVERGE")) { - if ( lex_force_match (lexer, '(')) + if ( lex_force_match (lexer, T_LPAREN)) { lex_force_num (lexer); factor.rconverge = lex_number (lexer); lex_get (lexer); - lex_force_match (lexer, ')'); + lex_force_match (lexer, T_RPAREN); } } else if (lex_match_id (lexer, "ITERATE")) { - if ( lex_force_match (lexer, '(')) + if ( lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); factor.iterations = lex_integer (lexer); lex_get (lexer); - lex_force_match (lexer, ')'); + lex_force_match (lexer, T_RPAREN); } } else if (lex_match_id (lexer, "DEFAULT")) @@ -949,8 +995,8 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) else if (lex_match_id (lexer, "EXTRACTION")) { extraction_seen = true; - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "PAF")) { @@ -977,8 +1023,8 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "FORMAT")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "SORT")) { @@ -986,12 +1032,12 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "BLANK")) { - if ( lex_force_match (lexer, '(')) + if ( lex_force_match (lexer, T_LPAREN)) { lex_force_num (lexer); factor.blank = lex_number (lexer); lex_get (lexer); - lex_force_match (lexer, ')'); + lex_force_match (lexer, T_RPAREN); } } else if (lex_match_id (lexer, "DEFAULT")) @@ -1009,8 +1055,8 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) else if (lex_match_id (lexer, "PRINT")) { factor.print = 0; - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "UNIVARIATE")) { @@ -1053,10 +1099,11 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) { factor.print |= PRINT_INITIAL; } -#if FACTOR_FULLY_IMPLEMENTED else if (lex_match_id (lexer, "KMO")) { + factor.print |= PRINT_KMO; } +#if FACTOR_FULLY_IMPLEMENTED else if (lex_match_id (lexer, "REPR")) { } @@ -1083,8 +1130,8 @@ cmd_factor (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "MISSING")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "INCLUDE")) { @@ -1301,7 +1348,7 @@ show_factor_matrix (const struct cmd_factor *factor, struct idata *idata, const tab_title (t, _("Factor Matrix")); */ - tab_title (t, title); + tab_title (t, "%s", title); tab_headers (t, heading_columns, 0, heading_rows, 0); @@ -1494,7 +1541,7 @@ show_explained_variance (const struct cmd_factor * factor, struct idata *idata, c = 0; - tab_text_format (t, c++, i + heading_rows, TAB_LEFT | TAT_TITLE, _("%d"), i + 1); + tab_text_format (t, c++, i + heading_rows, TAB_LEFT | TAT_TITLE, _("%zu"), i + 1); i_cum += i_percent; e_cum += e_percent; @@ -1631,7 +1678,7 @@ show_correlation_matrix (const struct cmd_factor *factor, const struct idata *id if (factor->print & PRINT_SIG) { const double y = heading_rows + y_pos_sig * factor->n_vars; - tab_text (t, 0, y, TAT_TITLE, _("Sig. 1-tailed")); + tab_text (t, 0, y, TAT_TITLE, _("Sig. (1-tailed)")); for (i = 0; i < factor->n_vars; ++i) { @@ -1651,22 +1698,9 @@ show_correlation_matrix (const struct cmd_factor *factor, const struct idata *id if (factor->print & PRINT_DETERMINANT) { - int sign = 0; - double det = 0.0; - - const int size = idata->corr->size1; - gsl_permutation *p = gsl_permutation_calloc (size); - gsl_matrix *tmp = gsl_matrix_calloc (size, size); - gsl_matrix_memcpy (tmp, idata->corr); - - gsl_linalg_LU_decomp (tmp, p, &sign); - det = gsl_linalg_LU_det (tmp, sign); - gsl_permutation_free (p); - gsl_matrix_free (tmp); - - tab_text (t, 0, nr, TAB_LEFT | TAT_TITLE, _("Determinant")); - tab_double (t, 1, nr, 0, det, NULL); + + tab_double (t, 1, nr, 0, idata->detR, NULL); } tab_submit (t); @@ -1694,6 +1728,12 @@ do_factor (const struct cmd_factor *factor, struct casereader *r) idata->cov = covariance_calculate (cov); + if (idata->cov == NULL) + { + msg (MW, _("The dataset contains no complete observations. No analysis will be performed.")); + goto finish; + } + var_matrix = covariance_moments (cov, MOMENT_VARIANCE); mean_matrix = covariance_moments (cov, MOMENT_MEAN); idata->n = covariance_moments (cov, MOMENT_NONE); @@ -1701,17 +1741,33 @@ do_factor (const struct cmd_factor *factor, struct casereader *r) if ( factor->method == METHOD_CORR) { idata->corr = correlation_from_covariance (idata->cov, var_matrix); + analysis_matrix = idata->corr; } else analysis_matrix = idata->cov; + if (factor->print & PRINT_DETERMINANT + || factor->print & PRINT_KMO) + { + int sign = 0; + + const int size = idata->corr->size1; + gsl_permutation *p = gsl_permutation_calloc (size); + gsl_matrix *tmp = gsl_matrix_calloc (size, size); + gsl_matrix_memcpy (tmp, idata->corr); + + gsl_linalg_LU_decomp (tmp, p, &sign); + idata->detR = gsl_linalg_LU_det (tmp, sign); + gsl_permutation_free (p); + gsl_matrix_free (tmp); + } + if ( factor->print & PRINT_UNIVARIATE) { + const struct fmt_spec *wfmt = factor->wv ? var_get_print_format (factor->wv) : & F_8_0; const int nc = 4; int i; - const struct fmt_spec *wfmt = factor->wv ? var_get_print_format (factor->wv) : & F_8_0; - const int heading_columns = 1; const int heading_rows = 1; @@ -1757,9 +1813,88 @@ do_factor (const struct cmd_factor *factor, struct casereader *r) tab_submit (t); } + if (factor->print & PRINT_KMO) + { + int i; + double sum_ssq_r = 0; + double sum_ssq_a = 0; + + double df = factor->n_vars * ( factor->n_vars - 1) / 2; + + double w = 0; + + + double xsq; + + const int heading_columns = 2; + const int heading_rows = 0; + + const int nr = heading_rows + 4; + const int nc = heading_columns + 1; + + gsl_matrix *a, *x; + + struct tab_table *t = tab_create (nc, nr); + tab_title (t, _("KMO and Bartlett's Test")); + + x = clone_matrix (idata->corr); + gsl_linalg_cholesky_decomp (x); + gsl_linalg_cholesky_invert (x); + + a = anti_image (x); + + for (i = 0; i < x->size1; ++i) + { + sum_ssq_r += ssq_od_n (x, i); + sum_ssq_a += ssq_od_n (a, i); + } + + gsl_matrix_free (a); + gsl_matrix_free (x); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + /* Outline the box */ + tab_box (t, + TAL_2, TAL_2, + -1, -1, + 0, 0, + nc - 1, nr - 1); + + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + + tab_text (t, 0, 0, TAT_TITLE | TAB_LEFT, _("Kaiser-Meyer-Olkin Measure of Sampling Adequacy")); + + tab_double (t, 2, 0, 0, sum_ssq_r / (sum_ssq_r + sum_ssq_a), NULL); + + tab_text (t, 0, 1, TAT_TITLE | TAB_LEFT, _("Bartlett's Test of Sphericity")); + + tab_text (t, 1, 1, TAT_TITLE, _("Approx. Chi-Square")); + tab_text (t, 1, 2, TAT_TITLE, _("df")); + tab_text (t, 1, 3, TAT_TITLE, _("Sig.")); + + + /* The literature doesn't say what to do for the value of W when + missing values are involved. The best thing I can think of + is to take the mean average. */ + w = 0; + for (i = 0; i < idata->n->size1; ++i) + w += gsl_matrix_get (idata->n, i, i); + w /= idata->n->size1; + + xsq = w - 1 - (2 * factor->n_vars + 5) / 6.0; + xsq *= -log (idata->detR); + + tab_double (t, 2, 1, 0, xsq, NULL); + tab_double (t, 2, 2, 0, df, &F_8_0); + tab_double (t, 2, 3, 0, gsl_cdf_chisq_Q (xsq, df), NULL); + + + tab_submit (t); + } + show_correlation_matrix (factor, idata); -#if 1 { gsl_eigen_symmv_workspace *workspace = gsl_eigen_symmv_alloc (factor->n_vars); @@ -1769,7 +1904,6 @@ do_factor (const struct cmd_factor *factor, struct casereader *r) } gsl_eigen_symmv_sort (idata->eval, idata->evec, GSL_EIGEN_SORT_ABS_DESC); -#endif idata->n_extractions = n_extracted_factors (factor, idata);