X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fgraph.c;h=8157bb295887c762f2362005b6e8f4bfcb474dcf;hb=28d3dbe39fce5b5ccdae8c99b4e00746f4aaad22;hp=a0dffbd3d13c8cb2d9d54c6644649724764f13e5;hpb=2d71aca595970d52947adce2a411a3496df0808a;p=pspp diff --git a/src/language/stats/graph.c b/src/language/stats/graph.c index a0dffbd3d1..8157bb2958 100644 --- a/src/language/stats/graph.c +++ b/src/language/stats/graph.c @@ -1,7 +1,7 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2012, 2013, 2015 Free Software Foundation, Inc. - + Copyright (C) 2012, 2013, 2015, 2019 Free Software Foundation, Inc. + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or @@ -11,7 +11,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see . */ @@ -23,6 +23,7 @@ #include #include +#include "gl/xalloc.h" #include #include "libpspp/assertion.h" @@ -48,13 +49,14 @@ #include "math/order-stats.h" #include "output/charts/plot-hist.h" #include "output/charts/scatterplot.h" +#include "output/charts/barchart.h" #include "language/command.h" #include "language/lexer/lexer.h" #include "language/lexer/value-parser.h" #include "language/lexer/variable-parser.h" - -#include "output/tab.h" +#include "language/stats/freq.h" +#include "language/stats/chart-category.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -81,6 +83,15 @@ enum scatter_type ST_XYZ }; +enum bar_type + { + CBT_SIMPLE, + CBT_GROUPED, + CBT_STACKED, + CBT_RANGE + }; + + /* Variable index for histogram case */ enum { @@ -122,14 +133,155 @@ struct graph bool missing_pw; /* ------------ Graph ---------------- */ + bool normal; /* For histograms, draw the normal curve */ + enum chart_type chart_type; enum scatter_type scatter_type; - const struct variable *byvar; + enum bar_type bar_type; + const struct variable *by_var[2]; + size_t n_by_vars; + + struct subcase ordering; /* Ordering for aggregation */ + int agr; /* Index into ag_func */ + /* A caseproto that contains the plot data */ struct caseproto *gr_proto; }; + + +static double +calc_mom1 (double acc, double x, double w) +{ + return acc + x * w; +} + +static double +calc_mom0 (double acc, double x UNUSED, double w) +{ + return acc + w; +} + +static double +pre_low_extreme (void) +{ + return -DBL_MAX; +} + +static double +calc_max (double acc, double x, double w UNUSED) +{ + return (acc > x) ? acc : x; +} + +static double +pre_high_extreme (void) +{ + return DBL_MAX; +} + +static double +calc_min (double acc, double x, double w UNUSED) +{ + return (acc < x) ? acc : x; +} + +static double +post_normalise (double acc, double cc) +{ + return acc / cc; +} + +static double +post_percentage (double acc, double ccc) +{ + return acc / ccc * 100.0; +} + + +const struct ag_func ag_func[] = + { + {"COUNT", N_("Count"), 0, 0, NULL, calc_mom0, 0, 0}, + {"PCT", N_("Percentage"), 0, 0, NULL, calc_mom0, 0, post_percentage}, + {"CUFREQ", N_("Cumulative Count"), 0, 1, NULL, calc_mom0, 0, 0}, + {"CUPCT", N_("Cumulative Percent"), 0, 1, NULL, calc_mom0, 0, + post_percentage}, + + {"MEAN", N_("Mean"), 1, 0, NULL, calc_mom1, post_normalise, 0}, + {"SUM", N_("Sum"), 1, 0, NULL, calc_mom1, 0, 0}, + {"MAXIMUM", N_("Maximum"), 1, 0, pre_low_extreme, calc_max, 0, 0}, + {"MINIMUM", N_("Minimum"), 1, 0, pre_high_extreme, calc_min, 0, 0}, + }; + +const int N_AG_FUNCS = sizeof (ag_func) / sizeof (ag_func[0]); + +static bool +parse_function (struct lexer *lexer, struct graph *graph) +{ + int i; + for (i = 0 ; i < N_AG_FUNCS; ++i) + { + if (lex_match_id (lexer, ag_func[i].name)) + { + graph->agr = i; + break; + } + } + if (i == N_AG_FUNCS) + { + goto error; + } + + graph->n_dep_vars = ag_func[i].arity; + if (ag_func[i].arity > 0) + { + int v; + if (!lex_force_match (lexer, T_LPAREN)) + goto error; + + graph->dep_vars = xcalloc (graph->n_dep_vars, sizeof (graph->dep_vars)); + for (v = 0; v < ag_func[i].arity; ++v) + { + graph->dep_vars[v] = parse_variable (lexer, graph->dict); + if (! graph->dep_vars[v]) + goto error; + } + + if (!lex_force_match (lexer, T_RPAREN)) + goto error; + } + + if (!lex_force_match (lexer, T_BY)) + goto error; + + graph->by_var[0] = parse_variable (lexer, graph->dict); + if (!graph->by_var[0]) + { + goto error; + } + subcase_add_var (&graph->ordering, graph->by_var[0], SC_ASCEND); + graph->n_by_vars++; + + if (lex_match (lexer, T_BY)) + { + graph->by_var[1] = parse_variable (lexer, graph->dict); + if (!graph->by_var[1]) + { + goto error; + } + subcase_add_var (&graph->ordering, graph->by_var[1], SC_ASCEND); + graph->n_by_vars++; + } + + return true; + + error: + lex_error (lexer, NULL); + return false; +} + + static void show_scatterplot (const struct graph *cmd, struct casereader *input) { @@ -139,12 +291,12 @@ show_scatterplot (const struct graph *cmd, struct casereader *input) ds_init_empty (&title); - if (cmd->byvar) + if (cmd->n_by_vars > 0) { ds_put_format (&title, _("%s vs. %s by %s"), var_to_string (cmd->dep_vars[1]), var_to_string (cmd->dep_vars[0]), - var_to_string (cmd->byvar)); + var_to_string (cmd->by_var[0])); } else { @@ -152,12 +304,12 @@ show_scatterplot (const struct graph *cmd, struct casereader *input) var_to_string (cmd->dep_vars[1]), var_to_string (cmd->dep_vars[0])); } - scatterplot = scatterplot_create (input, var_to_string(cmd->dep_vars[0]), var_to_string(cmd->dep_vars[1]), - cmd->byvar, + (cmd->n_by_vars > 0) ? cmd->by_var[0] + : NULL, &byvar_overflow, ds_cstr (&title), cmd->es[0].minimum, cmd->es[0].maximum, @@ -179,6 +331,12 @@ show_histogr (const struct graph *cmd, struct casereader *input) struct histogram *histogram; struct ccase *c; + if (cmd->es[0].cc <= 0) + { + casereader_destroy (input); + return; + } + { /* Sturges Rule */ double bin_width = fabs (cmd->es[0].minimum - cmd->es[0].maximum) @@ -197,8 +355,8 @@ show_histogr (const struct graph *cmd, struct casereader *input) for (;(c = casereader_read (input)) != NULL; case_unref (c)) { - const double x = case_data_idx (c, HG_IDX_X)->f; - const double weight = case_data_idx (c, HG_IDX_WT)->f; + const double x = case_num_idx (c, HG_IDX_X); + const double weight = case_num_idx (c, HG_IDX_WT); moments_pass_two (cmd->es[0].mom, x, weight); histogram_add (histogram, x, weight); } @@ -210,24 +368,24 @@ show_histogr (const struct graph *cmd, struct casereader *input) struct string label; - ds_init_cstr (&label, + ds_init_cstr (&label, var_to_string (cmd->dep_vars[0])); moments_calculate (cmd->es[0].mom, &n, &mean, &var, NULL, NULL); - chart_item_submit - ( histogram_chart_create (histogram->gsl_hist, + chart_submit + (histogram_chart_create (histogram->gsl_hist, ds_cstr (&label), n, mean, - sqrt (var), false)); + sqrt (var), cmd->normal)); - statistic_destroy (&histogram->parent); + statistic_destroy (&histogram->parent); ds_destroy (&label); } } static void cleanup_exploratory_stats (struct graph *cmd) -{ +{ int v; for (v = 0; v < cmd->n_dep_vars; ++v) @@ -237,6 +395,180 @@ cleanup_exploratory_stats (struct graph *cmd) } +static void +run_barchart (struct graph *cmd, struct casereader *input) +{ + struct casegrouper *grouper; + struct casereader *group; + double ccc = 0.0; + + if (cmd->missing_pw == false) + input = casereader_create_filter_missing (input, + cmd->dep_vars, + cmd->n_dep_vars, + cmd->dep_excl, + NULL, + NULL); + + + input = sort_execute (input, &cmd->ordering); + + struct freq **cells = NULL; + int n_cells = 0; + + struct hmap columns = HMAP_INITIALIZER (columns); + assert (cmd->n_by_vars <= 2); + for (grouper = casegrouper_create_vars (input, cmd->by_var, + cmd->n_by_vars); + casegrouper_get_next_group (grouper, &group); + casereader_destroy (group)) + { + int v; + struct ccase *c = casereader_peek (group, 0); + + /* Deal with missing values in the categorical variables */ + for (v = 0; v < cmd->n_by_vars; ++v) + { + if (var_is_value_missing (cmd->by_var[v], + case_data (c, cmd->by_var[v])) + & cmd->fctr_excl) + break; + } + + if (v < cmd->n_by_vars) + { + case_unref (c); + continue; + } + + cells = xrealloc (cells, sizeof (*cells) * ++n_cells); + cells[n_cells - 1] = xzalloc (sizeof (**cells) + + sizeof (union value) + * (cmd->n_by_vars - 1)); + + if (ag_func[cmd->agr].cumulative && n_cells >= 2) + cells[n_cells - 1]->count = cells[n_cells - 2]->count; + else + cells[n_cells - 1]->count = 0; + if (ag_func[cmd->agr].pre) + cells[n_cells - 1]->count = ag_func[cmd->agr].pre(); + + if (cmd->n_by_vars > 1) + { + const union value *vv = case_data (c, cmd->by_var[1]); + const double weight = dict_get_case_weight (cmd->dict, c, NULL); + int v1_width = var_get_width (cmd->by_var[1]); + size_t hash = value_hash (vv, v1_width, 0); + + struct freq *fcol = NULL; + HMAP_FOR_EACH_WITH_HASH (fcol, struct freq, node, hash, &columns) + if (value_equal (vv, &fcol->values[0], v1_width)) + break; + + if (fcol) + fcol->count += weight; + else + { + fcol = xzalloc (sizeof *fcol); + fcol->count = weight; + value_clone (&fcol->values[0], vv, v1_width); + hmap_insert (&columns, &fcol->node, hash); + } + } + + for (v = 0; v < cmd->n_by_vars; ++v) + { + value_clone (&cells[n_cells - 1]->values[v], + case_data (c, cmd->by_var[v]), + var_get_width (cmd->by_var[v])); + } + case_unref (c); + + double cc = 0; + for (;(c = casereader_read (group)) != NULL; case_unref (c)) + { + const double weight = dict_get_case_weight (cmd->dict,c,NULL); + const double x = (cmd->n_dep_vars > 0) + ? case_num (c, cmd->dep_vars[0]) : SYSMIS; + + cc += weight; + + cells[n_cells - 1]->count + = ag_func[cmd->agr].calc (cells[n_cells - 1]->count, x, weight); + } + + if (ag_func[cmd->agr].post) + cells[n_cells - 1]->count + = ag_func[cmd->agr].post (cells[n_cells - 1]->count, cc); + + ccc += cc; + } + + casegrouper_destroy (grouper); + + for (int i = 0; i < n_cells; ++i) + { + if (ag_func[cmd->agr].ppost) + { + struct freq *cell = cells[i]; + if (cmd->n_by_vars > 1) + { + const union value *vv = &cell->values[1]; + + int v1_width = var_get_width (cmd->by_var[1]); + size_t hash = value_hash (vv, v1_width, 0); + + struct freq *fcol = NULL; + HMAP_FOR_EACH_WITH_HASH (fcol, struct freq, node, hash, &columns) + if (value_equal (vv, &fcol->values[0], v1_width)) + break; + + cell->count = ag_func[cmd->agr].ppost (cell->count, fcol->count); + } + else + cell->count = ag_func[cmd->agr].ppost (cell->count, ccc); + } + } + + if (cmd->n_by_vars > 1) + { + struct freq *col_cell; + struct freq *next; + HMAP_FOR_EACH_SAFE (col_cell, next, struct freq, node, &columns) + { + + value_destroy (col_cell->values, var_get_width (cmd->by_var[1])); + free (col_cell); + } + } + hmap_destroy (&columns); + + { + struct string label; + ds_init_empty (&label); + + if (cmd->n_dep_vars > 0) + ds_put_format (&label, _("%s of %s"), + ag_func[cmd->agr].description, + var_get_name (cmd->dep_vars[0])); + else + ds_put_cstr (&label, + ag_func[cmd->agr].description); + + chart_submit (barchart_create (cmd->by_var, cmd->n_by_vars, + ds_cstr (&label), false, + cells, n_cells)); + + ds_destroy (&label); + } + + for (int i = 0; i < n_cells; ++i) + free (cells[i]); + + free (cells); +} + + static void run_graph (struct graph *cmd, struct casereader *input) { @@ -244,7 +576,7 @@ run_graph (struct graph *cmd, struct casereader *input) struct casereader *reader; struct casewriter *writer; - cmd->es = pool_calloc (cmd->pool,cmd->n_dep_vars,sizeof(struct exploratory_stats)); + cmd->es = pool_calloc (cmd->pool,cmd->n_dep_vars, sizeof *cmd->es); for(int v=0;vn_dep_vars;v++) { cmd->es[v].mom = moments_create (MOMENT_KURTOSIS); @@ -255,7 +587,7 @@ run_graph (struct graph *cmd, struct casereader *input) /* Always remove cases listwise. This is correct for */ /* the histogram because there is only one variable */ /* and a simple bivariate scatterplot */ - /* if ( cmd->missing_pw == false) */ + /* if (cmd->missing_pw == false) */ input = casereader_create_filter_missing (input, cmd->dep_vars, cmd->n_dep_vars, @@ -277,23 +609,23 @@ run_graph (struct graph *cmd, struct casereader *input) struct ccase *outcase = case_create (cmd->gr_proto); const double weight = dict_get_case_weight (cmd->dict,c,NULL); if (cmd->chart_type == CT_HISTOGRAM) - case_data_rw_idx (outcase, HG_IDX_WT)->f = weight; - if (cmd->chart_type == CT_SCATTERPLOT && cmd->byvar) + *case_num_rw_idx (outcase, HG_IDX_WT) = weight; + if (cmd->chart_type == CT_SCATTERPLOT && cmd->n_by_vars > 0) value_copy (case_data_rw_idx (outcase, SP_IDX_BY), - case_data (c, cmd->byvar), - var_get_width (cmd->byvar)); + case_data (c, cmd->by_var[0]), + var_get_width (cmd->by_var[0])); for(int v=0;vn_dep_vars;v++) { const struct variable *var = cmd->dep_vars[v]; - const double x = case_data (c, var)->f; + const double x = case_num (c, var); - if (var_is_value_missing (var, case_data (c, var), cmd->dep_excl)) + if (var_is_value_missing (var, case_data (c, var)) & cmd->dep_excl) { cmd->es[v].missing += weight; continue; } /* Magically v value fits to SP_IDX_X, SP_IDX_Y, HG_IDX_X */ - case_data_rw_idx (outcase, v)->f = x; + *case_num_rw_idx (outcase, v) = x; if (x > cmd->es[v].maximum) cmd->es[v].maximum = x; @@ -339,22 +671,22 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) struct graph graph; graph.missing_pw = false; - + graph.pool = pool_create (); graph.dep_excl = MV_ANY; graph.fctr_excl = MV_ANY; - + graph.dict = dataset_dict (ds); - - /* ---------------- graph ------------------ */ graph.dep_vars = NULL; graph.chart_type = CT_NONE; graph.scatter_type = ST_BIVARIATE; - graph.byvar = NULL; + graph.n_by_vars = 0; graph.gr_proto = caseproto_create (); + subcase_init_empty (&graph.ordering); + while (lex_token (lexer) != T_ENDCMD) { lex_match (lexer, T_SLASH); @@ -366,6 +698,17 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) lex_error (lexer, _("Only one chart type is allowed.")); goto error; } + graph.normal = false; + if (lex_match (lexer, T_LPAREN)) + { + if (!lex_force_match_id (lexer, "NORMAL")) + goto error; + + if (!lex_force_match (lexer, T_RPAREN)) + goto error; + + graph.normal = true; + } if (!lex_force_match (lexer, T_EQUALS)) goto error; graph.chart_type = CT_HISTOGRAM; @@ -379,6 +722,54 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) goto error; } } + else if (lex_match_id (lexer, "BAR")) + { + if (graph.chart_type != CT_NONE) + { + lex_error (lexer, _("Only one chart type is allowed.")); + goto error; + } + graph.chart_type = CT_BAR; + graph.bar_type = CBT_SIMPLE; + + if (lex_match (lexer, T_LPAREN)) + { + if (lex_match_id (lexer, "SIMPLE")) + { + /* This is the default anyway */ + } + else if (lex_match_id (lexer, "GROUPED")) + { + graph.bar_type = CBT_GROUPED; + goto error; + } + else if (lex_match_id (lexer, "STACKED")) + { + graph.bar_type = CBT_STACKED; + lex_error (lexer, _("%s is not yet implemented."), "STACKED"); + goto error; + } + else if (lex_match_id (lexer, "RANGE")) + { + graph.bar_type = CBT_RANGE; + lex_error (lexer, _("%s is not yet implemented."), "RANGE"); + goto error; + } + else + { + lex_error (lexer, NULL); + goto error; + } + if (!lex_force_match (lexer, T_RPAREN)) + goto error; + } + + if (!lex_force_match (lexer, T_EQUALS)) + goto error; + + if (! parse_function (lexer, &graph)) + goto error; + } else if (lex_match_id (lexer, "SCATTERPLOT")) { if (graph.chart_type != CT_NONE) @@ -387,30 +778,30 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) goto error; } graph.chart_type = CT_SCATTERPLOT; - if (lex_match (lexer, T_LPAREN)) + if (lex_match (lexer, T_LPAREN)) { if (lex_match_id (lexer, "BIVARIATE")) { /* This is the default anyway */ } - else if (lex_match_id (lexer, "OVERLAY")) + else if (lex_match_id (lexer, "OVERLAY")) { lex_error (lexer, _("%s is not yet implemented."),"OVERLAY"); goto error; } - else if (lex_match_id (lexer, "MATRIX")) + else if (lex_match_id (lexer, "MATRIX")) { lex_error (lexer, _("%s is not yet implemented."),"MATRIX"); goto error; } - else if (lex_match_id (lexer, "XYZ")) + else if (lex_match_id (lexer, "XYZ")) { lex_error(lexer, _("%s is not yet implemented."),"XYZ"); goto error; } else { - lex_error_expecting (lexer, "BIVARIATE", NULL); + lex_error_expecting (lexer, "BIVARIATE"); goto error; } if (!lex_force_match (lexer, T_RPAREN)) @@ -423,7 +814,7 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) &graph.dep_vars, &graph.n_dep_vars, PV_NO_DUPLICATE | PV_NUMERIC)) goto error; - + if (graph.scatter_type == ST_BIVARIATE && graph.n_dep_vars != 1) { lex_error(lexer, _("Only one variable is allowed.")); @@ -443,7 +834,7 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) lex_error (lexer, _("Only one variable is allowed.")); goto error; } - + if (lex_match (lexer, T_BY)) { const struct variable *v = NULL; @@ -452,14 +843,10 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) lex_error (lexer, _("Variable expected")); goto error; } - graph.byvar = v; + graph.by_var[0] = v; + graph.n_by_vars = 1; } } - else if (lex_match_id (lexer, "BAR")) - { - lex_error (lexer, _("%s is not yet implemented."),"BAR"); - goto error; - } else if (lex_match_id (lexer, "LINE")) { lex_error (lexer, _("%s is not yet implemented."),"LINE"); @@ -521,7 +908,7 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "REPORT")) { - graph.fctr_excl = MV_NEVER; + graph.fctr_excl = 0; } else if (lex_match_id (lexer, "NOREPORT")) { @@ -545,18 +932,28 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) { case CT_SCATTERPLOT: /* See scatterplot.h for the setup of the case prototype */ - graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* x value - SP_IDX_X*/ - graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* y value - SP_IDX_Y*/ - /* The byvar contains the plot categories for the different xy plot colors */ - if (graph.byvar) /* SP_IDX_BY */ - graph.gr_proto = caseproto_add_width (graph.gr_proto, var_get_width(graph.byvar)); + + /* x value - SP_IDX_X*/ + graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); + + /* y value - SP_IDX_Y*/ + graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); + /* The by_var contains the plot categories for the different xy + plot colors */ + if (graph.n_by_vars > 0) /* SP_IDX_BY */ + graph.gr_proto = caseproto_add_width (graph.gr_proto, + var_get_width(graph.by_var[0])); break; case CT_HISTOGRAM: - graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* x value */ - graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* weight value */ + /* x value */ + graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); + /* weight value */ + graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); + break; + case CT_BAR: break; case CT_NONE: - lex_error_expecting (lexer,"HISTOGRAM","SCATTERPLOT",NULL); + lex_error_expecting (lexer, "HISTOGRAM", "SCATTERPLOT", "BAR"); goto error; default: NOT_REACHED (); @@ -567,14 +964,20 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) struct casegrouper *grouper; struct casereader *group; bool ok; - + grouper = casegrouper_create_splits (proc_open (ds), graph.dict); while (casegrouper_get_next_group (grouper, &group)) - run_graph (&graph, group); + { + if (graph.chart_type == CT_BAR) + run_barchart (&graph, group); + else + run_graph (&graph, group); + } ok = casegrouper_destroy (grouper); ok = proc_commit (ds) && ok; } + subcase_uninit (&graph.ordering); free (graph.dep_vars); pool_destroy (graph.pool); caseproto_unref (graph.gr_proto); @@ -582,6 +985,7 @@ cmd_graph (struct lexer *lexer, struct dataset *ds) return CMD_SUCCESS; error: + subcase_uninit (&graph.ordering); caseproto_unref (graph.gr_proto); free (graph.dep_vars); pool_destroy (graph.pool);