2 PSPP - a program for statistical analysis.
3 Copyright (C) 2012, 2013, 2015, 2019 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 * This module implements the graph command
26 #include "gl/xalloc.h"
27 #include <gsl/gsl_cdf.h>
29 #include "libpspp/assertion.h"
30 #include "libpspp/message.h"
31 #include "libpspp/pool.h"
34 #include "data/dataset.h"
35 #include "data/dictionary.h"
36 #include "data/casegrouper.h"
37 #include "data/casereader.h"
38 #include "data/casewriter.h"
39 #include "data/caseproto.h"
40 #include "data/subcase.h"
43 #include "data/format.h"
45 #include "math/chart-geometry.h"
46 #include "math/histogram.h"
47 #include "math/moments.h"
48 #include "math/sort.h"
49 #include "math/order-stats.h"
50 #include "output/charts/plot-hist.h"
51 #include "output/charts/scatterplot.h"
52 #include "output/charts/barchart.h"
54 #include "language/command.h"
55 #include "language/lexer/lexer.h"
56 #include "language/lexer/value-parser.h"
57 #include "language/lexer/variable-parser.h"
58 #include "language/stats/freq.h"
59 #include "language/stats/chart-category.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) msgid
95 /* Variable index for histogram case */
102 struct exploratory_stats
115 /* The minimum weight */
125 const struct variable **dep_vars;
126 struct exploratory_stats *es;
128 enum mv_class dep_excl;
129 enum mv_class fctr_excl;
131 const struct dictionary *dict;
135 /* ------------ Graph ---------------- */
136 bool normal; /* For histograms, draw the normal curve */
138 enum chart_type chart_type;
139 enum scatter_type scatter_type;
140 enum bar_type bar_type;
141 const struct variable *by_var[2];
144 struct subcase ordering; /* Ordering for aggregation */
145 int agr; /* Index into ag_func */
147 /* A caseproto that contains the plot data */
148 struct caseproto *gr_proto;
155 calc_mom1 (double acc, double x, double w)
161 calc_mom0 (double acc, double x UNUSED, double w)
167 pre_low_extreme (void)
173 calc_max (double acc, double x, double w UNUSED)
175 return (acc > x) ? acc : x;
179 pre_high_extreme (void)
185 calc_min (double acc, double x, double w UNUSED)
187 return (acc < x) ? acc : x;
191 post_normalise (double acc, double cc)
197 post_percentage (double acc, double ccc)
199 return acc / ccc * 100.0;
202 const struct ag_func ag_func[] =
204 {"COUNT", N_("Count"), 0, 0, NULL, calc_mom0, 0, 0},
205 {"PCT", N_("Percentage"), 0, 0, NULL, calc_mom0, 0, post_percentage},
206 {"CUFREQ", N_("Cumulative Count"), 0, 1, NULL, calc_mom0, 0, 0},
207 {"CUPCT", N_("Cumulative Percent"), 0, 1, NULL, calc_mom0, 0,
210 {"MEAN", N_("Mean"), 1, 0, NULL, calc_mom1, post_normalise, 0},
211 {"SUM", N_("Sum"), 1, 0, NULL, calc_mom1, 0, 0},
212 {"MAXIMUM", N_("Maximum"), 1, 0, pre_low_extreme, calc_max, 0, 0},
213 {"MINIMUM", N_("Minimum"), 1, 0, pre_high_extreme, calc_min, 0, 0},
216 const int N_AG_FUNCS = sizeof (ag_func) / sizeof (ag_func[0]);
219 parse_function_name (struct lexer *lexer, int *agr)
221 for (size_t i = 0; i < N_AG_FUNCS; ++i)
223 if (lex_match_id (lexer, ag_func[i].name))
230 const char *ag_func_names[N_AG_FUNCS];
231 for (size_t i = 0; i < N_AG_FUNCS; ++i)
232 ag_func_names[i] = ag_func[i].name;
233 lex_error_expecting_array (lexer, ag_func_names, N_AG_FUNCS);
238 parse_function (struct lexer *lexer, struct graph *graph)
240 if (!parse_function_name (lexer, &graph->agr))
243 size_t arity = ag_func[graph->agr].arity;
244 graph->n_dep_vars = arity;
247 if (!lex_force_match (lexer, T_LPAREN))
250 graph->dep_vars = xcalloc (graph->n_dep_vars, sizeof (graph->dep_vars));
251 for (int v = 0; v < arity; ++v)
253 graph->dep_vars[v] = parse_variable (lexer, graph->dict);
254 if (!graph->dep_vars[v])
258 if (!lex_force_match (lexer, T_RPAREN))
262 if (!lex_force_match (lexer, T_BY))
265 graph->by_var[0] = parse_variable (lexer, graph->dict);
266 if (!graph->by_var[0])
268 subcase_add_var (&graph->ordering, graph->by_var[0], SC_ASCEND);
271 if (lex_match (lexer, T_BY))
273 graph->by_var[1] = parse_variable (lexer, graph->dict);
274 if (!graph->by_var[1])
276 subcase_add_var (&graph->ordering, graph->by_var[1], SC_ASCEND);
284 show_scatterplot (const struct graph *cmd, struct casereader *input)
286 struct scatterplot_chart *scatterplot;
287 bool byvar_overflow = false;
289 char *title = (cmd->n_by_vars > 0
290 ? xasprintf (_("%s vs. %s by %s"),
291 var_to_string (cmd->dep_vars[1]),
292 var_to_string (cmd->dep_vars[0]),
293 var_to_string (cmd->by_var[0]))
294 : xasprintf (_("%s vs. %s"),
295 var_to_string (cmd->dep_vars[1]),
296 var_to_string (cmd->dep_vars[0])));;
298 scatterplot = scatterplot_create (input,
299 var_to_string(cmd->dep_vars[0]),
300 var_to_string(cmd->dep_vars[1]),
301 (cmd->n_by_vars > 0) ? cmd->by_var[0]
305 cmd->es[0].minimum, cmd->es[0].maximum,
306 cmd->es[1].minimum, cmd->es[1].maximum);
307 scatterplot_chart_submit (scatterplot);
311 msg (MW, _("Maximum number of scatterplot categories reached. "
312 "Your BY variable has too many distinct values. "
313 "The coloring of the plot will not be correct."));
317 show_histogr (const struct graph *cmd, struct casereader *input)
319 struct histogram *histogram;
321 if (cmd->es[0].cc <= 0)
323 casereader_destroy (input);
328 double bin_width = fabs (cmd->es[0].minimum - cmd->es[0].maximum)
329 / (1 + log2 (cmd->es[0].cc));
330 histogram = histogram_create (bin_width,
331 cmd->es[0].minimum, cmd->es[0].maximum);
334 casereader_destroy (input);
339 for (; (c = casereader_read (input)) != NULL; case_unref (c))
341 const double x = case_num_idx (c, HG_IDX_X);
342 const double weight = case_num_idx (c, HG_IDX_WT);
343 moments_pass_two (cmd->es[0].mom, x, weight);
344 histogram_add (histogram, x, weight);
346 casereader_destroy (input);
348 const char *label = var_to_string (cmd->dep_vars[0]);
350 moments_calculate (cmd->es[0].mom, &n, &mean, &var, NULL, NULL);
351 chart_submit (histogram_chart_create (histogram->gsl_hist, label, n, mean,
352 sqrt (var), cmd->normal));
354 statistic_destroy (&histogram->parent);
358 cleanup_exploratory_stats (struct graph *cmd)
360 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
361 moments_destroy (cmd->es[v].mom);
365 any_categorical_missing (const struct graph *cmd, const struct ccase *c)
367 for (size_t v = 0; v < cmd->n_by_vars; ++v)
368 if (var_is_value_missing (cmd->by_var[v], case_data (c, cmd->by_var[v]))
375 find_fcol (struct hmap *columns, const union value *value, size_t hash,
379 HMAP_FOR_EACH_WITH_HASH (fcol, struct freq, node, hash, columns)
380 if (value_equal (value, &fcol->values[0], width))
386 run_barchart (struct graph *cmd, struct casereader *input)
390 if (cmd->missing_pw == false)
391 input = casereader_create_filter_missing (input,
399 input = sort_execute (input, &cmd->ordering);
401 struct freq **cells = NULL;
403 size_t allocated_cells = 0;
405 struct hmap columns = HMAP_INITIALIZER (columns);
406 assert (cmd->n_by_vars <= 2);
407 struct casegrouper *grouper = casegrouper_create_vars (input, cmd->by_var,
409 struct casereader *group;
410 for (; casegrouper_get_next_group (grouper, &group);
411 casereader_destroy (group))
413 struct ccase *c = casereader_peek (group, 0);
414 if (any_categorical_missing (cmd, c))
420 if (n_cells >= allocated_cells)
421 cells = x2nrealloc (cells, &allocated_cells, sizeof *cells);
422 cells[n_cells++] = xzalloc (table_entry_size (cmd->n_by_vars));
424 if (ag_func[cmd->agr].cumulative && n_cells >= 2)
425 cells[n_cells - 1]->count = cells[n_cells - 2]->count;
427 cells[n_cells - 1]->count = 0;
428 if (ag_func[cmd->agr].pre)
429 cells[n_cells - 1]->count = ag_func[cmd->agr].pre();
431 if (cmd->n_by_vars > 1)
433 const union value *vv = case_data (c, cmd->by_var[1]);
434 const double weight = dict_get_case_weight (cmd->dict, c, NULL);
435 int v1_width = var_get_width (cmd->by_var[1]);
436 size_t hash = value_hash (vv, v1_width, 0);
438 struct freq *fcol = find_fcol (&columns, vv, hash, v1_width);
441 fcol = xzalloc (sizeof *fcol);
442 value_clone (&fcol->values[0], vv, v1_width);
443 hmap_insert (&columns, &fcol->node, hash);
445 fcol->count += weight;
448 for (size_t v = 0; v < cmd->n_by_vars; ++v)
449 value_clone (&cells[n_cells - 1]->values[v],
450 case_data (c, cmd->by_var[v]),
451 var_get_width (cmd->by_var[v]));
455 for (; (c = casereader_read (group)) != NULL; case_unref (c))
457 const double weight = dict_get_case_weight (cmd->dict, c, NULL);
458 const double x = (cmd->n_dep_vars > 0
459 ? case_num (c, cmd->dep_vars[0]) : SYSMIS);
462 cells[n_cells - 1]->count
463 = ag_func[cmd->agr].calc (cells[n_cells - 1]->count, x, weight);
466 if (ag_func[cmd->agr].post)
467 cells[n_cells - 1]->count
468 = ag_func[cmd->agr].post (cells[n_cells - 1]->count, cc);
473 casegrouper_destroy (grouper);
475 for (int i = 0; i < n_cells; ++i)
477 if (ag_func[cmd->agr].ppost)
479 struct freq *cell = cells[i];
480 if (cmd->n_by_vars > 1)
482 const union value *vv = &cell->values[1];
484 int v1_width = var_get_width (cmd->by_var[1]);
485 size_t hash = value_hash (vv, v1_width, 0);
487 struct freq *fcol = find_fcol (&columns, vv, hash, v1_width);
488 cell->count = ag_func[cmd->agr].ppost (cell->count, fcol->count);
491 cell->count = ag_func[cmd->agr].ppost (cell->count, ccc);
495 if (cmd->n_by_vars > 1)
497 struct freq *cell, *next;
498 HMAP_FOR_EACH_SAFE (cell, next, struct freq, node, &columns)
500 value_destroy (cell->values, var_get_width (cmd->by_var[1]));
504 hmap_destroy (&columns);
506 char *label = (cmd->n_dep_vars > 0
507 ? xasprintf (_("%s of %s"),
508 ag_func[cmd->agr].description,
509 var_get_name (cmd->dep_vars[0]))
510 : xstrdup (ag_func[cmd->agr].description));
511 chart_submit (barchart_create (cmd->by_var, cmd->n_by_vars, label, false,
515 for (int i = 0; i < n_cells; ++i)
522 run_graph (struct graph *cmd, struct casereader *input)
524 cmd->es = pool_nmalloc (cmd->pool, cmd->n_dep_vars, sizeof *cmd->es);
525 for (int v = 0; v < cmd->n_dep_vars; v++)
526 cmd->es[v] = (struct exploratory_stats) {
527 .mom = moments_create (MOMENT_KURTOSIS),
533 /* Always remove cases listwise. This is correct for the histogram because
534 there is only one variable and a simple bivariate scatterplot. */
535 input = casereader_create_filter_missing (input,
542 struct casewriter *writer = autopaging_writer_create (cmd->gr_proto);
544 /* The case data is copied to a new writer.
545 The setup of the case depends on the chart type.
548 - x is assumed in dep_vars[0].
549 - y is assumed in dep_vars[1].
552 - x is assumed in dep_vars[0]. */
553 assert (SP_IDX_X == 0 && SP_IDX_Y == 1 && HG_IDX_X == 0);
556 for (; (c = casereader_read (input)) != NULL; case_unref (c))
558 struct ccase *outcase = case_create (cmd->gr_proto);
559 const double weight = dict_get_case_weight (cmd->dict, c, NULL);
560 if (cmd->chart_type == CT_HISTOGRAM)
561 *case_num_rw_idx (outcase, HG_IDX_WT) = weight;
562 if (cmd->chart_type == CT_SCATTERPLOT && cmd->n_by_vars > 0)
563 value_copy (case_data_rw_idx (outcase, SP_IDX_BY),
564 case_data (c, cmd->by_var[0]),
565 var_get_width (cmd->by_var[0]));
566 for (int v = 0; v < cmd->n_dep_vars; v++)
568 const struct variable *var = cmd->dep_vars[v];
569 const double x = case_num (c, var);
571 if (var_is_value_missing (var, case_data (c, var)) & cmd->dep_excl)
573 cmd->es[v].missing += weight;
577 /* Magically v value fits to SP_IDX_X, SP_IDX_Y, HG_IDX_X. */
578 *case_num_rw_idx (outcase, v) = x;
580 if (x > cmd->es[v].maximum)
581 cmd->es[v].maximum = x;
583 if (x < cmd->es[v].minimum)
584 cmd->es[v].minimum = x;
586 cmd->es[v].non_missing += weight;
588 moments_pass_one (cmd->es[v].mom, x, weight);
590 cmd->es[v].cc += weight;
592 if (cmd->es[v].cmin > weight)
593 cmd->es[v].cmin = weight;
595 casewriter_write (writer, outcase);
598 struct casereader *reader = casewriter_make_reader (writer);
599 switch (cmd->chart_type)
602 show_histogr (cmd,reader);
606 show_scatterplot (cmd,reader);
619 casereader_destroy (input);
620 cleanup_exploratory_stats (cmd);
624 cmd_graph (struct lexer *lexer, struct dataset *ds)
626 struct graph graph = {
629 .pool = pool_create (),
634 .dict = dataset_dict (ds),
636 .chart_type = CT_NONE,
637 .scatter_type = ST_BIVARIATE,
638 .gr_proto = caseproto_create (),
639 .ordering = SUBCASE_EMPTY_INITIALIZER,
642 while (lex_token (lexer) != T_ENDCMD)
644 lex_match (lexer, T_SLASH);
646 if (lex_match_id (lexer, "HISTOGRAM"))
648 if (graph.chart_type != CT_NONE)
650 lex_next_error (lexer, -1, -1,
651 _("Only one chart type is allowed."));
654 graph.normal = false;
655 if (lex_match (lexer, T_LPAREN))
657 if (!lex_force_match_phrase (lexer, "NORMAL)"))
662 if (!lex_force_match (lexer, T_EQUALS))
664 graph.chart_type = CT_HISTOGRAM;
665 int vars_start = lex_ofs (lexer);
666 if (!parse_variables_const (lexer, graph.dict,
667 &graph.dep_vars, &graph.n_dep_vars,
668 PV_NO_DUPLICATE | PV_NUMERIC))
670 if (graph.n_dep_vars > 1)
672 lex_ofs_error (lexer, vars_start, lex_ofs (lexer) - 1,
673 _("Only one variable is allowed."));
677 else if (lex_match_id (lexer, "BAR"))
679 if (graph.chart_type != CT_NONE)
681 lex_next_error (lexer, -1, -1,
682 _("Only one chart type is allowed."));
685 graph.chart_type = CT_BAR;
686 graph.bar_type = CBT_SIMPLE;
688 if (lex_match (lexer, T_LPAREN))
690 if (lex_match_id (lexer, "SIMPLE"))
692 /* This is the default anyway */
694 else if (lex_match_id (lexer, "GROUPED"))
696 graph.bar_type = CBT_GROUPED;
697 lex_next_error (lexer, -1, -1,
698 _("%s is not yet implemented."), "GROUPED");
701 else if (lex_match_id (lexer, "STACKED"))
703 graph.bar_type = CBT_STACKED;
704 lex_next_error (lexer, -1, -1,
705 _("%s is not yet implemented."), "STACKED");
708 else if (lex_match_id (lexer, "RANGE"))
710 graph.bar_type = CBT_RANGE;
711 lex_next_error (lexer, -1, -1,
712 _("%s is not yet implemented."), "RANGE");
717 lex_error_expecting (lexer, "SIMPLE", "GROUPED",
721 if (!lex_force_match (lexer, T_RPAREN))
725 if (!lex_force_match (lexer, T_EQUALS))
728 if (!parse_function (lexer, &graph))
731 else if (lex_match_id (lexer, "SCATTERPLOT"))
733 if (graph.chart_type != CT_NONE)
735 lex_next_error (lexer, -1, -1,
736 _("Only one chart type is allowed."));
739 graph.chart_type = CT_SCATTERPLOT;
740 if (lex_match (lexer, T_LPAREN))
742 if (lex_match_id (lexer, "BIVARIATE"))
744 /* This is the default anyway */
746 else if (lex_match_id (lexer, "OVERLAY"))
748 lex_next_error (lexer, -1, -1,
749 _("%s is not yet implemented."),"OVERLAY");
752 else if (lex_match_id (lexer, "MATRIX"))
754 lex_next_error (lexer, -1, -1,
755 _("%s is not yet implemented."),"MATRIX");
758 else if (lex_match_id (lexer, "XYZ"))
760 lex_next_error (lexer, -1, -1,
761 _("%s is not yet implemented."),"XYZ");
766 lex_error_expecting (lexer, "BIVARIATE", "OVERLAY",
770 if (!lex_force_match (lexer, T_RPAREN))
773 if (!lex_force_match (lexer, T_EQUALS))
776 int vars_start = lex_ofs (lexer);
777 if (!parse_variables_const (lexer, graph.dict,
778 &graph.dep_vars, &graph.n_dep_vars,
779 PV_NO_DUPLICATE | PV_NUMERIC))
782 if (graph.scatter_type == ST_BIVARIATE && graph.n_dep_vars != 1)
784 lex_ofs_error (lexer, vars_start, lex_ofs (lexer) - 1,
785 _("Only one variable is allowed."));
789 if (!lex_force_match (lexer, T_WITH))
792 vars_start = lex_ofs (lexer);
793 if (!parse_variables_const (lexer, graph.dict,
794 &graph.dep_vars, &graph.n_dep_vars,
795 PV_NO_DUPLICATE | PV_NUMERIC | PV_APPEND))
798 if (graph.scatter_type == ST_BIVARIATE && graph.n_dep_vars != 2)
800 lex_ofs_error (lexer, vars_start, lex_ofs (lexer) - 1,
801 _("Only one variable is allowed."));
805 if (lex_match (lexer, T_BY))
807 const struct variable *v = NULL;
808 if (!lex_match_variable (lexer,graph.dict,&v))
810 lex_error (lexer, _("Syntax error expecting variable name."));
817 else if (lex_match_id (lexer, "LINE"))
819 lex_next_error (lexer, -1, -1,
820 _("%s is not yet implemented."),"LINE");
823 else if (lex_match_id (lexer, "PIE"))
825 lex_next_error (lexer, -1, -1,
826 _("%s is not yet implemented."),"PIE");
829 else if (lex_match_id (lexer, "ERRORBAR"))
831 lex_next_error (lexer, -1, -1,
832 _("%s is not yet implemented."),"ERRORBAR");
835 else if (lex_match_id (lexer, "PARETO"))
837 lex_next_error (lexer, -1, -1,
838 _("%s is not yet implemented."),"PARETO");
841 else if (lex_match_id (lexer, "TITLE"))
843 lex_next_error (lexer, -1, -1,
844 _("%s is not yet implemented."),"TITLE");
847 else if (lex_match_id (lexer, "SUBTITLE"))
849 lex_next_error (lexer, -1, -1,
850 _("%s is not yet implemented."),"SUBTITLE");
853 else if (lex_match_id (lexer, "FOOTNOTE"))
855 lex_next_error (lexer, -1, -1,
856 _("%s is not yet implemented."),"FOOTNOTE");
859 else if (lex_match_id (lexer, "MISSING"))
861 lex_match (lexer, T_EQUALS);
863 while (lex_token (lexer) != T_ENDCMD
864 && lex_token (lexer) != T_SLASH)
866 if (lex_match_id (lexer, "LISTWISE"))
867 graph.missing_pw = false;
868 else if (lex_match_id (lexer, "VARIABLE"))
869 graph.missing_pw = true;
870 else if (lex_match_id (lexer, "EXCLUDE"))
871 graph.dep_excl = MV_ANY;
872 else if (lex_match_id (lexer, "INCLUDE"))
873 graph.dep_excl = MV_SYSTEM;
874 else if (lex_match_id (lexer, "REPORT"))
876 else if (lex_match_id (lexer, "NOREPORT"))
877 graph.fctr_excl = MV_ANY;
880 lex_error_expecting (lexer, "LISTWISE", "VARIABLE",
881 "EXCLUDE", "INCLUDE",
882 "REPORT", "NOREPORT");
889 lex_error_expecting (lexer, "HISTOGRAM", "BAR", "SCATTERPLOT", "LINE",
890 "PIE", "ERRORBAR", "PARETO", "TITLE", "SUBTITLE",
891 "FOOTNOTE", "MISSING");
896 switch (graph.chart_type)
899 /* See scatterplot.h for the setup of the case prototype */
901 /* x value - SP_IDX_X*/
902 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0);
904 /* y value - SP_IDX_Y*/
905 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0);
906 /* The by_var contains the plot categories for the different xy
908 if (graph.n_by_vars > 0) /* SP_IDX_BY */
909 graph.gr_proto = caseproto_add_width (graph.gr_proto,
910 var_get_width(graph.by_var[0]));
915 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0);
917 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0);
924 lex_error_expecting (lexer, "HISTOGRAM", "SCATTERPLOT", "BAR");
932 struct casegrouper *grouper = casegrouper_create_splits (proc_open (ds), graph.dict);
933 struct casereader *group;
934 while (casegrouper_get_next_group (grouper, &group))
936 if (graph.chart_type == CT_BAR)
937 run_barchart (&graph, group);
939 run_graph (&graph, group);
941 bool ok = casegrouper_destroy (grouper);
942 ok = proc_commit (ds) && ok;
944 subcase_uninit (&graph.ordering);
945 free (graph.dep_vars);
946 pool_destroy (graph.pool);
947 caseproto_unref (graph.gr_proto);
952 subcase_uninit (&graph.ordering);
953 caseproto_unref (graph.gr_proto);
954 free (graph.dep_vars);
955 pool_destroy (graph.pool);