2 PSPP - a program for statistical analysis.
3 Copyright (C) 2012, 2013, 2015 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 * This module implements the graph command
26 #include "gl/xalloc.h"
27 #include <gsl/gsl_cdf.h>
29 #include "libpspp/assertion.h"
30 #include "libpspp/message.h"
31 #include "libpspp/pool.h"
34 #include "data/dataset.h"
35 #include "data/dictionary.h"
36 #include "data/casegrouper.h"
37 #include "data/casereader.h"
38 #include "data/casewriter.h"
39 #include "data/caseproto.h"
40 #include "data/subcase.h"
43 #include "data/format.h"
45 #include "math/chart-geometry.h"
46 #include "math/histogram.h"
47 #include "math/moments.h"
48 #include "math/sort.h"
49 #include "math/order-stats.h"
50 #include "output/charts/plot-hist.h"
51 #include "output/charts/scatterplot.h"
52 #include "output/charts/barchart.h"
54 #include "language/command.h"
55 #include "language/lexer/lexer.h"
56 #include "language/lexer/value-parser.h"
57 #include "language/lexer/variable-parser.h"
58 #include "language/stats/freq.h"
59 #include "language/stats/chart-category.h"
61 #include "output/tab.h"
64 #define _(msgid) gettext (msgid)
65 #define N_(msgid) msgid
97 /* Variable index for histogram case */
104 struct exploratory_stats
117 /* The minimum weight */
127 const struct variable **dep_vars;
128 struct exploratory_stats *es;
130 enum mv_class dep_excl;
131 enum mv_class fctr_excl;
133 const struct dictionary *dict;
137 /* ------------ Graph ---------------- */
138 bool normal; /* For histograms, draw the normal curve */
140 enum chart_type chart_type;
141 enum scatter_type scatter_type;
142 enum bar_type bar_type;
143 const struct variable *by_var[2];
146 struct subcase ordering; /* Ordering for aggregation */
147 int agr; /* Index into ag_func */
149 /* A caseproto that contains the plot data */
150 struct caseproto *gr_proto;
157 calc_mom1 (double acc, double x, double w)
163 calc_mom0 (double acc, double x UNUSED, double w)
169 pre_low_extreme (void)
175 calc_max (double acc, double x, double w UNUSED)
177 return (acc > x) ? acc : x;
181 pre_high_extreme (void)
187 calc_min (double acc, double x, double w UNUSED)
189 return (acc < x) ? acc : x;
193 post_normalise (double acc, double cc)
199 post_percentage (double acc, double ccc)
201 return acc / ccc * 100.0;
205 const struct ag_func ag_func[] =
207 {"COUNT", N_("Count"), 0, 0, NULL, calc_mom0, 0, 0},
208 {"PCT", N_("Percentage"), 0, 0, NULL, calc_mom0, 0, post_percentage},
209 {"CUFREQ", N_("Cumulative Count"), 0, 1, NULL, calc_mom0, 0, 0},
210 {"CUPCT", N_("Cumulative Percent"), 0, 1, NULL, calc_mom0, 0, post_percentage},
212 {"MEAN", N_("Mean"), 1, 0, NULL, calc_mom1, post_normalise, 0},
213 {"SUM", N_("Sum"), 1, 0, NULL, calc_mom1, 0, 0},
214 {"MAXIMUM", N_("Maximum"), 1, 0, pre_low_extreme, calc_max, 0, 0},
215 {"MINIMUM", N_("Minimum"), 1, 0, pre_high_extreme, calc_min, 0, 0},
218 const int N_AG_FUNCS = sizeof (ag_func) / sizeof (ag_func[0]);
221 parse_function (struct lexer *lexer, struct graph *graph)
224 for (i = 0 ; i < N_AG_FUNCS; ++i)
226 if (lex_match_id (lexer, ag_func[i].name))
237 graph->n_dep_vars = ag_func[i].arity;
238 if (ag_func[i].arity > 0)
241 if (!lex_force_match (lexer, T_LPAREN))
244 graph->dep_vars = xzalloc (sizeof (graph->dep_vars) * graph->n_dep_vars);
245 for (v = 0; v < ag_func[i].arity; ++v)
247 graph->dep_vars[v] = parse_variable (lexer, graph->dict);
248 if (! graph->dep_vars[v])
252 if (!lex_force_match (lexer, T_RPAREN))
256 if (!lex_force_match (lexer, T_BY))
259 graph->by_var[0] = parse_variable (lexer, graph->dict);
260 if (!graph->by_var[0])
264 subcase_add_var (&graph->ordering, graph->by_var[0], SC_ASCEND);
267 if (lex_match (lexer, T_BY))
269 graph->by_var[1] = parse_variable (lexer, graph->dict);
270 if (!graph->by_var[1])
274 subcase_add_var (&graph->ordering, graph->by_var[1], SC_ASCEND);
281 lex_error (lexer, NULL);
287 show_scatterplot (const struct graph *cmd, struct casereader *input)
290 struct scatterplot_chart *scatterplot;
291 bool byvar_overflow = false;
293 ds_init_empty (&title);
295 if (cmd->n_by_vars > 0)
297 ds_put_format (&title, _("%s vs. %s by %s"),
298 var_to_string (cmd->dep_vars[1]),
299 var_to_string (cmd->dep_vars[0]),
300 var_to_string (cmd->by_var[0]));
304 ds_put_format (&title, _("%s vs. %s"),
305 var_to_string (cmd->dep_vars[1]),
306 var_to_string (cmd->dep_vars[0]));
309 scatterplot = scatterplot_create (input,
310 var_to_string(cmd->dep_vars[0]),
311 var_to_string(cmd->dep_vars[1]),
312 (cmd->n_by_vars > 0) ? cmd->by_var[0] : NULL,
315 cmd->es[0].minimum, cmd->es[0].maximum,
316 cmd->es[1].minimum, cmd->es[1].maximum);
317 scatterplot_chart_submit (scatterplot);
322 msg (MW, _("Maximum number of scatterplot categories reached. "
323 "Your BY variable has too many distinct values. "
324 "The coloring of the plot will not be correct."));
329 show_histogr (const struct graph *cmd, struct casereader *input)
331 struct histogram *histogram;
334 if (cmd->es[0].cc <= 0)
336 casereader_destroy (input);
342 double bin_width = fabs (cmd->es[0].minimum - cmd->es[0].maximum)
343 / (1 + log2 (cmd->es[0].cc))
347 histogram_create (bin_width, cmd->es[0].minimum, cmd->es[0].maximum);
350 if (NULL == histogram)
352 casereader_destroy (input);
356 for (;(c = casereader_read (input)) != NULL; case_unref (c))
358 const double x = case_data_idx (c, HG_IDX_X)->f;
359 const double weight = case_data_idx (c, HG_IDX_WT)->f;
360 moments_pass_two (cmd->es[0].mom, x, weight);
361 histogram_add (histogram, x, weight);
363 casereader_destroy (input);
371 ds_init_cstr (&label,
372 var_to_string (cmd->dep_vars[0]));
374 moments_calculate (cmd->es[0].mom, &n, &mean, &var, NULL, NULL);
377 ( histogram_chart_create (histogram->gsl_hist,
378 ds_cstr (&label), n, mean,
379 sqrt (var), cmd->normal));
381 statistic_destroy (&histogram->parent);
387 cleanup_exploratory_stats (struct graph *cmd)
391 for (v = 0; v < cmd->n_dep_vars; ++v)
393 moments_destroy (cmd->es[v].mom);
399 run_barchart (struct graph *cmd, struct casereader *input)
401 struct casegrouper *grouper;
402 struct casereader *group;
405 if ( cmd->missing_pw == false)
406 input = casereader_create_filter_missing (input,
414 input = sort_execute (input, &cmd->ordering);
416 struct freq **freqs = NULL;
419 for (grouper = casegrouper_create_vars (input, cmd->by_var,
421 casegrouper_get_next_group (grouper, &group);
422 casereader_destroy (group))
425 struct ccase *c = casereader_peek (group, 0);
427 /* Deal with missing values in the categorical variables */
428 for (v = 0; v < cmd->n_by_vars; ++v)
430 if (var_is_value_missing (cmd->by_var[v], case_data (c, cmd->by_var[v]), cmd->fctr_excl) )
434 if (v < cmd->n_by_vars)
440 freqs = xrealloc (freqs, sizeof (*freqs) * ++n_freqs);
441 freqs[n_freqs - 1] = xzalloc (sizeof (**freqs) +
442 sizeof (union value) * (cmd->n_by_vars - 1) );
444 if (ag_func[cmd->agr].cumulative && n_freqs >= 2)
445 freqs[n_freqs - 1]->count = freqs[n_freqs - 2]->count;
447 freqs[n_freqs - 1]->count = 0;
448 if (ag_func[cmd->agr].pre)
449 freqs[n_freqs - 1]->count = ag_func[cmd->agr].pre();
452 for (v = 0; v < cmd->n_by_vars; ++v)
454 value_clone (&freqs[n_freqs - 1]->values[v], case_data (c, cmd->by_var[v]),
455 var_get_width (cmd->by_var[v])
461 for (;(c = casereader_read (group)) != NULL; case_unref (c))
463 const double weight = dict_get_case_weight (cmd->dict,c,NULL);
464 const double x = (cmd->n_dep_vars > 0) ? case_data (c, cmd->dep_vars[0])->f : SYSMIS;
468 freqs[n_freqs - 1]->count
469 = ag_func[cmd->agr].calc (freqs[n_freqs - 1]->count, x, weight);
472 if (ag_func[cmd->agr].post)
473 freqs[n_freqs - 1]->count
474 = ag_func[cmd->agr].post (freqs[n_freqs - 1]->count, cc);
479 casegrouper_destroy (grouper);
481 for (int i = 0; i < n_freqs; ++i)
483 if (ag_func[cmd->agr].ppost)
484 freqs[i]->count = ag_func[cmd->agr].ppost (freqs[i]->count, ccc);
490 ds_init_empty (&label);
492 if (cmd->n_dep_vars > 0)
493 ds_put_format (&label, _("%s of %s"),
494 ag_func[cmd->agr].description,
495 var_get_name (cmd->dep_vars[0]));
498 ag_func[cmd->agr].description);
500 chart_item_submit (barchart_create (cmd->by_var, cmd->n_by_vars,
501 ds_cstr (&label), false,
507 for (int i = 0; i < n_freqs; ++i)
515 run_graph (struct graph *cmd, struct casereader *input)
518 struct casereader *reader;
519 struct casewriter *writer;
521 cmd->es = pool_calloc (cmd->pool,cmd->n_dep_vars,sizeof(struct exploratory_stats));
522 for(int v=0;v<cmd->n_dep_vars;v++)
524 cmd->es[v].mom = moments_create (MOMENT_KURTOSIS);
525 cmd->es[v].cmin = DBL_MAX;
526 cmd->es[v].maximum = -DBL_MAX;
527 cmd->es[v].minimum = DBL_MAX;
529 /* Always remove cases listwise. This is correct for */
530 /* the histogram because there is only one variable */
531 /* and a simple bivariate scatterplot */
532 /* if ( cmd->missing_pw == false) */
533 input = casereader_create_filter_missing (input,
540 writer = autopaging_writer_create (cmd->gr_proto);
542 /* The case data is copied to a new writer */
543 /* The setup of the case depends on the Charttype */
544 /* For Scatterplot x is assumed in dep_vars[0] */
545 /* y is assumed in dep_vars[1] */
546 /* For Histogram x is assumed in dep_vars[0] */
547 assert(SP_IDX_X == 0 && SP_IDX_Y == 1 && HG_IDX_X == 0);
549 for (;(c = casereader_read (input)) != NULL; case_unref (c))
551 struct ccase *outcase = case_create (cmd->gr_proto);
552 const double weight = dict_get_case_weight (cmd->dict,c,NULL);
553 if (cmd->chart_type == CT_HISTOGRAM)
554 case_data_rw_idx (outcase, HG_IDX_WT)->f = weight;
555 if (cmd->chart_type == CT_SCATTERPLOT && cmd->n_by_vars > 0)
556 value_copy (case_data_rw_idx (outcase, SP_IDX_BY),
557 case_data (c, cmd->by_var[0]),
558 var_get_width (cmd->by_var[0]));
559 for(int v=0;v<cmd->n_dep_vars;v++)
561 const struct variable *var = cmd->dep_vars[v];
562 const double x = case_data (c, var)->f;
564 if (var_is_value_missing (var, case_data (c, var), cmd->dep_excl))
566 cmd->es[v].missing += weight;
569 /* Magically v value fits to SP_IDX_X, SP_IDX_Y, HG_IDX_X */
570 case_data_rw_idx (outcase, v)->f = x;
572 if (x > cmd->es[v].maximum)
573 cmd->es[v].maximum = x;
575 if (x < cmd->es[v].minimum)
576 cmd->es[v].minimum = x;
578 cmd->es[v].non_missing += weight;
580 moments_pass_one (cmd->es[v].mom, x, weight);
582 cmd->es[v].cc += weight;
584 if (cmd->es[v].cmin > weight)
585 cmd->es[v].cmin = weight;
587 casewriter_write (writer,outcase);
590 reader = casewriter_make_reader (writer);
592 switch (cmd->chart_type)
595 show_histogr (cmd,reader);
598 show_scatterplot (cmd,reader);
605 casereader_destroy (input);
606 cleanup_exploratory_stats (cmd);
611 cmd_graph (struct lexer *lexer, struct dataset *ds)
615 graph.missing_pw = false;
617 graph.pool = pool_create ();
619 graph.dep_excl = MV_ANY;
620 graph.fctr_excl = MV_ANY;
622 graph.dict = dataset_dict (ds);
624 graph.dep_vars = NULL;
625 graph.chart_type = CT_NONE;
626 graph.scatter_type = ST_BIVARIATE;
628 graph.gr_proto = caseproto_create ();
630 subcase_init_empty (&graph.ordering);
632 while (lex_token (lexer) != T_ENDCMD)
634 lex_match (lexer, T_SLASH);
636 if (lex_match_id (lexer, "HISTOGRAM"))
638 if (graph.chart_type != CT_NONE)
640 lex_error (lexer, _("Only one chart type is allowed."));
643 graph.normal = false;
644 if (lex_match (lexer, T_LPAREN))
646 if (!lex_force_match_id (lexer, "NORMAL"))
649 if (!lex_force_match (lexer, T_RPAREN))
654 if (!lex_force_match (lexer, T_EQUALS))
656 graph.chart_type = CT_HISTOGRAM;
657 if (!parse_variables_const (lexer, graph.dict,
658 &graph.dep_vars, &graph.n_dep_vars,
659 PV_NO_DUPLICATE | PV_NUMERIC))
661 if (graph.n_dep_vars > 1)
663 lex_error (lexer, _("Only one variable is allowed."));
667 else if (lex_match_id (lexer, "BAR"))
669 if (graph.chart_type != CT_NONE)
671 lex_error (lexer, _("Only one chart type is allowed."));
674 graph.chart_type = CT_BAR;
675 graph.bar_type = CBT_SIMPLE;
677 if (lex_match (lexer, T_LPAREN))
679 if (lex_match_id (lexer, "SIMPLE"))
681 /* This is the default anyway */
683 else if (lex_match_id (lexer, "GROUPED"))
685 graph.bar_type = CBT_GROUPED;
688 else if (lex_match_id (lexer, "STACKED"))
690 graph.bar_type = CBT_STACKED;
691 lex_error (lexer, _("%s is not yet implemented."), "STACKED");
694 else if (lex_match_id (lexer, "RANGE"))
696 graph.bar_type = CBT_RANGE;
697 lex_error (lexer, _("%s is not yet implemented."), "RANGE");
702 lex_error (lexer, NULL);
705 if (!lex_force_match (lexer, T_RPAREN))
709 if (!lex_force_match (lexer, T_EQUALS))
712 if (! parse_function (lexer, &graph))
715 else if (lex_match_id (lexer, "SCATTERPLOT"))
717 if (graph.chart_type != CT_NONE)
719 lex_error (lexer, _("Only one chart type is allowed."));
722 graph.chart_type = CT_SCATTERPLOT;
723 if (lex_match (lexer, T_LPAREN))
725 if (lex_match_id (lexer, "BIVARIATE"))
727 /* This is the default anyway */
729 else if (lex_match_id (lexer, "OVERLAY"))
731 lex_error (lexer, _("%s is not yet implemented."),"OVERLAY");
734 else if (lex_match_id (lexer, "MATRIX"))
736 lex_error (lexer, _("%s is not yet implemented."),"MATRIX");
739 else if (lex_match_id (lexer, "XYZ"))
741 lex_error(lexer, _("%s is not yet implemented."),"XYZ");
746 lex_error_expecting (lexer, "BIVARIATE", NULL);
749 if (!lex_force_match (lexer, T_RPAREN))
752 if (!lex_force_match (lexer, T_EQUALS))
755 if (!parse_variables_const (lexer, graph.dict,
756 &graph.dep_vars, &graph.n_dep_vars,
757 PV_NO_DUPLICATE | PV_NUMERIC))
760 if (graph.scatter_type == ST_BIVARIATE && graph.n_dep_vars != 1)
762 lex_error(lexer, _("Only one variable is allowed."));
766 if (!lex_force_match (lexer, T_WITH))
769 if (!parse_variables_const (lexer, graph.dict,
770 &graph.dep_vars, &graph.n_dep_vars,
771 PV_NO_DUPLICATE | PV_NUMERIC | PV_APPEND))
774 if (graph.scatter_type == ST_BIVARIATE && graph.n_dep_vars != 2)
776 lex_error (lexer, _("Only one variable is allowed."));
780 if (lex_match (lexer, T_BY))
782 const struct variable *v = NULL;
783 if (!lex_match_variable (lexer,graph.dict,&v))
785 lex_error (lexer, _("Variable expected"));
792 else if (lex_match_id (lexer, "LINE"))
794 lex_error (lexer, _("%s is not yet implemented."),"LINE");
797 else if (lex_match_id (lexer, "PIE"))
799 lex_error (lexer, _("%s is not yet implemented."),"PIE");
802 else if (lex_match_id (lexer, "ERRORBAR"))
804 lex_error (lexer, _("%s is not yet implemented."),"ERRORBAR");
807 else if (lex_match_id (lexer, "PARETO"))
809 lex_error (lexer, _("%s is not yet implemented."),"PARETO");
812 else if (lex_match_id (lexer, "TITLE"))
814 lex_error (lexer, _("%s is not yet implemented."),"TITLE");
817 else if (lex_match_id (lexer, "SUBTITLE"))
819 lex_error (lexer, _("%s is not yet implemented."),"SUBTITLE");
822 else if (lex_match_id (lexer, "FOOTNOTE"))
824 lex_error (lexer, _("%s is not yet implemented."),"FOOTNOTE");
825 lex_error (lexer, _("FOOTNOTE is not implemented yet for GRAPH"));
828 else if (lex_match_id (lexer, "MISSING"))
830 lex_match (lexer, T_EQUALS);
832 while (lex_token (lexer) != T_ENDCMD
833 && lex_token (lexer) != T_SLASH)
835 if (lex_match_id (lexer, "LISTWISE"))
837 graph.missing_pw = false;
839 else if (lex_match_id (lexer, "VARIABLE"))
841 graph.missing_pw = true;
843 else if (lex_match_id (lexer, "EXCLUDE"))
845 graph.dep_excl = MV_ANY;
847 else if (lex_match_id (lexer, "INCLUDE"))
849 graph.dep_excl = MV_SYSTEM;
851 else if (lex_match_id (lexer, "REPORT"))
853 graph.fctr_excl = MV_NEVER;
855 else if (lex_match_id (lexer, "NOREPORT"))
857 graph.fctr_excl = MV_ANY;
861 lex_error (lexer, NULL);
868 lex_error (lexer, NULL);
873 switch (graph.chart_type)
876 /* See scatterplot.h for the setup of the case prototype */
877 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* x value - SP_IDX_X*/
878 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* y value - SP_IDX_Y*/
879 /* The by_var contains the plot categories for the different xy plot colors */
880 if (graph.n_by_vars > 0) /* SP_IDX_BY */
881 graph.gr_proto = caseproto_add_width (graph.gr_proto, var_get_width(graph.by_var[0]));
884 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* x value */
885 graph.gr_proto = caseproto_add_width (graph.gr_proto, 0); /* weight value */
890 lex_error_expecting (lexer, "HISTOGRAM", "SCATTERPLOT", "BAR", NULL);
898 struct casegrouper *grouper;
899 struct casereader *group;
902 grouper = casegrouper_create_splits (proc_open (ds), graph.dict);
903 while (casegrouper_get_next_group (grouper, &group))
905 if (graph.chart_type == CT_BAR)
906 run_barchart (&graph, group);
908 run_graph (&graph, group);
910 ok = casegrouper_destroy (grouper);
911 ok = proc_commit (ds) && ok;
914 subcase_destroy (&graph.ordering);
915 free (graph.dep_vars);
916 pool_destroy (graph.pool);
917 caseproto_unref (graph.gr_proto);
922 subcase_destroy (&graph.ordering);
923 caseproto_unref (graph.gr_proto);
924 free (graph.dep_vars);
925 pool_destroy (graph.pool);