Add GRAPH command initially with just scatterplots and histograms.
authorFriedrich Beckmann <friedrich.beckmann@gmx.de>
Sun, 28 Sep 2014 04:32:17 +0000 (21:32 -0700)
committerBen Pfaff <blp@cs.stanford.edu>
Sun, 28 Sep 2014 04:36:09 +0000 (21:36 -0700)
14 files changed:
AUTHORS
NEWS
doc/statistics.texi
src/language/command.def
src/language/stats/automake.mk
src/language/stats/graph.c [new file with mode: 0644]
src/output/automake.mk
src/output/cairo-chart.h
src/output/cairo.c
src/output/charts/scatterplot-cairo.c [new file with mode: 0644]
src/output/charts/scatterplot.c [new file with mode: 0644]
src/output/charts/scatterplot.h [new file with mode: 0644]
tests/automake.mk
tests/language/stats/graph.at [new file with mode: 0644]

diff --git a/AUTHORS b/AUTHORS
index 9350975c738e97a2bb79a871176a5ee5cf83f82f..20998f08eb8b81e59569298509b83907c3404e58 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -16,6 +16,8 @@ is also an important contributor to GSL, which is used by PSPP.
 
 * Mehmet Hakan Satman wrote the QUICK CLUSTER command.
 
+* Friedrich Beckmann wrote the GRAPH command.
+
 We also thank past contributors:
 
 * John Williams wrote an initial draft of the T-TEST procedure.
diff --git a/NEWS b/NEWS
index f605aaab1ed6e66024bb412b88035419c0098ebe..cc24a723659459d83715240146cd2ff8a7b5a723 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,11 @@ See the end for copying conditions.
 
 Please send PSPP bug reports to bug-gnu-pspp@gnu.org.
  
+Changes since 0.8.4:
+
+ * The GRAPH command is now available.  Initially it supports
+   scatterplots and histograms.
+
 Changes from 0.8.3 to 0.8.4:
 
  * Formatting of SYSFILE INFO output was made easier to read.
index 3a4302873b9f35742b7f32860343c55fd91b49aa..7a880d15ab33b17e9cf735de2fc2eb67ed4ba6ff 100644 (file)
@@ -8,6 +8,7 @@ far.
 * DESCRIPTIVES::                Descriptive statistics.
 * FREQUENCIES::                 Frequency tables.
 * EXAMINE::                     Testing data for normality.
+* GRAPH::                       Plot data.
 * CORRELATIONS::                Correlation tables.
 * CROSSTABS::                   Crosstabulation tables.
 * FACTOR::                      Factor analysis and Principal Components analysis.
@@ -375,6 +376,52 @@ specified for which
 there are many distinct values, then @cmd{EXAMINE} will produce a very
 large quantity of output.
 
+@node GRAPH
+@section GRAPH
+
+@vindex GRAPH
+@cindex Exploratory data analysis
+@cindex normality, testing
+
+@display
+GRAPH
+        /HISTOGRAM = @var{var}
+        /SCATTERPLOT [(BIVARIATE)] = @var{var1} WITH @var{var2} [BY @var{var3}] 
+        [ /MISSING=@{LISTWISE, VARIABLE@} [@{EXCLUDE, INCLUDE@}] ] 
+               [@{NOREPORT,REPORT@}]
+
+@end display
+
+The @cmd{GRAPH} command produces graphical plots of data. Only one of the subcommands
+@subcmd{HISTOGRAM} or @subcmd{SCATTERPLOT} can be specified, i.e. only one plot
+can be produced per call of @cmd{GRAPH}. The @subcmd{MISSING} subcommand is optional.
+
+@cindex scatterplot
+
+The subcommand @subcmd{SCATTERPLOT} produces an xy plot of the data. The different 
+values of the optional third variable @var{var3} will result in different colours and/or
+markers for the plot. The following is an example for producing a scatterplot.
+
+@example
+GRAPH   
+        /SCATTERPLOT = @var{height} WITH @var{weight} BY @var{gender}.
+@end example
+
+This example will produce a scatterplot where height is plotted versus weight. Depending
+on the value of the gender variable, the colour of the datapoint is different. With
+this plot it is possible to analyze gender differences in the height versus weight relation.
+
+@cindex histogram
+
+The subcommand @subcmd{HISTOGRAM} produces a histogram. Only one variable is allowed for
+the histogram plot. For an alternative method to produce histograms @pxref{EXAMINE}. The
+following example produces a histogram plot for the variable weight.
+
+@example
+GRAPH   
+        /HISTOGRAM = @var{weight}.
+@end example
+
 @node CORRELATIONS
 @section CORRELATIONS
 
index 8fe737218fa83f3755beadbc3836a454b343255c..c7b6325073655641bcb3649f4e5cea3d4319d611 100644 (file)
@@ -122,6 +122,7 @@ DEF_CMD (S_DATA, 0, "FILTER", cmd_filter)
 DEF_CMD (S_DATA, 0, "FLIP", cmd_flip)
 DEF_CMD (S_DATA, 0, "FREQUENCIES", cmd_frequencies)
 DEF_CMD (S_DATA, 0, "GLM", cmd_glm)
+DEF_CMD (S_DATA, 0, "GRAPH", cmd_graph)
 DEF_CMD (S_DATA, 0, "LIST", cmd_list)
 DEF_CMD (S_DATA, 0, "LOGISTIC REGRESSION", cmd_logistic)
 DEF_CMD (S_DATA, 0, "MEANS", cmd_means)
@@ -194,7 +195,6 @@ UNIMPL_CMD ("FIT", "Goodness of Fit")
 UNIMPL_CMD ("GENLOG", "Categorical model fitting")
 UNIMPL_CMD ("GET TRANSLATE", "Read other file formats")
 UNIMPL_CMD ("GGRAPH", "Custom defined graphs")
-UNIMPL_CMD ("GRAPH", "Draw graphs")
 UNIMPL_CMD ("HILOGLINEAR", "Hierarchical loglinear models")
 UNIMPL_CMD ("HOMALS", "Homogeneity analysis")
 UNIMPL_CMD ("IGRAPH", "Interactive graphs")
index d923a7d1346abdc0f5cdf943be332db3eebe7ae1..bfe379e3452d05ada1139968b87ce5e0e68d9535 100644 (file)
@@ -26,6 +26,7 @@ language_stats_sources = \
        src/language/stats/friedman.c \
        src/language/stats/friedman.h \
        src/language/stats/glm.c \
+       src/language/stats/graph.c \
        src/language/stats/kruskal-wallis.c \
        src/language/stats/kruskal-wallis.h \
        src/language/stats/ks-one-sample.c \
diff --git a/src/language/stats/graph.c b/src/language/stats/graph.c
new file mode 100644 (file)
index 0000000..7bfbbc7
--- /dev/null
@@ -0,0 +1,540 @@
+/*
+  PSPP - a program for statistical analysis.
+  Copyright (C) 2012, 2013  Free Software Foundation, Inc.
+  
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * This module implements the graph command
+ */
+
+#include <config.h>
+
+#include <math.h>
+#include <gsl/gsl_cdf.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/message.h"
+#include "libpspp/pool.h"
+
+
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/casegrouper.h"
+#include "data/casereader.h"
+#include "data/casewriter.h"
+#include "data/caseproto.h"
+#include "data/subcase.h"
+
+
+#include "data/format.h"
+
+#include "math/chart-geometry.h"
+#include "math/histogram.h"
+#include "math/moments.h"
+#include "math/sort.h"
+#include "math/order-stats.h"
+#include "output/charts/plot-hist.h"
+#include "output/charts/scatterplot.h"
+
+#include "language/command.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/value-parser.h"
+#include "language/lexer/variable-parser.h"
+
+#include "output/tab.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+#define N_(msgid) msgid
+
+/* Chart kinds that can be named by a GRAPH subcommand.
+
+   CT_NONE means that no chart subcommand has been parsed yet.  Within this
+   file the parser only ever selects CT_HISTOGRAM or CT_SCATTERPLOT; the
+   other subcommands are rejected as not yet implemented. */
+enum chart_type
+  {
+    CT_NONE,
+    CT_BAR,
+    CT_LINE,
+    CT_PIE,
+    CT_ERRORBAR,
+    CT_HILO,
+    CT_HISTOGRAM,
+    CT_SCATTERPLOT,
+    CT_PARETO
+  };
+
+/* Scatterplot variants that may follow SCATTERPLOT in parentheses.
+
+   Only ST_BIVARIATE is ever stored by the parser in this file; OVERLAY,
+   MATRIX and XYZ are recognized but rejected as not yet implemented. */
+enum scatter_type
+  {
+    ST_BIVARIATE,
+    ST_OVERLAY,
+    ST_MATRIX,
+    ST_XYZ
+  };
+
+/* Per-variable statistics gathered by run_graph in its first pass over the
+   data.  One of these exists for each entry of graph.dep_vars. */
+struct exploratory_stats
+{
+  /* Sum of the case weights of cases whose value is (respectively is not)
+     missing for this variable. */
+  double missing;
+  double non_missing;
+
+  /* Moments accumulator: fed by moments_pass_one in run_graph and by
+     moments_pass_two in show_histogr. */
+  struct moments *mom;
+
+  /* Smallest and largest non-missing value seen.  Initialized to DBL_MAX
+     and -DBL_MAX respectively, so they keep those sentinels if no case
+     contributes. */
+  double minimum;
+  double maximum;
+
+  /* Total weight of the non-missing cases. */
+  double cc;
+
+  /* The minimum case weight among the non-missing cases. */
+  double cmin;
+};
+
+
+/* Parsed state of one GRAPH command plus the per-split working data. */
+struct graph
+{
+  /* Pool from which the ES array is allocated; destroyed by cmd_graph. */
+  struct pool *pool;
+
+  /* Variables to plot: one for a histogram; x then y for a bivariate
+     scatterplot. */
+  size_t n_dep_vars;
+  const struct variable **dep_vars;
+
+  /* N_DEP_VARS statistics records, (re)allocated by run_graph for every
+     split group. */
+  struct exploratory_stats *es;
+
+  /* Class of missing values excluded for the plotted variables
+     (MISSING=EXCLUDE / INCLUDE). */
+  enum mv_class dep_excl;
+
+  /* Set by MISSING=REPORT / NOREPORT.  Not read anywhere in the code
+     visible here. */
+  enum mv_class fctr_excl;
+
+  const struct dictionary *dict;
+
+  /* True for MISSING=VARIABLE, false for MISSING=LISTWISE.  run_graph
+     currently always filters listwise and does not consult this flag. */
+  bool missing_pw;
+
+  /* ------------ Graph ---------------- */
+  enum chart_type chart_type;
+  enum scatter_type scatter_type;
+
+  /* Optional BY variable of a scatterplot, or NULL. */
+  const struct variable *byvar;
+};
+
+
+/* Builds a scatterplot chart for CMD from the cases in INPUT and submits it.
+
+   The x and y variables are CMD->dep_vars[0] and CMD->dep_vars[1]; the axis
+   ranges are the minima and maxima stored in CMD->es[0] and CMD->es[1].
+   The chart title has the form "X vs Y", with " by Z" appended when a BY
+   variable was given.  INPUT is only handed to scatterplot_create (); it
+   is neither read nor destroyed here.
+
+   A warning is issued if the overflow flag passed to scatterplot_create ()
+   is set by the time scatterplot_chart_submit () returns.
+
+   NOTE(review): the chart stores the address of the local BYVAR_OVERFLOW
+   and the cairo drawing code writes through it.  That is only safe if the
+   chart is rendered before this function returns and never afterwards --
+   confirm against the output driver behaviour. */
+static void
+show_scatterplot (const struct graph *cmd, const struct casereader *input)
+{
+  struct string title;
+  struct scatterplot_chart *scatterplot;
+  bool byvar_overflow = false;
+
+  ds_init_cstr (&title, var_to_string (cmd->dep_vars[0]));
+  ds_put_cstr (&title, " vs ");
+  ds_put_cstr (&title, var_to_string (cmd->dep_vars[1]));
+  if (cmd->byvar)
+    {
+      ds_put_cstr (&title, " by ");
+      ds_put_cstr (&title, var_to_string (cmd->byvar));
+    }
+
+  scatterplot = scatterplot_create (input,
+                                    cmd->dep_vars[0],
+                                    cmd->dep_vars[1],
+                                    cmd->byvar,
+                                    &byvar_overflow,
+                                    ds_cstr (&title),
+                                    cmd->es[0].minimum, cmd->es[0].maximum,
+                                    cmd->es[1].minimum, cmd->es[1].maximum);
+  scatterplot_chart_submit (scatterplot);
+  ds_destroy (&title);
+
+  /* Adjacent string literals are concatenated verbatim, so each sentence
+     needs its own trailing space. */
+  if (byvar_overflow)
+    msg (MW, _("Maximum number of scatterplot categories reached. "
+               "Your BY variable has too many distinct values. "
+               "The colouring of the plot will not be correct."));
+}
+
+/* Builds a histogram of CMD->dep_vars[0] from the cases in INPUT and
+   submits it as a chart.
+
+   The bin width follows Sturges' rule: the value range divided by
+   1 + log2 (total weight).  INPUT is cloned for reading; the caller keeps
+   ownership of INPUT itself.  The second moments pass is run here, on the
+   accumulator that run_graph fed with the first pass.
+
+   Nothing is drawn when the total weight is not positive: in that case
+   log2 () is undefined and the minimum and maximum still hold their
+   DBL_MAX sentinels, so no meaningful bin width exists. */
+static void
+show_histogr (const struct graph *cmd, const struct casereader *input)
+{
+  const struct variable *var = cmd->dep_vars[0];
+  struct histogram *histogram;
+  struct ccase *c;
+  struct casereader *reader;
+  double bin_width;
+
+  if (cmd->es[0].cc <= 0)
+    return;
+
+  /* Sturges Rule */
+  bin_width = (fabs (cmd->es[0].minimum - cmd->es[0].maximum)
+               / (1 + log2 (cmd->es[0].cc)));
+
+  histogram = histogram_create (bin_width,
+                                cmd->es[0].minimum, cmd->es[0].maximum);
+
+  /* NOTE(review): guards against histogram_create () signalling failure
+     with NULL -- confirm that it can; the check is harmless otherwise. */
+  if (histogram == NULL)
+    return;
+
+  for (reader = casereader_clone (input);
+       (c = casereader_read (reader)) != NULL; case_unref (c))
+    {
+      const double x = case_data (c, var)->f;
+      const double weight = dict_get_case_weight (cmd->dict, c, NULL);
+      moments_pass_two (cmd->es[0].mom, x, weight);
+      histogram_add (histogram, x, weight);
+    }
+  casereader_destroy (reader);
+
+  {
+    double n, mean, variance;
+    struct string label;
+
+    ds_init_cstr (&label, var_to_string (var));
+
+    moments_calculate (cmd->es[0].mom, &n, &mean, &variance, NULL, NULL);
+
+    chart_item_submit
+      (histogram_chart_create (histogram->gsl_hist,
+                               ds_cstr (&label), n, mean,
+                               sqrt (variance), false));
+
+    statistic_destroy (&histogram->parent);
+    ds_destroy (&label);
+  }
+}
+
+/* Destroys the moments accumulator of every per-variable statistics record
+   in CMD.  The ES array itself is pool-allocated and is not freed here. */
+static void
+cleanup_exploratory_stats (struct graph *cmd)
+{
+  size_t v;   /* Same type as n_dep_vars: avoids a signed/unsigned compare. */
+
+  for (v = 0; v < cmd->n_dep_vars; ++v)
+    moments_destroy (cmd->es[v].mom);
+}
+
+
+/* Produces the chart selected in CMD for one split group, whose cases are
+   read from INPUT.
+
+   A first pass over the data fills CMD->es with the minimum, maximum,
+   weights and first-pass moments of every plotted variable; the chart
+   function for CMD->chart_type is then called with a clone of the reader.
+   INPUT is wrapped in a missing-value filter, and the resulting reader is
+   destroyed before returning.  NOTE(review): this presumes that
+   casereader_create_filter_missing () takes ownership of the reader it is
+   given -- confirm. */
+static void
+run_graph (struct graph *cmd, struct casereader *input)
+{
+  struct ccase *c;
+  struct casereader *reader;
+
+
+  /* Fresh statistics for this group.  The array comes from the command's
+     pool, so it is released only when the pool is destroyed. */
+  cmd->es = pool_calloc(cmd->pool,cmd->n_dep_vars,sizeof(struct exploratory_stats));
+  for(int v=0;v<cmd->n_dep_vars;v++)
+    {
+      cmd->es[v].mom = moments_create (MOMENT_KURTOSIS);
+      cmd->es[v].cmin = DBL_MAX;
+      /* Sentinels: any real value replaces them in the loop below. */
+      cmd->es[v].maximum = -DBL_MAX;
+      cmd->es[v].minimum =  DBL_MAX;
+    }
+  /* Always remove cases listwise. This is correct for */
+  /* the histogram because there is only one variable  */
+  /* and a simple bivariate scatterplot                */
+  /* if ( cmd->missing_pw == false)                    */
+    input = casereader_create_filter_missing (input,
+                                              cmd->dep_vars,
+                                              cmd->n_dep_vars,
+                                              cmd->dep_excl,
+                                              NULL,
+                                              NULL);
+
+  /* First pass: range, weights and first-pass moments per variable. */
+  for (reader = casereader_clone (input);
+       (c = casereader_read (reader)) != NULL; case_unref (c))
+    {
+      const double weight = dict_get_case_weight(cmd->dict,c,NULL);      
+      for(int v=0;v<cmd->n_dep_vars;v++)
+       {
+         const struct variable *var = cmd->dep_vars[v];
+         const double x = case_data (c, var)->f;
+
+         /* Missing values only add to the missing weight. */
+         if (var_is_value_missing (var, case_data (c, var), cmd->dep_excl))
+           {
+             cmd->es[v].missing += weight;
+             continue;
+           }
+
+         if (x > cmd->es[v].maximum)
+           cmd->es[v].maximum = x;
+
+         if (x < cmd->es[v].minimum)
+           cmd->es[v].minimum =  x;
+
+         cmd->es[v].non_missing += weight;
+
+         moments_pass_one (cmd->es[v].mom, x, weight);
+
+         cmd->es[v].cc += weight;
+
+         if (cmd->es[v].cmin > weight)
+           cmd->es[v].cmin = weight;
+       }
+    }
+  casereader_destroy (reader);
+
+  /* Hand a private clone to the chart function and destroy it afterwards;
+     the chart functions take a const reader and do not destroy it. */
+  switch (cmd->chart_type)
+    {
+    case CT_HISTOGRAM:
+      reader = casereader_clone(input);
+      show_histogr(cmd,reader);
+      casereader_destroy(reader);
+      break;
+    case CT_SCATTERPLOT:
+      reader = casereader_clone(input);
+      show_scatterplot(cmd,reader);
+      casereader_destroy(reader);
+      break;
+    default:
+      /* cmd_graph refuses to run without one of the two chart types. */
+      NOT_REACHED ();
+      break;
+    };
+
+  casereader_destroy(input);
+
+  cleanup_exploratory_stats (cmd);
+}
+
+
+/* Parses and executes the GRAPH command.
+
+   Exactly one chart subcommand must be given: HISTOGRAM = var, or
+   SCATTERPLOT [(BIVARIATE)] = var WITH var [BY var].  The other chart
+   types and the TITLE, SUBTITLE and FOOTNOTE subcommands are recognized
+   but rejected as not yet implemented.  MISSING accepts any sequence of
+   LISTWISE, VARIABLE, EXCLUDE, INCLUDE, REPORT and NOREPORT.
+
+   After parsing, one chart is produced per split group through
+   run_graph ().
+
+   Returns CMD_SUCCESS on success.  Returns CMD_FAILURE on a syntax error,
+   or when destroying the case grouper or committing the procedure reports
+   failure. */
+int
+cmd_graph (struct lexer *lexer, struct dataset *ds)
+{
+  struct graph graph;
+  struct casegrouper *grouper;
+  struct casereader *group;
+  bool ok;
+
+  graph.missing_pw = false;
+
+  graph.pool = pool_create ();
+
+  graph.dep_excl = MV_ANY;
+  graph.fctr_excl = MV_ANY;
+
+  graph.dict = dataset_dict (ds);
+
+  /* ---------------- graph ------------------ */
+  /* Start every member from a defined value so that no path, including
+     the error path, ever reads an indeterminate one. */
+  graph.dep_vars = NULL;
+  graph.n_dep_vars = 0;
+  graph.es = NULL;
+  graph.chart_type = CT_NONE;
+  graph.scatter_type = ST_BIVARIATE;
+  graph.byvar = NULL;
+
+  while (lex_token (lexer) != T_ENDCMD)
+    {
+      lex_match (lexer, T_SLASH);
+
+      if (lex_match_id (lexer, "HISTOGRAM"))
+        {
+          if (graph.chart_type != CT_NONE)
+            {
+              lex_error (lexer, _("Only one chart type is allowed."));
+              goto error;
+            }
+          if (!lex_force_match (lexer, T_EQUALS))
+            goto error;
+          graph.chart_type = CT_HISTOGRAM;
+          if (!parse_variables_const (lexer, graph.dict,
+                                      &graph.dep_vars, &graph.n_dep_vars,
+                                      PV_NO_DUPLICATE | PV_NUMERIC))
+            goto error;
+          if (graph.n_dep_vars > 1)
+            {
+              lex_error (lexer, _("Only one variable allowed"));
+              goto error;
+            }
+        }
+      else if (lex_match_id (lexer, "SCATTERPLOT"))
+        {
+          if (graph.chart_type != CT_NONE)
+            {
+              lex_error (lexer, _("Only one chart type is allowed."));
+              goto error;
+            }
+          graph.chart_type = CT_SCATTERPLOT;
+          if (lex_match (lexer, T_LPAREN))
+            {
+              if (lex_match_id (lexer, "BIVARIATE"))
+                {
+                  /* This is the default anyway */
+                }
+              else if (lex_match_id (lexer, "OVERLAY"))
+                {
+                  lex_error (lexer, _("%s is not yet implemented."),"OVERLAY");
+                  goto error;
+                }
+              else if (lex_match_id (lexer, "MATRIX"))
+                {
+                  lex_error (lexer, _("%s is not yet implemented."),"MATRIX");
+                  goto error;
+                }
+              else if (lex_match_id (lexer, "XYZ"))
+                {
+                  lex_error (lexer, _("%s is not yet implemented."),"XYZ");
+                  goto error;
+                }
+              else
+                {
+                  lex_error_expecting (lexer, "BIVARIATE", NULL);
+                  goto error;
+                }
+              if (!lex_force_match (lexer, T_RPAREN))
+                goto error;
+            }
+          if (!lex_force_match (lexer, T_EQUALS))
+            goto error;
+
+          /* The x variable. */
+          if (!parse_variables_const (lexer, graph.dict,
+                                      &graph.dep_vars, &graph.n_dep_vars,
+                                      PV_NO_DUPLICATE | PV_NUMERIC))
+            goto error;
+
+          if (graph.scatter_type == ST_BIVARIATE && graph.n_dep_vars != 1)
+            {
+              lex_error (lexer, _("Only one variable allowed"));
+              goto error;
+            }
+
+          if (!lex_force_match (lexer, T_WITH))
+            goto error;
+
+          /* The y variable, appended after the x variable. */
+          if (!parse_variables_const (lexer, graph.dict,
+                                      &graph.dep_vars, &graph.n_dep_vars,
+                                      PV_NO_DUPLICATE | PV_NUMERIC | PV_APPEND))
+            goto error;
+
+          if (graph.scatter_type == ST_BIVARIATE && graph.n_dep_vars != 2)
+            {
+              lex_error (lexer, _("Only one variable allowed"));
+              goto error;
+            }
+
+          if (lex_match (lexer, T_BY))
+            {
+              const struct variable *v = NULL;
+              if (!lex_match_variable (lexer, graph.dict, &v))
+                {
+                  lex_error (lexer, _("Variable expected"));
+                  goto error;
+                }
+              graph.byvar = v;
+            }
+        }
+      else if (lex_match_id (lexer, "BAR"))
+        {
+          lex_error (lexer, _("%s is not yet implemented."),"BAR");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "LINE"))
+        {
+          lex_error (lexer, _("%s is not yet implemented."),"LINE");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "PIE"))
+        {
+          lex_error (lexer, _("%s is not yet implemented."),"PIE");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "ERRORBAR"))
+        {
+          lex_error (lexer, _("%s is not yet implemented."),"ERRORBAR");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "PARETO"))
+        {
+          lex_error (lexer, _("%s is not yet implemented."),"PARETO");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "TITLE"))
+        {
+          lex_error (lexer, _("%s is not yet implemented."),"TITLE");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "SUBTITLE"))
+        {
+          lex_error (lexer, _("%s is not yet implemented."),"SUBTITLE");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "FOOTNOTE"))
+        {
+          /* Report once, in the same form as the other subcommands. */
+          lex_error (lexer, _("%s is not yet implemented."),"FOOTNOTE");
+          goto error;
+        }
+      else if (lex_match_id (lexer, "MISSING"))
+        {
+          lex_match (lexer, T_EQUALS);
+
+          while (lex_token (lexer) != T_ENDCMD
+                 && lex_token (lexer) != T_SLASH)
+            {
+              if (lex_match_id (lexer, "LISTWISE"))
+                {
+                  graph.missing_pw = false;
+                }
+              else if (lex_match_id (lexer, "VARIABLE"))
+                {
+                  graph.missing_pw = true;
+                }
+              else if (lex_match_id (lexer, "EXCLUDE"))
+                {
+                  graph.dep_excl = MV_ANY;
+                }
+              else if (lex_match_id (lexer, "INCLUDE"))
+                {
+                  graph.dep_excl = MV_SYSTEM;
+                }
+              else if (lex_match_id (lexer, "REPORT"))
+                {
+                  graph.fctr_excl = MV_NEVER;
+                }
+              else if (lex_match_id (lexer, "NOREPORT"))
+                {
+                  graph.fctr_excl = MV_ANY;
+                }
+              else
+                {
+                  lex_error (lexer, NULL);
+                  goto error;
+                }
+            }
+        }
+      else
+        {
+          lex_error (lexer, NULL);
+          goto error;
+        }
+    }
+
+  if (graph.chart_type == CT_NONE)
+    {
+      lex_error_expecting (lexer, "HISTOGRAM", "SCATTERPLOT", NULL);
+      goto error;
+    }
+
+  /* One chart per split group. */
+  grouper = casegrouper_create_splits (proc_open (ds), graph.dict);
+  while (casegrouper_get_next_group (grouper, &group))
+    run_graph (&graph, group);
+  ok = casegrouper_destroy (grouper);
+  ok = proc_commit (ds) && ok;
+
+  free (graph.dep_vars);
+  pool_destroy (graph.pool);
+
+  /* Do not report success when reading the data or committing the
+     procedure failed. */
+  return ok ? CMD_SUCCESS : CMD_FAILURE;
+
+ error:
+  free (graph.dep_vars);
+  pool_destroy (graph.pool);
+
+  return CMD_FAILURE;
+}
index 0b3e1fd78ce4fc6ffbb3ffacbd9d688149c47a86..39c52d7496f59be166d53c70d9366a29e47d4299 100644 (file)
@@ -24,6 +24,8 @@ src_output_liboutput_la_SOURCES = \
        src/output/charts/spreadlevel-plot.h \
        src/output/charts/scree.c \
        src/output/charts/scree.h \
+       src/output/charts/scatterplot.c \
+       src/output/charts/scatterplot.h \
        src/output/csv.c \
        src/output/driver-provider.h \
        src/output/driver.c \
@@ -70,7 +72,8 @@ src_output_liboutput_la_SOURCES += \
        src/output/charts/plot-hist-cairo.c \
        src/output/charts/roc-chart-cairo.c \
        src/output/charts/scree-cairo.c \
-       src/output/charts/spreadlevel-cairo.c
+       src/output/charts/spreadlevel-cairo.c \
+       src/output/charts/scatterplot-cairo.c
 endif
 if ODF_WRITE_SUPPORT
 src_output_liboutput_la_SOURCES += src/output/odt.c
index 646b1cc3d7e97eaa49d86338a1d6eb2b7b63e2e8..edf4ed1148cd25ea2ff4730ebc191f511f04f018 100644 (file)
@@ -173,6 +173,8 @@ void xrchart_draw_scree (const struct chart_item *, cairo_t *,
                          struct xrchart_geometry *);
 void xrchart_draw_spreadlevel (const struct chart_item *, cairo_t *,
                          struct xrchart_geometry *);
+void xrchart_draw_scatterplot (const struct chart_item *, cairo_t *,
+                         struct xrchart_geometry *);
 
 
 #endif /* output/cairo-chart.h */
index 5197ffee9aa24bc9d274a6fb805943f202eae528..9a0cb6797e5f7d309c16e4eac5e95ff294eb8276 100644 (file)
@@ -34,6 +34,7 @@
 #include "output/charts/roc-chart.h"
 #include "output/charts/spreadlevel-plot.h"
 #include "output/charts/scree.h"
+#include "output/charts/scatterplot.h"
 #include "output/driver-provider.h"
 #include "output/message-item.h"
 #include "output/options.h"
@@ -1420,6 +1421,8 @@ xr_draw_chart (const struct chart_item *chart_item, cairo_t *cr,
     xrchart_draw_scree (chart_item, cr, &geom);
   else if (is_spreadlevel_plot_chart (chart_item))
     xrchart_draw_spreadlevel (chart_item, cr, &geom);
+  else if (is_scatterplot_chart (chart_item))
+    xrchart_draw_scatterplot (chart_item, cr, &geom);
   else
     NOT_REACHED ();
   xrchart_geometry_free (cr, &geom);
diff --git a/src/output/charts/scatterplot-cairo.c b/src/output/charts/scatterplot-cairo.c
new file mode 100644 (file)
index 0000000..b555a12
--- /dev/null
@@ -0,0 +1,117 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "output/charts/scatterplot.h"
+
+#include "data/case.h"
+#include "data/casereader.h"
+#include "data/variable.h"
+#include "output/cairo-chart.h"
+#include "libpspp/str.h"
+#include "libpspp/message.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+
+/* Draws the scatterplot CHART_ITEM on CR using geometry GEOM.
+
+   Writes the axis scales, title and axis labels, then plots one datum per
+   case read from a clone of the chart's case reader.
+
+   When the chart has a BY variable, a list of its distinct values is built
+   while the cases are read.  Each distinct value gets a legend entry
+   (appended to GEOM->dataset) and a colour index; at most MAX_PLOT_CATS
+   distinct values are tracked.  Cases with further values are drawn with
+   the index of the last tracked category, and the chart's overflow flag is
+   set so that the caller can warn about it.  The legend is written only
+   when there is a BY variable. */
+void
+xrchart_draw_scatterplot (const struct chart_item *chart_item, cairo_t *cr,
+                         struct xrchart_geometry *geom)
+{
+  /* A true compile-time constant, so that CATVALS is an ordinary array
+     rather than a variable-length one. */
+  enum { MAX_PLOT_CATS = 20 };
+
+  const struct scatterplot_chart *spc = to_scatterplot_chart (chart_item);
+  struct casereader *data;
+  struct ccase *c;
+  union value catvals[MAX_PLOT_CATS];   /* Distinct BY values seen so far. */
+  int n_catvals = 0;
+  int byvar_width = 0;
+  int i = 0;                            /* Colour index; 0 without BY. */
+  const struct xrchart_colour *colour;
+
+  if (spc->byvar)
+    byvar_width = var_get_width (spc->byvar);
+
+  xrchart_write_xscale (cr, geom, spc->x_min, spc->x_max, 5);
+  xrchart_write_yscale (cr, geom, spc->y_min, spc->y_max, 5);
+  xrchart_write_title (cr, geom, _("Scatterplot %s"), chart_item->title);
+  xrchart_write_xlabel (cr, geom, var_to_string (spc->xvar));
+  xrchart_write_ylabel (cr, geom, var_to_string (spc->yvar));
+
+  cairo_save (cr);
+  data = casereader_clone (spc->data);
+  for (; (c = casereader_read (data)) != NULL; case_unref (c))
+    {
+      if (spc->byvar)
+        {
+          const union value *val = case_data (c, spc->byvar);
+
+          /* Look the value up among the categories seen so far. */
+          for (i = 0; i < n_catvals; i++)
+            if (value_equal (&catvals[i], val, byvar_width))
+              break;
+
+          if (i == n_catvals) /* No entry found */
+            {
+              if (n_catvals < MAX_PLOT_CATS)
+                {
+                  struct string label;
+                  ds_init_empty (&label);
+                  if (var_is_value_missing (spc->byvar, val, MV_ANY))
+                    ds_put_cstr (&label, "missing");
+                  else
+                    var_append_value_name (spc->byvar, val, &label);
+                  value_clone (&catvals[n_catvals++], val, byvar_width);
+                  geom->n_datasets++;
+                  geom->dataset = xrealloc (geom->dataset,
+                                            geom->n_datasets * sizeof (*geom->dataset));
+
+                  /* xstrdup, like the xrealloc above, never returns NULL,
+                     so the legend never receives a null label. */
+                  geom->dataset[geom->n_datasets - 1] = xstrdup (ds_cstr (&label));
+                  ds_destroy (&label);
+                }
+              else /* Use the last plot category */
+                {
+                  if (spc->byvar_overflow != NULL)
+                    *(spc->byvar_overflow) = true;
+                  i--;
+                }
+            }
+        }
+      colour = &data_colour[i % XRCHART_N_COLOURS];
+      cairo_set_source_rgb (cr,
+                            colour->red / 255.0,
+                            colour->green / 255.0,
+                            colour->blue / 255.0);
+
+      xrchart_datum (cr, geom, 0,
+                     case_data (c, spc->xvar)->f,
+                     case_data (c, spc->yvar)->f);
+    }
+  casereader_destroy (data);
+  cairo_restore (cr);
+
+  for (i = 0; i < n_catvals; i++)
+    value_destroy (&catvals[i], byvar_width);
+
+  if (spc->byvar)
+    xrchart_write_legend (cr, geom);
+}
diff --git a/src/output/charts/scatterplot.c b/src/output/charts/scatterplot.c
new file mode 100644 (file)
index 0000000..920bc17
--- /dev/null
@@ -0,0 +1,73 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "output/charts/scatterplot.h"
+
+#include <gsl/gsl_cdf.h>
+
+#include "data/casereader.h"
+#include "libpspp/cast.h"
+#include "output/chart-item-provider.h"
+
+#include "gl/minmax.h"
+
+
+/* Allocates and returns a new scatterplot chart titled LABEL.
+
+   XVAR and YVAR are the plotted variables; BYVAR, which may be null,
+   selects the categories.  XMIN..XMAX and YMIN..YMAX are the axis ranges.
+   The pointer BYVAR_OVERFLOW is stored in the chart as given.
+
+   The chart keeps its own clone of READER, so the caller retains
+   ownership of READER. */
+struct scatterplot_chart *
+scatterplot_create (const struct casereader *reader,
+                   const struct variable *xvar,
+                   const struct variable *yvar,
+                   const struct variable *byvar,
+                   bool *byvar_overflow,
+                   const char *label,
+                   double xmin, double xmax, double ymin, double ymax)
+{
+  struct scatterplot_chart *chart = xzalloc (sizeof *chart);
+
+  chart_item_init (&chart->chart_item, &scatterplot_chart_class, label);
+
+  /* Data source and the variables read from it. */
+  chart->data = casereader_clone (reader);
+  chart->xvar = xvar;
+  chart->yvar = yvar;
+  chart->byvar = byvar;
+  chart->byvar_overflow = byvar_overflow;
+
+  /* Axis ranges. */
+  chart->x_min = xmin;
+  chart->x_max = xmax;
+  chart->y_min = ymin;
+  chart->y_max = ymax;
+
+  return chart;
+}
+
+/* Destructor installed in scatterplot_chart_class: releases the chart's
+   case reader and then the chart structure itself. */
+static void
+scatterplot_chart_destroy (struct chart_item *chart_item)
+{
+  struct scatterplot_chart *chart = to_scatterplot_chart (chart_item);
+
+  casereader_destroy (chart->data);
+  free (chart);
+}
+
+/* Class record for scatterplot charts.  Its address identifies a
+   chart_item as a scatterplot (see is_scatterplot_chart), and its only
+   initializer is the destructor. */
+const struct chart_item_class scatterplot_chart_class =
+  {
+    scatterplot_chart_destroy
+  };
diff --git a/src/output/charts/scatterplot.h b/src/output/charts/scatterplot.h
new file mode 100644 (file)
index 0000000..e95562b
--- /dev/null
@@ -0,0 +1,106 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef OUTPUT_CHARTS_SCATTERPLOT_H
+#define OUTPUT_CHARTS_SCATTERPLOT_H 1
+
+#include "output/chart-item.h"
+
+/* A scatterplot chart: a chart_item subclass that carries the data and
+   the axis ranges needed to draw the plot. */
+struct scatterplot_chart
+  {
+    struct chart_item chart_item;       /* Superclass; used for up-casts. */
+    struct casereader *data;            /* Cases to plot; owned by the chart. */
+
+    /* Variables for the x axis, the y axis and, if non-null, the
+       categories. */
+    const struct variable *xvar, *yvar, *byvar;
+
+    /* Axis ranges. */
+    double y_min, y_max;
+    double x_min, x_max;
+
+    /* If the number of distinct values of byvar exceeds a certain limit,
+       the flag this points to is set to true while the chart is drawn.
+       The flag is not owned by the chart, so it has to remain valid for
+       as long as the chart may be drawn. */
+    bool *byvar_overflow;
+  };
+
+/* Creates a scatterplot chart.  The arguments are, in order: the case
+   reader supplying the data, the x variable, the y variable, the optional
+   BY variable, a pointer to the overflow flag that is set while drawing,
+   the chart title, and the x and y axis ranges. */
+struct scatterplot_chart *
+scatterplot_create (const struct casereader *, 
+                   const struct variable *, 
+                   const struct variable *,
+                   const struct variable *,
+                   bool *,
+                   const char *label,
+                   double xmin, double xmax, double ymin, double ymax);
+\f
+/* This boilerplate for scatterplot_chart, a subclass of chart_item, was
+   autogenerated by mk-class-boilerplate. */
+
+#include <assert.h>
+#include "libpspp/cast.h"
+
+extern const struct chart_item_class scatterplot_chart_class;
+
+/* Returns true if SUPER is a scatterplot_chart, otherwise false.  The
+   check compares SUPER's class pointer with scatterplot_chart_class. */
+static inline bool
+is_scatterplot_chart (const struct chart_item *super)
+{
+  return super->class == &scatterplot_chart_class;
+}
+
+/* Returns SUPER converted to scatterplot_chart.  SUPER must be a
+   scatterplot_chart, as reported by is_scatterplot_chart; this is
+   checked with assert. */
+static inline struct scatterplot_chart *
+to_scatterplot_chart (const struct chart_item *super)
+{
+  assert (is_scatterplot_chart (super));
+  return UP_CAST (super, struct scatterplot_chart, chart_item);
+}
+
+/* Returns INSTANCE converted to chart_item, i.e. a non-const pointer to
+   the chart_item embedded in INSTANCE. */
+static inline struct chart_item *
+scatterplot_chart_super (const struct scatterplot_chart *instance)
+{
+  return CONST_CAST (struct chart_item *, &instance->chart_item);
+}
+
+/* Increments INSTANCE's reference count and returns INSTANCE.  The count
+   is kept by the embedded chart_item and updated via chart_item_ref. */
+static inline struct scatterplot_chart *
+scatterplot_chart_ref (const struct scatterplot_chart *instance)
+{
+  return to_scatterplot_chart (chart_item_ref (&instance->chart_item));
+}
+
+/* Decrements INSTANCE's reference count, then destroys INSTANCE if
+   the reference count is now zero.  Delegates to chart_item_unref. */
+static inline void
+scatterplot_chart_unref (struct scatterplot_chart *instance)
+{
+  chart_item_unref (&instance->chart_item);
+}
+
+/* Returns true if INSTANCE's reference count is greater than 1,
+   false otherwise.  Delegates to chart_item_is_shared. */
+static inline bool
+scatterplot_chart_is_shared (const struct scatterplot_chart *instance)
+{
+  return chart_item_is_shared (&instance->chart_item);
+}
+
+/* Submits INSTANCE for output by passing its embedded chart_item to
+   chart_item_submit.  NOTE(review): presumably this hands the caller's
+   reference over to the output subsystem -- confirm in
+   output/chart-item.h. */
+static inline void
+scatterplot_chart_submit (struct scatterplot_chart *instance)
+{
+  chart_item_submit (&instance->chart_item);
+}
+\f
+#endif /* output/charts/scatterplot.h */
index 7677b4c2b3d460ff517d0315850cf81a8cdb6a89..b9df0539e14c6bf54daf07539fd0a6fda15082d0 100644 (file)
@@ -312,6 +312,7 @@ TESTSUITE_AT = \
        tests/language/stats/crosstabs.at \
        tests/language/stats/descriptives.at \
        tests/language/stats/examine.at \
+       tests/language/stats/graph.at \
        tests/language/stats/factor.at \
        tests/language/stats/flip.at \
        tests/language/stats/frequencies.at \
diff --git a/tests/language/stats/graph.at b/tests/language/stats/graph.at
new file mode 100644 (file)
index 0000000..f4b125b
--- /dev/null
@@ -0,0 +1,149 @@
+AT_BANNER([GRAPH])
+
+AT_SETUP([GRAPH simple scatterplot])
+dnl Smoke test for GRAPH /SCATTERPLOT.  The syntax file generates 100
+dnl cases with Age drawn from RV.NORMAL(40,10), derives Size as
+dnl Age * 3 + 50, and requests one bivariate scatterplot of Age with Size.
+AT_DATA([scatterplot.sps],[
+* Simple Scatterplot test
+NEW FILE.
+INPUT PROGRAM.
+LOOP #i = 1 to 100.
+COMPUTE Age = RV.NORMAL(40,10).
+END CASE.
+END LOOP.
+END FILE.
+END INPUT PROGRAM.
+
+COMPUTE Size = Age * 3 + 50.
+
+GRAPH
+    /SCATTERPLOT(BIVARIATE) = Age WITH Size. 
+
+])
+
+dnl Only the exit status (0) is checked; stdout is ignored.  No expected
+dnl stderr is given, so Autotest's default (empty stderr) applies.
+AT_CHECK([pspp -O format=csv scatterplot.sps], [0], [ignore])
+
+AT_CLEANUP
+
+
+AT_SETUP([GRAPH Scatter and Histogram])
+dnl Runs a series of scatterplots and one histogram over 10000 generated
+dnl cases.  Age is drawn from RV.NORMAL(40,10); CityNum is
+dnl TRUNC(UNIFORM(2.95)); Size is computed from both; City is a string
+dnl recode of CityNum.  The scatterplots cover numeric and string BY
+dnl variables, and a BY variable before and after value labels are added.
+dnl
+dnl Every GRAPH command ends with an explicit period.  Without one, an
+dnl indented " GRAPH" line that directly follows the previous command is
+dnl read as a continuation of that command, not as the start of a new one.
+AT_DATA([scatterlong.sps],[
+NEW FILE.
+INPUT PROGRAM.
+LOOP #i = 1 to 10000.
+COMPUTE Age = RV.NORMAL(40,10).
+COMPUTE CityNum = TRUNC(UNIFORM(2.95)).
+END CASE.
+END LOOP.
+END FILE.
+END INPUT PROGRAM.
+
+COMPUTE Size = Age * 3 + 50 + 50*CityNum.
+
+STRING City (a20).
+
+Recode CityNum 
+       (0 = "Madrid")
+       (1 = "Paris")
+       (ELSE = "Stockholm")
+       into City.
+
+ GRAPH
+    /SCATTERPLOT(BIVARIATE) = Age WITH Size.
+
+ GRAPH
+    /SCATTERPLOT(BIVARIATE) = Age WITH CityNum.
+
+ GRAPH
+    /SCATTERPLOT = CityNum WITH Age.
+
+ GRAPH
+    /SCATTERPLOT = CityNum WITH Size.
+
+ GRAPH
+    /SCATTERPLOT(BIVARIATE) = Age WITH Size BY City.
+
+ GRAPH
+    /SCATTERPLOT(BIVARIATE) = Age WITH Size BY CityNum.
+
+ ADD VALUE LABELS 
+    /CityNum 1 'Rio' 2 'Tokyo' 0 'Mumbai'.
+
+ GRAPH
+    /SCATTERPLOT(BIVARIATE) = Age WITH Size BY CityNum.
+
+ GRAPH
+    /HISTOGRAM = Age.
+
+])
+
+dnl Only the exit status (0) is checked; stdout is ignored.  No expected
+dnl stderr is given, so Autotest's default (empty stderr) applies.
+AT_CHECK([pspp -O format=pdf scatterlong.sps], [0], [ignore])
+AT_CLEANUP
+
+AT_SETUP([GRAPH missing values don't crash])
+dnl Seven cases of x and y, where one case has x given as "." and another
+dnl has y given as ".".  A scatterplot and a histogram are requested
+dnl without a /MISSING subcommand.
+AT_DATA([scatter.sps], [dnl
+data list list /x * y *.
+begin data.
+1 0
+2 0
+. 0
+3 1
+4 1
+5 .
+6 1
+end data.
+graph 
+      /scatterplot = x with y.
+graph
+      /histogram = x. 
+])
+AT_CHECK([pspp -o pspp.pdf scatter.sps])
+dnl No-crash check: the output file pspp.pdf is never inspected.  AT_CHECK
+dnl is given only the command, so Autotest's defaults apply (exit status 0,
+dnl empty stdout and stderr).
+AT_CLEANUP
+
+AT_SETUP([GRAPH missing=VARIABLE no crash])
+dnl Same seven cases as the test without /MISSING: one case has x given as
+dnl "." and another has y given as ".".  Here both the scatterplot and the
+dnl histogram are run with /missing = VARIABLE.
+AT_DATA([scatter.sps], [dnl
+data list list /x * y *.
+begin data.
+1 0
+2 0
+. 0
+3 1
+4 1
+5 .
+6 1
+end data.
+graph 
+      /scatterplot = x with y
+      /missing = VARIABLE.
+graph
+      /histogram = x
+      /missing = VARIABLE.
+])
+AT_CHECK([pspp -o pspp.pdf scatter.sps])
+dnl No-crash check: the output file pspp.pdf is never inspected.  AT_CHECK
+dnl is given only the command, so Autotest's defaults apply (exit status 0,
+dnl empty stdout and stderr).
+AT_CLEANUP
+
+AT_SETUP([GRAPH missing value in by variable])
+dnl Seven cases of x, y and z, with z used as the BY variable of the
+dnl scatterplot.  One case has z given as "."; x and y each have one "."
+dnl as well.  The scatterplot is run twice: once with /missing = VARIABLE
+dnl and once without a /MISSING subcommand.
+AT_DATA([scatter.sps], [dnl
+data list list /x * y * z *.
+begin data.
+1 0 9
+2 0 9
+. 0 9
+3 1 .
+4 1 8
+5 . 8
+6 1 8
+end data.
+graph 
+      /scatterplot = x with y by z
+      /missing = VARIABLE.
+
+graph 
+      /scatterplot = x with y by z.
+
+])
+AT_CHECK([pspp -o pspp.pdf scatter.sps])
+dnl No-crash check: the output file pspp.pdf is never inspected.  AT_CHECK
+dnl is given only the command, so Autotest's defaults apply (exit status 0,
+dnl empty stdout and stderr).
+AT_CLEANUP
+
+