X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fmedian.c;fp=src%2Flanguage%2Fstats%2Fmedian.c;h=2db57ab76f025df50ee42797074981e2e09cfa7e;hb=1babb2ecc20dcae51f6cc639d39cce197d31dbdc;hp=0000000000000000000000000000000000000000;hpb=3b74aa9059aefe4ce82ecab1a5fd48cd251600dc;p=pspp

diff --git a/src/language/stats/median.c b/src/language/stats/median.c
new file mode 100644
index 0000000000..2db57ab76f
--- /dev/null
+++ b/src/language/stats/median.c
@@ -0,0 +1,461 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+*/
+
+#include <config.h>
+#include "median.h"
+
+#include <gsl/gsl_cdf.h>
+
+#include "data/format.h"
+#include "libpspp/cast.h"
+
+#include "data/variable.h"
+#include "data/case.h"
+#include "data/dictionary.h"
+#include "data/dataset.h"
+#include "data/casereader.h"
+#include "data/casewriter.h"
+#include "data/subcase.h"
+
+
+#include "data/casereader.h"
+#include "math/percentiles.h"
+
+#include "math/sort.h"
+
+#include "libpspp/hmap.h"
+#include "libpspp/array.h"
+#include "libpspp/str.h"
+#include "data/value.h"
+#include "libpspp/misc.h"
+
+#include "output/tab.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+
+struct val_node
+{
+  struct hmap_node node;
+  union value val;
+  casenumber le;
+  casenumber gt;
+};
+
+struct results
+{
+  const struct variable *var;
+  struct val_node **sorted_array;
+  double n;
+  double median;
+  double chisq;  
+};
+
+
+
+static int 
+val_node_cmp_3way (const void *a_, const void *b_, const void *aux)
+{
+  const struct variable *indep_var = aux;
+  const struct val_node *const *a = a_;
+  const struct val_node *const *b = b_;
+
+  return value_compare_3way (&(*a)->val, &(*b)->val, var_get_width (indep_var));
+}
+
+static void 
+show_frequencies (const struct n_sample_test *nst, const struct results *results,  int n_vals, const struct dictionary *);
+
+static void 
+show_test_statistics (const struct n_sample_test *nst, const struct results *results, int, const struct dictionary *);
+
+
+static struct val_node *
+find_value (const struct hmap *map, const union value *val, 
+	    const struct variable *var)
+{
+  struct val_node *foo = NULL;
+  size_t hash = value_hash (val, var_get_width (var), 0);
+  HMAP_FOR_EACH_WITH_HASH (foo, struct val_node, node, hash, map)
+    if (value_equal (val, &foo->val, var_get_width (var)))
+      break;
+
+  return foo;
+}
+
+void
+median_execute (const struct dataset *ds,
+		struct casereader *input,
+		enum mv_class exclude,
+		const struct npar_test *test,
+		bool exact UNUSED,
+		double timer UNUSED)
+{
+  const struct dictionary *dict = dataset_dict (ds);
+  const struct variable *wvar = dict_get_weight (dict);
+  bool warn = true;
+  int v;
+  const struct median_test *mt = UP_CAST (test, const struct median_test,
+					  parent.parent);
+
+  const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test,
+					  parent);
+
+  const bool n_sample_test = (value_compare_3way (&nst->val2, &nst->val1,
+				       var_get_width (nst->indep_var)) > 0);
+
+  struct results *results = xcalloc (nst->n_vars, sizeof (*results));
+  int n_vals = 0;
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      double count = 0;
+      double cc = 0;
+      double median = mt->median;
+      const struct variable *var = nst->vars[v];
+      struct ccase *c;
+      struct hmap map = HMAP_INITIALIZER (map);
+      struct casereader *r = casereader_clone (input);
+
+
+
+      if (n_sample_test == false)
+	{
+	  struct val_node *vn = xzalloc (sizeof *vn);
+	  value_clone (&vn->val,  &nst->val1, var_get_width (nst->indep_var));
+	  hmap_insert (&map, &vn->node, value_hash (&nst->val1,
+					    var_get_width (nst->indep_var), 0));
+
+	  vn = xzalloc (sizeof *vn);
+	  value_clone (&vn->val,  &nst->val2, var_get_width (nst->indep_var));
+	  hmap_insert (&map, &vn->node, value_hash (&nst->val2,
+					    var_get_width (nst->indep_var), 0));
+	}
+
+      if ( median == SYSMIS)
+	{
+	  struct percentile *ptl;
+	  struct order_stats *os;
+
+	  struct casereader *rr;
+	  struct subcase sc;
+	  struct casewriter *writer;
+	  subcase_init_var (&sc, var, SC_ASCEND);
+	  rr = casereader_clone (r);
+	  writer = sort_create_writer (&sc, casereader_get_proto (rr));
+
+	  for (; (c = casereader_read (rr)) != NULL; )
+	    {
+	      if ( var_is_value_missing (var, case_data (c, var), exclude))
+		{
+		  case_unref (c);
+		  continue;
+		}
+
+	      cc += dict_get_case_weight (dict, c, &warn);
+	      casewriter_write (writer, c);
+	    }
+	  subcase_destroy (&sc);
+	  casereader_destroy (rr);
+
+	  rr = casewriter_make_reader (writer);
+
+	  ptl = percentile_create (0.5, cc);
+	  os = &ptl->parent;
+	    
+	  order_stats_accumulate (&os, 1,
+				  rr,
+				  wvar,
+				  var,
+				  exclude);
+
+	  median = percentile_calculate (ptl, PC_HAVERAGE);
+	  statistic_destroy (&ptl->parent.parent);
+	}
+
+      results[v].median = median;
+      
+
+      for (; (c = casereader_read (r)) != NULL; case_unref (c))
+	{
+	  struct val_node *vn ;
+	  const double weight = dict_get_case_weight (dict, c, &warn);
+	  const union value *val = case_data (c, var);
+	  const union value *indep_val = case_data (c, nst->indep_var);
+
+	  if ( var_is_value_missing (var, case_data (c, var), exclude))
+	    {
+	      case_unref (c);
+	      continue;
+	    }
+
+	  if (n_sample_test)
+	    {
+	      int width = var_get_width (nst->indep_var);
+	      /* Ignore out of range values */
+	      if (
+		  value_compare_3way (indep_val, &nst->val1, width) < 0
+		||
+		  value_compare_3way (indep_val, &nst->val2, width) > 0
+		   )
+		{
+		  case_unref (c);
+		  continue;
+		}
+	    }
+
+	  vn = find_value (&map, indep_val, nst->indep_var);
+	  if ( vn == NULL)
+	    {
+	      if ( n_sample_test == true)
+		{
+		  int width = var_get_width (nst->indep_var);
+		  vn = xzalloc (sizeof *vn);
+		  value_clone (&vn->val,  indep_val, width);
+		  
+		  hmap_insert (&map, &vn->node, value_hash (indep_val, width, 0));
+		}
+	      else
+		{
+		  continue;
+		}
+	    }
+
+	  if (val->f <= median)
+	    vn->le += weight;
+	  else
+	    vn->gt += weight;
+
+	  count += weight;
+	}
+      casereader_destroy (r);
+
+      {
+	int x = 0;
+	struct val_node *vn = NULL;
+	double r_0 = 0;
+	double r_1 = 0;
+	HMAP_FOR_EACH (vn, struct val_node, node, &map)
+	  {
+	    r_0 += vn->le;
+	    r_1 += vn->gt;
+	  }
+
+	results[v].n = count;
+	results[v].sorted_array = xcalloc (hmap_count (&map), sizeof (void*));
+	results[v].var = var;
+
+	HMAP_FOR_EACH (vn, struct val_node, node, &map)
+	  {
+	    double e_0j = r_0 * (vn->le + vn->gt) / count;
+	    double e_1j = r_1 * (vn->le + vn->gt) / count;
+
+	    results[v].chisq += pow2 (vn->le - e_0j) / e_0j;
+	    results[v].chisq += pow2 (vn->gt - e_1j) / e_1j;
+
+	    results[v].sorted_array[x++] = vn;
+	  }
+
+	n_vals = x;
+	hmap_destroy (&map);
+
+	sort (results[v].sorted_array, x, sizeof (*results[v].sorted_array), 
+	      val_node_cmp_3way, nst->indep_var);
+
+      }
+    }
+
+  casereader_destroy (input);
+
+  show_frequencies (nst, results,  n_vals, dict);
+  show_test_statistics (nst, results, n_vals, dict);
+
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      int i;
+      const struct results *rs = results + v;
+
+      for (i = 0; i < n_vals; ++i)
+	{
+	  struct val_node *vn = rs->sorted_array[i];
+	  value_destroy (&vn->val, var_get_width (nst->indep_var));
+	  free (vn);
+	}
+      free (rs->sorted_array);
+    }
+  free (results);
+}
+
+
+
+static void 
+show_frequencies (const struct n_sample_test *nst, const struct results *results,  int n_vals, const struct dictionary *dict)
+{
+  const struct variable *weight = dict_get_weight (dict);
+  const struct fmt_spec *wfmt = weight ? var_get_print_format (weight) : &F_8_0;
+
+  int i;
+  int v;
+
+  const int row_headers = 2;
+  const int column_headers = 2;
+  const int nc = row_headers + n_vals;
+  const int nr = column_headers + nst->n_vars * 2;
+    
+  struct tab_table *table = tab_create (nc, nr);
+
+  tab_headers (table, row_headers, 0, column_headers, 0);
+
+  tab_title (table, _("Frequencies"));
+
+  /* Box around the table and vertical lines inside*/
+  tab_box (table, TAL_2, TAL_2, -1, TAL_1,
+	   0,  0, tab_nc (table) - 1, tab_nr (table) - 1 );
+
+  tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers);
+  tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1);
+
+  tab_joint_text (table,
+		  row_headers, 0, row_headers + n_vals - 1, 0,
+		  TAT_TITLE | TAB_CENTER, var_to_string (nst->indep_var));
+
+
+  tab_hline (table, TAL_1, row_headers, tab_nc (table) - 1, 1);
+
+
+  for (i = 0; i < n_vals; ++i)
+    {
+      const struct results *rs = results + 0;
+      struct string label;
+      ds_init_empty (&label);
+
+      var_append_value_name (nst->indep_var, &rs->sorted_array[i]->val,
+			    &label);
+
+      tab_text (table, row_headers + i, 1,
+		TAT_TITLE | TAB_LEFT, ds_cstr (&label));
+  
+      ds_destroy (&label);
+    }
+
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      const struct results *rs = &results[v];
+      tab_text (table,  0, column_headers + v * 2,
+		TAT_TITLE | TAB_LEFT, var_to_string (rs->var) );
+
+      tab_text (table,  1, column_headers + v * 2,
+		TAT_TITLE | TAB_LEFT, _("> Median") );
+
+      tab_text (table,  1, column_headers + v * 2 + 1,
+		TAT_TITLE | TAB_LEFT, _("â¤ Median") );
+
+      if ( v > 0)
+	tab_hline (table, TAL_1, 0, tab_nc (table) - 1, column_headers + v * 2);
+    }
+
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      int i;
+      const struct results *rs = &results[v];
+
+      for (i = 0; i < n_vals; ++i)
+	{
+	  const struct val_node *vn = rs->sorted_array[i];
+	  tab_double (table, row_headers + i, column_headers + v * 2,
+		      0, vn->gt, wfmt);
+
+	  tab_double (table, row_headers + i, column_headers + v * 2 + 1,
+		    0, vn->le, wfmt);
+	}
+    }
+
+  tab_submit (table);
+}
+
+
+static void 
+show_test_statistics (const struct n_sample_test *nst,
+		      const struct results *results,
+		      int n_vals,
+		      const struct dictionary *dict)
+{
+  const struct variable *weight = dict_get_weight (dict);
+  const struct fmt_spec *wfmt = weight ? var_get_print_format (weight) : &F_8_0;
+
+  int v;
+
+  const int row_headers = 1;
+  const int column_headers = 1;
+  const int nc = row_headers + 5;
+  const int nr = column_headers + nst->n_vars;
+    
+  struct tab_table *table = tab_create (nc, nr);
+
+  tab_headers (table, row_headers, 0, column_headers, 0);
+
+  tab_title (table, _("Test Statistics"));
+
+
+  tab_box (table, TAL_2, TAL_2, -1, TAL_1,
+	   0,  0, tab_nc (table) - 1, tab_nr (table) - 1 );
+
+  tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers);
+  tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1);
+
+  tab_text (table, row_headers + 0, 0,
+	    TAT_TITLE | TAB_CENTER, _("N"));
+
+  tab_text (table, row_headers + 1, 0,
+	    TAT_TITLE | TAB_CENTER, _("Median"));
+
+  tab_text (table, row_headers + 2, 0,
+	    TAT_TITLE | TAB_CENTER, _("Chi-Square"));
+
+  tab_text (table, row_headers + 3, 0,
+	    TAT_TITLE | TAB_CENTER, _("df"));
+
+  tab_text (table, row_headers + 4, 0,
+	    TAT_TITLE | TAB_CENTER, _("Asymp. Sig."));
+
+
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      double df = n_vals - 1;
+      const struct results *rs = &results[v];
+      tab_text (table,  0, column_headers + v,
+		TAT_TITLE | TAB_LEFT, var_to_string (rs->var));
+
+
+      tab_double (table, row_headers + 0, column_headers + v,
+		  0, rs->n, wfmt);
+
+      tab_double (table, row_headers + 1, column_headers + v,
+		  0, rs->median, NULL);
+
+      tab_double (table, row_headers + 2, column_headers + v,
+		  0, rs->chisq, NULL);
+
+      tab_double (table, row_headers + 3, column_headers + v,
+		  0, df, wfmt);
+
+      tab_double (table, row_headers + 4, column_headers + v,
+		  0, gsl_cdf_chisq_Q (rs->chisq, df), NULL);
+    }
+  
+  tab_submit (table);
+}