Added documentation for the MEANS command

[pspp-builds.git] / doc / statistics.texi
diff --git a/doc/statistics.texi b/doc/statistics.texi

index 36727c44c5f75dfa7620e900029875d4c2dd874b..edd96d0990bb3fcd050a8490efc36d6560855f39 100644 (file)
--- a/doc/statistics.texi
+++ b/doc/statistics.texi
@@ -11,6 +11,7 @@ far.
  * CORRELATIONS::                Correlation tables.
  * CROSSTABS::                   Crosstabulation tables.
  * FACTOR::                      Factor analysis and Principal Components analysis
+* MEANS::                       Average values and other statistics.
  * NPAR TESTS::                  Nonparametric tests.
  * T-TEST::                      Test hypotheses about means.
  * ONEWAY::                      One way analysis of variance.
@@ -258,7 +259,7 @@ useful there is more than one dependent variable and at least one factor.   If
  containing boxplots for all the factors.
  If /COMPARE=VARIABLES is specified, then one plot per factor is produced, each 
  each containing one boxplot per dependent variable.
-If the /COMPARE subcommand is ommitted, then PSPP uses the default value of 
+If the /COMPARE subcommand is omitted, then PSPP uses the default value of 
  /COMPARE=GROUPS.
   
  The ID subcommand also pertains to boxplots.  If given, it must
@@ -361,7 +362,6 @@ CROSSTABS
          /MISSING=@{TABLE,INCLUDE,REPORT@}
          /WRITE=@{NONE,CELLS,ALL@}
          /FORMAT=@{TABLES,NOTABLES@}
-                @{LABELS,NOLABELS,NOVALLABS@}
                  @{PIVOT,NOPIVOT@}
                  @{AVALUE,DVALUE@}
                  @{NOINDEX,INDEX@}
@@ -419,11 +419,6 @@ settings:
  TABLES, the default, causes crosstabulation tables to be output.
  NOTABLES suppresses them.
  
-@item
-LABELS, the default, allows variable labels and value labels to appear
-in the output.  NOLABELS suppresses them.  NOVALLABS displays variable
-labels but suppresses value labels.
-
  @item
  PIVOT, the default, causes each TABLES subcommand to be displayed in a
  pivot table format.  NOPIVOT causes the old-style crosstabulation format
@@ -556,7 +551,7 @@ FACTOR  VARIABLES=var_list
  
          [ /ROTATION=@{VARIMAX, EQUAMAX, QUARTIMAX, NOROTATE@}]
  
-        [ /PRINT=[INITIAL] [EXTRACTION] [ROTATION] [UNIVARIATE] [CORRELATION] [COVARIANCE] [DET] [SIG] [ALL] [DEFAULT] ]
+        [ /PRINT=[INITIAL] [EXTRACTION] [ROTATION] [UNIVARIATE] [CORRELATION] [COVARIANCE] [DET] [KMO] [SIG] [ALL] [DEFAULT] ]
  
          [ /PLOT=[EIGEN] ]
  
@@ -601,6 +596,8 @@ The /PRINT subcommand may be used to select which features of the analysis are r
        The covariance matrix is printed.
  @item DET
        The determinant of the correlation or covariance matrix is printed.
+@item KMO
+      The Kaiser-Meyer-Olkin measure of sampling adequacy and the Bartlett test of sphericity are printed.
  @item SIG
        The significance of the elements of correlation matrix is printed.
  @item ALL
@@ -609,7 +606,7 @@ The /PRINT subcommand may be used to select which features of the analysis are r
        Identical to INITIAL and EXTRACTION.
  @end itemize
  
-If /PLOT=EIGEN is given, then a ``Scree'' plot of the eigenvalues will be printed.  This can be useful for visualising
+If /PLOT=EIGEN is given, then a ``Scree'' plot of the eigenvalues will be printed.  This can be useful for visualizing
  which factors (components) should be retained.
  
  The /FORMAT subcommand determined how data are to be displayed in loading matrices.  If SORT is specified, then the variables
@@ -638,7 +635,145 @@ contains a missing value.
  If PAIRWISE is set, then a case is considered missing only if either of the
  values  for the particular coefficient are missing.
  The default is LISTWISE.
- 
+
+@node MEANS
+@section MEANS
+
+@vindex MEANS
+@cindex means
+
+@display 
+MEANS [TABLES =] 
+      @{varlist@} 
+        [ BY @{varlist@} [BY @{varlist@} [BY @{varlist@} @dots{} ]]]
+
+      [ /@{varlist@} 
+         [ BY @{varlist@} [BY @{varlist@} [BY @{varlist@} @dots{} ]]] ]
+
+      [/CELLS = [MEAN] [COUNT] [STDDEV] [SEMEAN] [SUM] [MIN] [MAX] [RANGE]
+        [VARIANCE] [KURT] [SEKURT] 
+        [SKEW] [SESKEW] [FIRST] [LAST] 
+        [HARMONIC] [GEOMETRIC] 
+        [DEFAULT]
+        [ALL]
+        [NONE] ]
+
+      [/MISSING = [TABLE] [INCLUDE] [DEPENDENT]]
+@end display 
+
+You can use the MEANS command to calculate the arithmetic mean and similar
+statistics, either for the dataset as a whole or for categories of data.
+
+The simplest form of the command is
+@example
+MEANS @var{v}.
+@end example
+@noindent which calculates the mean, count and standard deviation for @var{v}.
+If you specify a grouping variable, for example
+@example
+MEANS @var{v} BY @var{g}.
+@end example
+@noindent then the means, counts and standard deviations for @var{v} after having
+been grouped by @var{g} will be calculated.
+Instead of the mean, count and standard deviation, you could specify the statistics
+in which you are interested:
+@example
+MEANS @var{x} @var{y} BY @var{g}
+      /CELLS = HARMONIC SUM MIN.
+@end example
+This example calculates the harmonic mean, the sum and the minimum values of @var{x} and @var{y}
+grouped by @var{g}.
+
+The CELLS subcommand specifies which statistics to calculate.  The available statistics
+are:
+@itemize
+@item MEAN
+@cindex arithmetic mean
+      The arithmetic mean.
+@item COUNT
+      The count of the values.
+@item STDDEV
+      The standard deviation.
+@item SEMEAN
+      The standard error of the mean.
+@item SUM
+      The sum of the values.
+@item MIN
+      The minimum value.
+@item MAX
+      The maximum value.
+@item RANGE
+      The difference between the maximum and minimum values.
+@item VARIANCE
+      The variance.
+@item FIRST
+      The first value in the category.
+@item LAST
+      The last value in the category.
+@item SKEW
+      The skewness.
+@item SESKEW
+      The standard error of the skewness.
+@item KURT
+      The kurtosis.
+@item SEKURT
+      The standard error of the kurtosis.
+@item HARMONIC
+@cindex harmonic mean
+      The harmonic mean.
+@item GEOMETRIC
+@cindex geometric mean
+      The geometric mean.
+@end itemize
+
+In addition, three special keywords are recognized:
+@itemize
+@item DEFAULT
+      This is the same as MEAN COUNT STDDEV.
+@item ALL
+      All of the above statistics will be calculated.
+@item NONE
+      No statistics will be calculated (only a summary will be shown).
+@end itemize
+
+
+More than one @dfn{table} can be specified in a single command. 
+Each table is separated by a @samp{/}. For
+example
+@example
+MEANS TABLES =
+      @var{c} @var{d} @var{e} BY @var{x}
+      /@var{a} @var{b} BY @var{x} @var{y}
+      /@var{f} BY @var{y} BY @var{z}.
+@end example
+has three tables (the @samp{TABLES =} is optional).
+The first table has three dependent variables @var{c}, @var{d} and @var{e}
+and a single categorical variable @var{x}.
+The second table has two dependent variables @var{a} and @var{b}, 
+and two categorical variables @var{x} and @var{y}.
+The third table has a single dependent variable @var{f}
+and a categorical variable formed by the combination of @var{y} and @var{z}.
+
+
+By default values are omitted from the analysis only if missing values
+(either system missing or user missing)
+for any of the variables directly involved in their calculation are 
+encountered.
+This behavior can be modified with the /MISSING subcommand.
+Three options are possible: TABLE, INCLUDE and DEPENDENT.
+
+/MISSING = TABLE causes cases to be dropped if any variable is missing 
+in the table specification currently being processed, regardless of 
+whether it is needed to calculate the statistic.
+
+/MISSING = INCLUDE says that user missing values, either in the dependent
+variables or in the categorical variables, should be taken at their face
+value, and not excluded.
+
+/MISSING = DEPENDENT says that user missing values in the dependent
+variables should be taken at their face value; however, cases which
+have user missing values for the categorical variables should be omitted
+from the calculation.
  
  @node NPAR TESTS
  @section NPAR TESTS
@@ -685,9 +820,11 @@ is used.
  * COCHRAN::                 Cochran Q Test
  * FRIEDMAN::                Friedman Test
  * KENDALL::                 Kendall's W Test
+* KOLMOGOROV-SMIRNOV::      Kolmogorov-Smirnov Test
  * KRUSKAL-WALLIS::          Kruskal-Wallis Test
  * MANN-WHITNEY::            Mann Whitney U Test
  * MCNEMAR::                 McNemar Test
+* MEDIAN::                  Median Test
  * RUNS::                    Runs Test
  * SIGN::                    The Sign Test
  * WILCOXON::                Wilcoxon Signed Ranks Test
@@ -816,6 +953,45 @@ It has the range [0,1] --- a value of zero indicates no agreement between the sa
  unity indicates complete agreement.
  
  
+@node KOLMOGOROV-SMIRNOV
+@subsection Kolmogorov-Smirnov Test
+@vindex KOLMOGOROV-SMIRNOV
+@vindex K-S
+@cindex Kolmogorov-Smirnov test
+
+@display
+     [ /KOLMOGOROV-SMIRNOV (@{NORMAL [@var{mu}, @var{sigma}], UNIFORM [@var{min}, @var{max}], POISSON [@var{lambda}], EXPONENTIAL [@var{scale}] @}) = varlist ]
+@end display
+
+The one sample Kolmogorov-Smirnov subcommand is used to test whether or not a dataset is
+drawn from a particular distribution.  Four distributions are supported, @i{viz:}
+Normal, Uniform, Poisson and Exponential.
+
+Ideally you should provide the parameters of the distribution against which you wish to test
+the data. For example, with the normal distribution, the mean (@var{mu}) and standard deviation (@var{sigma})
+should be given; with the uniform distribution, the minimum (@var{min}) and maximum (@var{max}) value should
+be provided.
+However, if the parameters are omitted they will be imputed from the data. Imputing the
+parameters reduces the power of the test so should be avoided if possible.
+
+In the following example, two variables @var{score} and @var{age} are tested to see if
+they follow a normal distribution with a mean of 3.5 and a standard deviation of 2.0.
+@example
+  NPAR TESTS
+        /KOLMOGOROV-SMIRNOV (normal 3.5 2.0) = @var{score} @var{age}.
+@end example
+If the variables need to be tested against different distributions, then a separate
+subcommand must be used.  For example the following syntax tests @var{score} against
+a normal distribution with mean of 3.5 and standard deviation of 2.0 whilst @var{age}
+is tested against a normal distribution of mean 40 and standard deviation 1.5.
+@example
+  NPAR TESTS
+        /KOLMOGOROV-SMIRNOV (normal 3.5 2.0) = @var{score}
+        /KOLMOGOROV-SMIRNOV (normal 40 1.5) =  @var{age}.
+@end example
+
+The abbreviated subcommand  K-S may be used in place of KOLMOGOROV-SMIRNOV.
+
  @node KRUSKAL-WALLIS
  @subsection Kruskal-Wallis Test
  @vindex KRUSKAL-WALLIS
@@ -889,13 +1065,38 @@ The data in each variable must be dichotomous.  If there are more
  than two distinct variables an error will occur and the test will
  not be run.
  
+@node MEDIAN
+@subsection Median Test
+@vindex MEDIAN
+@cindex Median test
+
+@display
+     [ /MEDIAN [(value)] = varlist BY variable (value1, value2) ]
+@end display
+
+The median test is used to test whether independent samples come from 
+populations with a common median.
+The median of the populations against which the samples are to be tested
+may be given in parentheses immediately after the 
+/MEDIAN subcommand.  If it is not given, the median will be imputed from the 
+union of all the samples.
+
+The variables of the samples to be tested should immediately follow the @samp{=} sign. The
+keyword @code{BY} must come next, and then the grouping variable.  Two values
+in parentheses should follow.  If the first value is greater than the second,
+then a 2 sample test is performed using these two values to determine the groups.
+If, however, the first value is less than the second, then a @i{k} sample test is
+conducted and the group values used are all values encountered which lie in the
+range [@var{value1},@var{value2}].
+
+
  @node RUNS
  @subsection Runs Test
  @vindex RUNS
  @cindex runs test
  
  @display 
-     [ /RUNS (@{MEAN, MEDIAN, MODE, value@}) varlist ]
+     [ /RUNS (@{MEAN, MEDIAN, MODE, value@})  = varlist ]
  @end display
  
  The /RUNS subcommand tests whether a data sequence is randomly ordered.
@@ -1023,7 +1224,7 @@ which they would be needed. This is the default.
  
  
  @menu
-* One Sample Mode::             Testing against a hypothesised mean
+* One Sample Mode::             Testing against a hypothesized mean
  * Independent Samples Mode::    Testing two independent groups for equal mean
  * Paired Samples Mode::         Testing two interdependent groups for equal mean
  @end menu
@@ -1032,7 +1233,7 @@ which they would be needed. This is the default.
  @subsection One Sample Mode
  
  The @cmd{TESTVAL} subcommand invokes the One Sample mode.
-This mode is used to test a population mean against a hypothesised
+This mode is used to test a population mean against a hypothesized
  mean. 
  The value given to the @cmd{TESTVAL} subcommand is the value against
  which you wish to test.
@@ -1291,7 +1492,7 @@ RELIABILITY
  @end display
  
  @cindex Cronbach's Alpha
-The @cmd{RELIABILTY} command performs reliablity analysis on the data.
+The @cmd{RELIABILITY} command performs reliability analysis on the data.
  
  The VARIABLES subcommand is required. It determines the set of variables 
  upon which analysis is to be performed.
@@ -1326,7 +1527,7 @@ analysis tested against the totals.
  @section ROC
  
  @vindex ROC
-@cindex Receiver Operating Characterstic
+@cindex Receiver Operating Characteristic
  @cindex Area under curve
  
  @display
@@ -1390,5 +1591,3 @@ exclude them.
  Cases are excluded on a listwise basis; if any of the variables in @var{var_list} 
  or if the variable @var{state_var} is missing, then the entire case will be 
  excluded.
-
-