Increment version to 0.7.9 to send to Translation Project.

[pspp-builds.git] / doc / statistics.texi
diff --git a/doc/statistics.texi b/doc/statistics.texi

index 36727c44c5f75dfa7620e900029875d4c2dd874b..edd96d0990bb3fcd050a8490efc36d6560855f39 100644 (file)
--- a/doc/statistics.texi
+++ b/doc/statistics.texi
@@ -11,6 +11,7 @@ far.
  * CORRELATIONS::                Correlation tables.
  * CROSSTABS::                   Crosstabulation tables.
  * FACTOR::                      Factor analysis and Principal Components analysis
  * CORRELATIONS::                Correlation tables.
  * CROSSTABS::                   Crosstabulation tables.
  * FACTOR::                      Factor analysis and Principal Components analysis
+* MEANS::                       Average values and other statistics.
  * NPAR TESTS::                  Nonparametric tests.
  * T-TEST::                      Test hypotheses about means.
  * ONEWAY::                      One way analysis of variance.
  * NPAR TESTS::                  Nonparametric tests.
  * T-TEST::                      Test hypotheses about means.
  * ONEWAY::                      One way analysis of variance.
@@ -258,7 +259,7 @@ useful there is more than one dependent variable and at least one factor.   If
  containing boxplots for all the factors.
  If /COMPARE=VARIABLES is specified, then one plot per factor is produced, each 
  containing one boxplot per dependent variable.
  containing boxplots for all the factors.
  If /COMPARE=VARIABLES is specified, then one plot per factor is produced, each 
  containing one boxplot per dependent variable.
-If the /COMPARE subcommand is ommitted, then PSPP uses the default value of 
+If the /COMPARE subcommand is omitted, then PSPP uses the default value of 
  /COMPARE=GROUPS.
   
  The ID subcommand also pertains to boxplots.  If given, it must
  /COMPARE=GROUPS.
   
  The ID subcommand also pertains to boxplots.  If given, it must
@@ -361,7 +362,6 @@ CROSSTABS
          /MISSING=@{TABLE,INCLUDE,REPORT@}
          /WRITE=@{NONE,CELLS,ALL@}
          /FORMAT=@{TABLES,NOTABLES@}
          /MISSING=@{TABLE,INCLUDE,REPORT@}
          /WRITE=@{NONE,CELLS,ALL@}
          /FORMAT=@{TABLES,NOTABLES@}
-                @{LABELS,NOLABELS,NOVALLABS@}
                  @{PIVOT,NOPIVOT@}
                  @{AVALUE,DVALUE@}
                  @{NOINDEX,INDEX@}
                  @{PIVOT,NOPIVOT@}
                  @{AVALUE,DVALUE@}
                  @{NOINDEX,INDEX@}
@@ -419,11 +419,6 @@ settings:
  TABLES, the default, causes crosstabulation tables to be output.
  NOTABLES suppresses them.
  
  TABLES, the default, causes crosstabulation tables to be output.
  NOTABLES suppresses them.
  
-@item
-LABELS, the default, allows variable labels and value labels to appear
-in the output.  NOLABELS suppresses them.  NOVALLABS displays variable
-labels but suppresses value labels.
-
  @item
  PIVOT, the default, causes each TABLES subcommand to be displayed in a
  pivot table format.  NOPIVOT causes the old-style crosstabulation format
  @item
  PIVOT, the default, causes each TABLES subcommand to be displayed in a
  pivot table format.  NOPIVOT causes the old-style crosstabulation format
@@ -556,7 +551,7 @@ FACTOR  VARIABLES=var_list
  
          [ /ROTATION=@{VARIMAX, EQUAMAX, QUARTIMAX, NOROTATE@}]
  
  
          [ /ROTATION=@{VARIMAX, EQUAMAX, QUARTIMAX, NOROTATE@}]
  
-        [ /PRINT=[INITIAL] [EXTRACTION] [ROTATION] [UNIVARIATE] [CORRELATION] [COVARIANCE] [DET] [SIG] [ALL] [DEFAULT] ]
+        [ /PRINT=[INITIAL] [EXTRACTION] [ROTATION] [UNIVARIATE] [CORRELATION] [COVARIANCE] [DET] [KMO] [SIG] [ALL] [DEFAULT] ]
  
          [ /PLOT=[EIGEN] ]
  
  
          [ /PLOT=[EIGEN] ]
  
@@ -601,6 +596,8 @@ The /PRINT subcommand may be used to select which features of the analysis are r
        The covariance matrix is printed.
  @item DET
        The determinant of the correlation or covariance matrix is printed.
        The covariance matrix is printed.
  @item DET
        The determinant of the correlation or covariance matrix is printed.
+@item KMO
+      The Kaiser-Meyer-Olkin measure of sampling adequacy and the Bartlett test of sphericity are printed.
  @item SIG
        The significance of the elements of the correlation matrix is printed.
  @item ALL
  @item SIG
        The significance of the elements of the correlation matrix is printed.
  @item ALL
@@ -609,7 +606,7 @@ The /PRINT subcommand may be used to select which features of the analysis are r
        Identical to INITIAL and EXTRACTION.
  @end itemize
  
        Identical to INITIAL and EXTRACTION.
  @end itemize
  
-If /PLOT=EIGEN is given, then a ``Scree'' plot of the eigenvalues will be printed.  This can be useful for visualising
+If /PLOT=EIGEN is given, then a ``Scree'' plot of the eigenvalues will be printed.  This can be useful for visualizing
  which factors (components) should be retained.
  
  The /FORMAT subcommand determines how data are to be displayed in loading matrices.  If SORT is specified, then the variables
  which factors (components) should be retained.
  
  The /FORMAT subcommand determines how data are to be displayed in loading matrices.  If SORT is specified, then the variables
@@ -638,7 +635,145 @@ contains a missing value.
  If PAIRWISE is set, then a case is considered missing only if either of the
  values  for the particular coefficient are missing.
  The default is LISTWISE.
  If PAIRWISE is set, then a case is considered missing only if either of the
  values  for the particular coefficient are missing.
  The default is LISTWISE.
- 
+
+@node MEANS
+@section MEANS
+
+@vindex MEANS
+@cindex means
+
+@display 
+MEANS [TABLES =] 
+      @{varlist@} 
+        [ BY @{varlist@} [BY @{varlist@} [BY @{varlist@} @dots{} ]]]
+
+      [ /@{varlist@} 
+         [ BY @{varlist@} [BY @{varlist@} [BY @{varlist@} @dots{} ]]] ]
+
+      [/CELLS = [MEAN] [COUNT] [STDDEV] [SEMEAN] [SUM] [MIN] [MAX] [RANGE]
+        [VARIANCE] [KURT] [SEKURT] 
+        [SKEW] [SESKEW] [FIRST] [LAST] 
+        [HARMONIC] [GEOMETRIC] 
+        [DEFAULT]
+        [ALL]
+        [NONE] ]
+
+      [/MISSING = [TABLE] [INCLUDE] [DEPENDENT]]
+@end display 
+
+You can use the MEANS command to calculate the arithmetic mean and similar
+statistics, either for the dataset as a whole or for categories of data.
+
+The simplest form of the command is
+@example
+MEANS @var{v}.
+@end example
+@noindent which calculates the mean, count and standard deviation for @var{v}.
+If you specify a grouping variable, for example
+@example
+MEANS @var{v} BY @var{g}.
+@end example
+@noindent then the means, counts and standard deviations for @var{v} after having
+been grouped by @var{g} will be calculated.
+Instead of the mean, count and standard deviation, you could specify the statistics
+in which you are interested:
+@example
+MEANS @var{x} @var{y} BY @var{g}
+      /CELLS = HARMONIC SUM MIN.
+@end example
+This example calculates the harmonic mean, the sum and the minimum values of @var{x} and @var{y}
+grouped by @var{g}.
+
+The CELLS subcommand specifies which statistics to calculate.  The available statistics
+are:
+@itemize
+@item MEAN
+@cindex arithmetic mean
+      The arithmetic mean.
+@item COUNT
+      The count of the values.
+@item STDDEV
+      The standard deviation.
+@item SEMEAN
+      The standard error of the mean.
+@item SUM
+      The sum of the values.
+@item MIN
+      The minimum value.
+@item MAX
+      The maximum value.
+@item RANGE
+      The difference between the maximum and minimum values.
+@item VARIANCE
+      The variance.
+@item FIRST
+      The first value in the category.
+@item LAST
+      The last value in the category.
+@item SKEW
+      The skewness.
+@item SESKEW
+      The standard error of the skewness.
+@item KURT
+      The kurtosis.
+@item SEKURT
+      The standard error of the kurtosis.
+@item HARMONIC
+@cindex harmonic mean
+      The harmonic mean.
+@item GEOMETRIC
+@cindex geometric mean
+      The geometric mean.
+@end itemize
+
+In addition, three special keywords are recognized:
+@itemize
+@item DEFAULT
+      This is the same as MEAN COUNT STDDEV.
+@item ALL
+      All of the above statistics will be calculated.
+@item NONE
+      No statistics will be calculated (only a summary will be shown).
+@end itemize
+
+
+More than one @dfn{table} can be specified in a single command. 
+Each table is separated by a @samp{/}. For
+example
+@example
+MEANS TABLES =
+      @var{c} @var{d} @var{e} BY @var{x}
+      /@var{a} @var{b} BY @var{x} @var{y}
+      /@var{f} BY @var{y} BY @var{z}.
+@end example
+has three tables (the @samp{TABLES =} is optional).
+The first table has three dependent variables @var{c}, @var{d} and @var{e}
+and a single categorical variable @var{x}.
+The second table has two dependent variables @var{a} and @var{b}, 
+and two categorical variables @var{x} and @var{y}.
+The third table has a single dependent variable @var{f}
+and a categorical variable formed by the combination of @var{y} and @var{z}.
+
+
+By default values are omitted from the analysis only if missing values
+(either system missing or user missing)
+for any of the variables directly involved in their calculation are 
+encountered.
+This behavior can be modified with the /MISSING subcommand.
+Three options are possible: TABLE, INCLUDE and DEPENDENT.
+
+/MISSING = TABLE causes cases to be dropped if any variable is missing 
+in the table specification currently being processed, regardless of 
+whether it is needed to calculate the statistic.
+
+/MISSING = INCLUDE says that user missing values, either in the dependent
+variables or in the categorical variables should be taken at their face
+value, and not excluded.
+
+/MISSING = DEPENDENT says that user missing values in the dependent
+variables should be taken at their face value; however, cases which 
+have user missing values for the categorical variables should be omitted 
+from the calculation.
  
  @node NPAR TESTS
  @section NPAR TESTS
  
  @node NPAR TESTS
  @section NPAR TESTS
@@ -685,9 +820,11 @@ is used.
  * COCHRAN::                 Cochran Q Test
  * FRIEDMAN::                Friedman Test
  * KENDALL::                 Kendall's W Test
  * COCHRAN::                 Cochran Q Test
  * FRIEDMAN::                Friedman Test
  * KENDALL::                 Kendall's W Test
+* KOLMOGOROV-SMIRNOV::      Kolmogorov Smirnov Test
  * KRUSKAL-WALLIS::          Kruskal-Wallis Test
  * MANN-WHITNEY::            Mann Whitney U Test
  * MCNEMAR::                 McNemar Test
  * KRUSKAL-WALLIS::          Kruskal-Wallis Test
  * MANN-WHITNEY::            Mann Whitney U Test
  * MCNEMAR::                 McNemar Test
+* MEDIAN::                  Median Test
  * RUNS::                    Runs Test
  * SIGN::                    The Sign Test
  * WILCOXON::                Wilcoxon Signed Ranks Test
  * RUNS::                    Runs Test
  * SIGN::                    The Sign Test
  * WILCOXON::                Wilcoxon Signed Ranks Test
@@ -816,6 +953,45 @@ It has the range [0,1] --- a value of zero indicates no agreement between the sa
  unity indicates complete agreement.
  
  
  unity indicates complete agreement.
  
  
+@node KOLMOGOROV-SMIRNOV
+@subsection Kolmogorov-Smirnov Test
+@vindex KOLMOGOROV-SMIRNOV
+@vindex K-S
+@cindex Kolmogorov-Smirnov test
+
+@display
+     [ /KOLMOGOROV-SMIRNOV (@{NORMAL [@var{mu}, @var{sigma}], UNIFORM [@var{min}, @var{max}], POISSON [@var{lambda}], EXPONENTIAL [@var{scale}] @}) = varlist ]
+@end display
+
+The one sample Kolmogorov-Smirnov subcommand is used to test whether or not a dataset is
+drawn from a particular distribution.  Four distributions are supported, @i{viz:}
+Normal, Uniform, Poisson and Exponential.
+
+Ideally you should provide the parameters of the distribution against which you wish to test
+the data. For example, with the normal distribution the mean (@var{mu}) and standard deviation (@var{sigma})
+should be given; with the uniform distribution, the minimum (@var{min}) and maximum (@var{max}) values should
+be provided.
+However, if the parameters are omitted they will be imputed from the data. Imputing the
+parameters reduces the power of the test so should be avoided if possible.
+
+In the following example, two variables @var{score} and @var{age} are tested to see if
+they follow a normal distribution with a mean of 3.5 and a standard deviation of 2.0.
+@example
+  NPAR TESTS
+        /KOLMOGOROV-SMIRNOV (normal 3.5 2.0) = @var{score} @var{age}.
+@end example
+If the variables need to be tested against different distributions, then a separate
+subcommand must be used.  For example the following syntax tests @var{score} against
+a normal distribution with mean of 3.5 and standard deviation of 2.0 whilst @var{age}
+is tested against a normal distribution of mean 40 and standard deviation 1.5.
+@example
+  NPAR TESTS
+        /KOLMOGOROV-SMIRNOV (normal 3.5 2.0) = @var{score}
+        /KOLMOGOROV-SMIRNOV (normal 40 1.5) =  @var{age}.
+@end example
+
+The abbreviated subcommand  K-S may be used in place of KOLMOGOROV-SMIRNOV.
+
  @node KRUSKAL-WALLIS
  @subsection Kruskal-Wallis Test
  @vindex KRUSKAL-WALLIS
  @node KRUSKAL-WALLIS
  @subsection Kruskal-Wallis Test
  @vindex KRUSKAL-WALLIS
@@ -889,13 +1065,38 @@ The data in each variable must be dichotomous.  If there are more
  than two distinct values an error will occur and the test will
  not be run.
  
  than two distinct values an error will occur and the test will
  not be run.
  
+@node MEDIAN
+@subsection Median Test
+@vindex MEDIAN
+@cindex Median test
+
+@display
+     [ /MEDIAN [(value)] = varlist BY variable (value1, value2) ]
+@end display
+
+The median test is used to test whether independent samples come from 
+populations with a common median.
+The median of the populations against which the samples are to be tested
+may be given in parentheses immediately after the 
+/MEDIAN subcommand.  If it is not given, the median will be imputed from the 
+union of all the samples.
+
+The variables of the samples to be tested should immediately follow the @samp{=} sign. The
+keyword @code{BY} must come next, and then the grouping variable.  Two values
+in parentheses should follow.  If the first value is greater than the second,
+then a 2 sample test is performed using these two values to determine the groups.
+If, however, the first value is less than the second, then a @i{k} sample test is
+conducted and the group values used are all values encountered which lie in the
+range [@var{value1},@var{value2}].
+
+
  @node RUNS
  @subsection Runs Test
  @vindex RUNS
  @cindex runs test
  
  @display 
  @node RUNS
  @subsection Runs Test
  @vindex RUNS
  @cindex runs test
  
  @display 
-     [ /RUNS (@{MEAN, MEDIAN, MODE, value@}) varlist ]
+     [ /RUNS (@{MEAN, MEDIAN, MODE, value@})  = varlist ]
  @end display
  
  The /RUNS subcommand tests whether a data sequence is randomly ordered.
  @end display
  
  The /RUNS subcommand tests whether a data sequence is randomly ordered.
@@ -1023,7 +1224,7 @@ which they would be needed. This is the default.
  
  
  @menu
  
  
  @menu
-* One Sample Mode::             Testing against a hypothesised mean
+* One Sample Mode::             Testing against a hypothesized mean
  * Independent Samples Mode::    Testing two independent groups for equal mean
  * Paired Samples Mode::         Testing two interdependent groups for equal mean
  @end menu
  * Independent Samples Mode::    Testing two independent groups for equal mean
  * Paired Samples Mode::         Testing two interdependent groups for equal mean
  @end menu
@@ -1032,7 +1233,7 @@ which they would be needed. This is the default.
  @subsection One Sample Mode
  
  The @cmd{TESTVAL} subcommand invokes the One Sample mode.
  @subsection One Sample Mode
  
  The @cmd{TESTVAL} subcommand invokes the One Sample mode.
-This mode is used to test a population mean against a hypothesised
+This mode is used to test a population mean against a hypothesized
  mean. 
  The value given to the @cmd{TESTVAL} subcommand is the value against
  which you wish to test.
  mean. 
  The value given to the @cmd{TESTVAL} subcommand is the value against
  which you wish to test.
@@ -1291,7 +1492,7 @@ RELIABILITY
  @end display
  
  @cindex Cronbach's Alpha
  @end display
  
  @cindex Cronbach's Alpha
-The @cmd{RELIABILTY} command performs reliablity analysis on the data.
+The @cmd{RELIABILITY} command performs reliability analysis on the data.
  
  The VARIABLES subcommand is required. It determines the set of variables 
  upon which analysis is to be performed.
  
  The VARIABLES subcommand is required. It determines the set of variables 
  upon which analysis is to be performed.
@@ -1326,7 +1527,7 @@ analysis tested against the totals.
  @section ROC
  
  @vindex ROC
  @section ROC
  
  @vindex ROC
-@cindex Receiver Operating Characterstic
+@cindex Receiver Operating Characteristic
  @cindex Area under curve
  
  @display
  @cindex Area under curve
  
  @display
@@ -1390,5 +1591,3 @@ exclude them.
  Cases are excluded on a listwise basis; if any of the variables in @var{var_list} 
  or if the variable @var{state_var} is missing, then the entire case will be 
  excluded.
  Cases are excluded on a listwise basis; if any of the variables in @var{var_list} 
  or if the variable @var{state_var} is missing, then the entire case will be 
  excluded.
-
-