From 8da01635d9c23a3e8a31f2fe3c350d0f9e8a2f31 Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Wed, 7 May 2025 12:24:19 -0700
Subject: [PATCH] work on manual

---
 rust/doc/src/SUMMARY.md                       |   8 ++
 rust/doc/src/commands/selection/filter.md     |  26 ++++
 rust/doc/src/commands/selection/index.md      |   4 +
 rust/doc/src/commands/selection/n.md          |  30 +++++
 rust/doc/src/commands/selection/sample.md     |  40 ++++++
 rust/doc/src/commands/selection/select-if.md  |  76 +++++++++++
 rust/doc/src/commands/selection/split-file.md | 121 ++++++++++++++++++
 rust/doc/src/commands/selection/temporary.md  |  70 ++++++++++
 rust/doc/src/commands/selection/weight.md     |  95 ++++++++++++++
 9 files changed, 470 insertions(+)
 create mode 100644 rust/doc/src/commands/selection/filter.md
 create mode 100644 rust/doc/src/commands/selection/index.md
 create mode 100644 rust/doc/src/commands/selection/n.md
 create mode 100644 rust/doc/src/commands/selection/sample.md
 create mode 100644 rust/doc/src/commands/selection/select-if.md
 create mode 100644 rust/doc/src/commands/selection/split-file.md
 create mode 100644 rust/doc/src/commands/selection/temporary.md
 create mode 100644 rust/doc/src/commands/selection/weight.md

diff --git a/rust/doc/src/SUMMARY.md b/rust/doc/src/SUMMARY.md
index 77e058bd01..8e9467813b 100644
--- a/rust/doc/src/SUMMARY.md
+++ b/rust/doc/src/SUMMARY.md
@@ -105,6 +105,14 @@
   - [IF](commands/data/if.md)
   - [RECODE](commands/data/recode.md)
   - [SORT CASES](commands/data/sort-cases.md)
+- [Selecting Data](commands/selection/index.md)
+  - [FILTER](commands/selection/filter.md)
+  - [N OF CASES](commands/selection/n.md)
+  - [SAMPLE](commands/selection/sample.md)
+  - [SELECT IF](commands/selection/select-if.md)
+  - [SPLIT FILE](commands/selection/split-file.md)
+  - [TEMPORARY](commands/selection/temporary.md)
+  - [WEIGHT](commands/selection/weight.md)
 
 # Developer Documentation
 
diff --git a/rust/doc/src/commands/selection/filter.md b/rust/doc/src/commands/selection/filter.md
new file mode 100644
index 0000000000..a16d7d65c9
--- /dev/null
+++ b/rust/doc/src/commands/selection/filter.md
@@ -0,0 +1,26 @@
+# FILTER
+
+```
+FILTER BY VAR_NAME.
+FILTER OFF.
+```
+
+`FILTER` allows a boolean-valued variable to be used to select cases
+from the data stream for processing.
+
+To set up filtering, specify `BY` and a variable name.  Keyword `BY` is
+optional but recommended.  Cases which have a zero or system- or
+user-missing value are excluded from analysis, but not deleted from
+the data stream.  Cases with other values are analyzed.  To filter
+based on a different condition, use transformations such as `COMPUTE`
+or `RECODE` to compute a filter variable of the required form, then
+specify that variable on `FILTER`.
+
+`FILTER OFF` turns off case filtering.
+
+Filtering takes place immediately before cases pass to a procedure for
+analysis.  Only one filter variable may be active at a time.
+Normally, case filtering continues until it is explicitly turned off
+with `FILTER OFF`.  However, if `FILTER` is placed after `TEMPORARY`,
+it filters only the next procedure or procedure-like command.
+
diff --git a/rust/doc/src/commands/selection/index.md b/rust/doc/src/commands/selection/index.md
new file mode 100644
index 0000000000..d8634d3a7e
--- /dev/null
+++ b/rust/doc/src/commands/selection/index.md
@@ -0,0 +1,4 @@
+# Selecting Data
+
+This chapter documents PSPP commands that temporarily or permanently
+select data records from the active dataset for analysis.
diff --git a/rust/doc/src/commands/selection/n.md b/rust/doc/src/commands/selection/n.md
new file mode 100644
index 0000000000..7b2396f275
--- /dev/null
+++ b/rust/doc/src/commands/selection/n.md
@@ -0,0 +1,30 @@
+# N OF CASES
+
+```
+N [OF CASES] NUM_OF_CASES [ESTIMATED].
+```
+
+`N OF CASES` limits the number of cases processed by any procedures
+that follow it in the command stream.  `N OF CASES 100`, for example,
+tells PSPP to disregard all cases after the first 100.
+
+When `N OF CASES` is specified after [`TEMPORARY`](temporary.md), it
+affects only the next procedure.  Otherwise, cases beyond the limit
+specified are not processed by any later procedure.
+
+If the limit specified on `N OF CASES` is greater than the number of
+cases in the active dataset, it has no effect.
+
+When `N OF CASES` is used along with `SAMPLE` or `SELECT IF`, the
+case limit is applied to the cases obtained after sampling or case
+selection, regardless of how `N OF CASES` is placed relative to `SAMPLE`
+or `SELECT IF` in the command file.  Thus, the commands `N OF CASES 100`
+and `SAMPLE .5` together randomly sample approximately half of the active
+dataset's cases, then select the first 100 of those sampled, regardless
+of their order in the command file.
+
+`N OF CASES` with the `ESTIMATED` keyword gives an estimated number of
+cases before `DATA LIST` or another command to read in data.
+`ESTIMATED` never limits the number of cases processed by procedures.
+PSPP currently does not use case count estimates.
+
diff --git a/rust/doc/src/commands/selection/sample.md b/rust/doc/src/commands/selection/sample.md
new file mode 100644
index 0000000000..41be007383
--- /dev/null
+++ b/rust/doc/src/commands/selection/sample.md
@@ -0,0 +1,40 @@
+# SAMPLE
+
+```
+SAMPLE NUM1 [FROM NUM2].
+```
+
+`SAMPLE` randomly samples a proportion of the cases in the active
+dataset.  Unless it follows `TEMPORARY`, it permanently removes cases
+from the active dataset.
+
+The proportion to sample may be expressed as a single number between 0
+and 1.  If `N` is the number of currently-selected cases in the active
+dataset, then `SAMPLE K.` will select approximately `K×N` cases.
+
+The proportion to sample can also be specified in the style `SAMPLE M
+FROM N`.  With this style, cases are selected as follows:
+
+1. If `N` is the number of currently-selected cases in the active
+   dataset, exactly `M` cases are selected.
+
+2. If `N` is greater than the number of currently-selected cases in
+   the active dataset, an equivalent proportion of cases are selected.
+
+3. If `N` is less than the number of currently-selected cases in the
+   active dataset, exactly `M` cases are selected *from the first `N`
+   cases* in the active dataset.
+
+`SAMPLE` and `SELECT IF` are performed in the order specified by the
+syntax file.
+
+`SAMPLE` is always performed before [`N OF CASES`](n.md), regardless
+of ordering in the syntax file.
+
+The same values for `SAMPLE` may result in different samples.  To
+obtain the same sample, use the `SET` command to set the random number
+seed to the same value before each `SAMPLE`.  Different samples may
+still result when the file is processed on systems with different
+machine types or PSPP versions.  By default, the random number seed is
+based on the system time.
+
diff --git a/rust/doc/src/commands/selection/select-if.md b/rust/doc/src/commands/selection/select-if.md
new file mode 100644
index 0000000000..259cc912a8
--- /dev/null
+++ b/rust/doc/src/commands/selection/select-if.md
@@ -0,0 +1,76 @@
+# SELECT IF
+
+```
+SELECT IF EXPRESSION.
+```
+
+`SELECT IF` selects cases for analysis based on the value of
+EXPRESSION.  Cases not selected are permanently eliminated from the
+active dataset, unless [`TEMPORARY`](temporary.md) is in effect.
+
+Specify a [boolean
+expression](../../language/expressions/index.md#boolean-values).  If
+the expression is true for a particular case, the case is analyzed.
+If the expression is false or missing, then the case is deleted from
+the data stream.
+
+Place `SELECT IF` early in the command file.  Cases that are deleted
+early can be processed more efficiently in time and space.  Once cases
+have been deleted from the active dataset using `SELECT IF` they
+cannot be re-instated.  If you want to be able to re-instate cases,
+then use [`FILTER`](filter.md) instead.
+
+When `SELECT IF` is specified following [`TEMPORARY`](temporary.md),
+the [`LAG`](../../language/expressions/functions/miscellaneous.md)
+function may not be used.
+
+## Example
+
+A shop steward is interested in the salaries of younger personnel in a
+firm.  The file `personnel.sav` provides the salaries of all the
+workers and their dates of birth.  The syntax below shows how `SELECT
+IF` can be used to limit analysis only to those persons born after
+December 31, 1999.
+
+```
+get file = 'personnel.sav'.
+
+echo 'Salaries of all personnel'.
+descriptives salary.
+
+echo 'Salaries of personnel born after December 31 1999'.
+select if dob > date.dmy (31,12,1999).
+descriptives salary.
+```
+
+From the output shown below, one can see that there are 56 persons
+listed in the dataset, and 17 of them were born after December 31,
+1999.
+
+```
+Salaries of all personnel
+
+               Descriptive Statistics
+┌────────────────────────┬──┬────────┬───────┬───────┬───────┐
+│                        │ N│  Mean  │Std Dev│Minimum│Maximum│
+├────────────────────────┼──┼────────┼───────┼───────┼───────┤
+│Annual salary before tax│56│40028.97│8721.17│$23,451│$57,044│
+│Valid N (listwise)      │56│        │       │       │       │
+│Missing N (listwise)    │ 0│        │       │       │       │
+└────────────────────────┴──┴────────┴───────┴───────┴───────┘
+
+Salaries of personnel born after December 31 1999
+
+               Descriptive Statistics
+┌────────────────────────┬──┬────────┬───────┬───────┬───────┐
+│                        │ N│  Mean  │Std Dev│Minimum│Maximum│
+├────────────────────────┼──┼────────┼───────┼───────┼───────┤
+│Annual salary before tax│17│31828.59│4454.80│$23,451│$39,504│
+│Valid N (listwise)      │17│        │       │       │       │
+│Missing N (listwise)    │ 0│        │       │       │       │
+└────────────────────────┴──┴────────┴───────┴───────┴───────┘
+```
+
+Note that the `personnel.sav` file from which the data were read is
+unaffected.  The transformation affects only the active file.
+
diff --git a/rust/doc/src/commands/selection/split-file.md b/rust/doc/src/commands/selection/split-file.md
new file mode 100644
index 0000000000..f1bbe2db8e
--- /dev/null
+++ b/rust/doc/src/commands/selection/split-file.md
@@ -0,0 +1,121 @@
+# SPLIT FILE
+
+```
+SPLIT FILE [{LAYERED, SEPARATE}] BY VAR_LIST.
+SPLIT FILE OFF.
+```
+
+`SPLIT FILE` allows multiple sets of data present in one data file to
+be analyzed separately using single statistical procedure commands.
+
+Specify a list of variable names to analyze multiple sets of data
+separately.  Groups of adjacent cases having the same values for these
+variables are analyzed by statistical procedure commands as one group.
+An independent analysis is carried out for each group of cases, and the
+variable values for the group are printed along with the analysis.
+
+When a list of variable names is specified, one of the keywords
+`LAYERED` or `SEPARATE` may also be specified.  With `LAYERED`, which
+is the default, the separate analyses for each group are presented
+together in a single table.  With `SEPARATE`, each analysis is
+presented in a separate table.  Not all procedures honor the
+distinction.
+
+Groups are formed only by _adjacent_ cases.  To create a split using a
+variable where like values are not adjacent in the working file, first
+[sort the data](../../commands/data/sort-cases.md) by that variable.
+
+Specify `OFF` to disable `SPLIT FILE` and resume analysis of the
+entire active dataset as a single group of data.
+
+When `SPLIT FILE` is specified after [`TEMPORARY`](temporary.md), it
+affects only the next procedure.
+
+## Example
+
+The file `horticulture.sav` contains data describing the yield of a
+number of horticultural specimens which have been subjected to various
+treatments.  If we wanted to investigate linear statistics of the
+yield, one way to do this is to use the `DESCRIPTIVES` command.
+However, it is reasonable to expect the mean to be
+different depending on the treatment.  So we might want to perform
+three separate procedures -- one for each treatment.[^1] The following
+syntax shows how this can be done automatically using the `SPLIT FILE`
+command.
+
+[^1]: There are other, possibly better, ways to achieve a similar
+result using the `MEANS` or `EXAMINE` commands.
+
+```
+get file='horticulture.sav'.
+
+* Ensure cases are sorted before splitting.
+sort cases by treatment.
+
+split file by treatment.
+
+* Run descriptives on the yield variable
+descriptives /variable = yield.
+```
+
+In the following output, you can see that the table of descriptive
+statistics appears 3 times—once for each value of treatment.  In this
+example, `N`, the number of observations, is identical in all splits.
+This is because that experiment was deliberately designed that way.
+However, in general one can expect a different `N` for each split.
+
+```
+    Split Values
+┌─────────┬───────┐
+│Variable │ Value │
+├─────────┼───────┤
+│treatment│control│
+└─────────┴───────┘
+
+            Descriptive Statistics
+┌────────────────────┬──┬─────┬───────┬───────┬───────┐
+│                    │ N│ Mean│Std Dev│Minimum│Maximum│
+├────────────────────┼──┼─────┼───────┼───────┼───────┤
+│yield               │30│51.23│   8.28│  37.86│  68.59│
+│Valid N (listwise)  │30│     │       │       │       │
+│Missing N (listwise)│ 0│     │       │       │       │
+└────────────────────┴──┴─────┴───────┴───────┴───────┘
+
+ Split Values
+┌─────────┬────────────┐
+│Variable │    Value   │
+├─────────┼────────────┤
+│treatment│conventional│
+└─────────┴────────────┘
+
+            Descriptive Statistics
+┌────────────────────┬──┬─────┬───────┬───────┬───────┐
+│                    │ N│ Mean│Std Dev│Minimum│Maximum│
+├────────────────────┼──┼─────┼───────┼───────┼───────┤
+│yield               │30│53.57│   8.92│  36.30│  70.66│
+│Valid N (listwise)  │30│     │       │       │       │
+│Missing N (listwise)│ 0│     │       │       │       │
+└────────────────────┴──┴─────┴───────┴───────┴───────┘
+
+ Split Values
+┌─────────┬───────────┐
+│Variable │   Value   │
+├─────────┼───────────┤
+│treatment│traditional│
+└─────────┴───────────┘
+
+            Descriptive Statistics
+┌────────────────────┬──┬─────┬───────┬───────┬───────┐
+│                    │ N│ Mean│Std Dev│Minimum│Maximum│
+├────────────────────┼──┼─────┼───────┼───────┼───────┤
+│yield               │30│56.87│   8.88│  39.08│  75.93│
+│Valid N (listwise)  │30│     │       │       │       │
+│Missing N (listwise)│ 0│     │       │       │       │
+└────────────────────┴──┴─────┴───────┴───────┴───────┘
+```
+
+Example 13.3: The results of running `DESCRIPTIVES` with an active split
+
+Unless `TEMPORARY` was used, after a split has been defined for a
+dataset it remains active until explicitly disabled.
+
diff --git a/rust/doc/src/commands/selection/temporary.md b/rust/doc/src/commands/selection/temporary.md
new file mode 100644
index 0000000000..0d337e2d8e
--- /dev/null
+++ b/rust/doc/src/commands/selection/temporary.md
@@ -0,0 +1,70 @@
+# TEMPORARY
+
+```
+TEMPORARY.
+```
+
+`TEMPORARY` is used to make the effects of transformations following
+its execution temporary.  These transformations affect only the
+execution of the next procedure or procedure-like command.  Their
+effects are not saved to the active dataset.
+
+The only specification on `TEMPORARY` is the command name.
+
+`TEMPORARY` may not appear within a `DO IF` or `LOOP` construct.  It
+may appear only once between procedures and procedure-like commands.
+
+Scratch variables cannot be used following `TEMPORARY`.
+
+## Example
+
+In the syntax below, there are two `COMPUTE` transformations.  One of
+them immediately follows a `TEMPORARY` command, and therefore affects
+only the next procedure, which in this case is the first
+`DESCRIPTIVES` command.
+
+```
+data list notable /x 1-2.
+begin data.
+ 2
+ 4
+10
+15
+20
+24
+end data.
+
+compute x=x/2.
+
+temporary.
+compute x=x+3.
+
+descriptives x.
+descriptives x.
+```
+
+The data read by the first `DESCRIPTIVES` procedure are 4, 5, 8, 10.5,
+13, 15.  The data read by the second `DESCRIPTIVES` procedure are 1,
+2, 5, 7.5, 10, 12.  This is because the second `COMPUTE`
+transformation has no effect on the second `DESCRIPTIVES` procedure.
+You can check these figures in the following output.
+
+```
+                Descriptive Statistics
+┌────────────────────┬─┬────┬───────┬───────┬───────┐
+│                    │N│Mean│Std Dev│Minimum│Maximum│
+├────────────────────┼─┼────┼───────┼───────┼───────┤
+│x                   │6│9.25│   4.38│      4│     15│
+│Valid N (listwise)  │6│    │       │       │       │
+│Missing N (listwise)│0│    │       │       │       │
+└────────────────────┴─┴────┴───────┴───────┴───────┘
+
+           Descriptive Statistics
+┌────────────────────┬─┬────┬───────┬───────┬───────┐
+│                    │N│Mean│Std Dev│Minimum│Maximum│
+├────────────────────┼─┼────┼───────┼───────┼───────┤
+│x                   │6│6.25│   4.38│      1│     12│
+│Valid N (listwise)  │6│    │       │       │       │
+│Missing N (listwise)│0│    │       │       │       │
+└────────────────────┴─┴────┴───────┴───────┴───────┘
+```
diff --git a/rust/doc/src/commands/selection/weight.md b/rust/doc/src/commands/selection/weight.md
new file mode 100644
index 0000000000..5046944944
--- /dev/null
+++ b/rust/doc/src/commands/selection/weight.md
@@ -0,0 +1,95 @@
+# WEIGHT
+
+```
+WEIGHT BY VAR_NAME.
+WEIGHT OFF.
+```
+
+`WEIGHT` assigns cases varying weights, changing the frequency
+distribution of the active dataset.  Execution of `WEIGHT` is delayed
+until data have been read.
+
+If a variable name is specified, `WEIGHT` causes the values of that
+variable to be used as weighting factors for subsequent statistical
+procedures.  Use of keyword `BY` is optional but recommended.
+Weighting variables must be numeric.  Scratch variables may not be
+used for weighting.
+
+When `OFF` is specified, subsequent statistical procedures weight all
+cases equally.
+
+A positive integer weighting factor `W` on a case yields the same
+statistical output as would replicating the case `W` times.  A
+weighting factor of 0 is treated for statistical purposes as if the
+case did not exist in the input.  Weighting values need not be
+integers, but negative and system-missing values for the weighting
+variable are interpreted as weighting factors of 0.  User-missing
+values are not treated specially.
+
+When `WEIGHT` is specified after [`TEMPORARY`](temporary.md), it
+affects only the next procedure.
+
+`WEIGHT` does not cause cases in the active dataset to be replicated
+in memory.
+
+## Example
+
+One could define a dataset containing an inventory of stock items.  It
+would be reasonable to use a string variable for a description of the
+item, and a numeric variable for the number in stock, like in the
+syntax below.
+
+```
+data list notable list /item (a16) quantity (f8.0).
+begin   data
+nuts    345
+screws  10034
+washers 32012
+bolts   876
+end data.
+
+echo 'Unweighted frequency table'.
+frequencies /variables = item /format=dfreq.
+
+weight by quantity.
+
+echo 'Weighted frequency table'.
+frequencies /variables = item /format=dfreq.
+```
+
+One analysis which most surely would be of interest is the relative
+amounts of each item in stock.  However, without setting a weight
+variable, `FREQUENCIES` does not tell us what we want to know, since
+there is only one case for each stock item.  The output below shows
+the difference between the weighted and unweighted frequency
+tables.
+
+```
+Unweighted frequency table
+
+                          item
+┌─────────────┬─────────┬───────┬─────────────┬──────────────────┐
+│             │Frequency│Percent│Valid Percent│Cumulative Percent│
+├─────────────┼─────────┼───────┼─────────────┼──────────────────┤
+│Valid bolts  │        1│  25.0%│        25.0%│             25.0%│
+│      nuts   │        1│  25.0%│        25.0%│             50.0%│
+│      screws │        1│  25.0%│        25.0%│             75.0%│
+│      washers│        1│  25.0%│        25.0%│            100.0%│
+├─────────────┼─────────┼───────┼─────────────┼──────────────────┤
+│Total        │        4│ 100.0%│             │                  │
+└─────────────┴─────────┴───────┴─────────────┴──────────────────┘
+
+Weighted frequency table
+
+                          item
+┌─────────────┬─────────┬───────┬─────────────┬──────────────────┐
+│             │Frequency│Percent│Valid Percent│Cumulative Percent│
+├─────────────┼─────────┼───────┼─────────────┼──────────────────┤
+│Valid washers│    32012│  74.0%│        74.0%│             74.0%│
+│      screws │    10034│  23.2%│        23.2%│             97.2%│
+│      bolts  │      876│   2.0%│         2.0%│             99.2%│
+│      nuts   │      345│    .8%│          .8%│            100.0%│
+├─────────────┼─────────┼───────┼─────────────┼──────────────────┤
+│Total        │    43267│ 100.0%│             │                  │
+└─────────────┴─────────┴───────┴─────────────┴──────────────────┘
+```
-- 
2.30.2