From c6caca37f19989f96ad843e2baee09a54c4f23ba Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 14 Aug 2022 22:22:43 -0700 Subject: [PATCH] Docs --- doc/pspp-figures/ctables18.sps | 2 +- doc/pspp-figures/ctables19.sps | 2 +- doc/pspp-figures/ctables20.sps | 2 +- doc/pspp-figures/ctables21.sps | 2 +- doc/pspp-figures/ctables9.sps | 4 +- doc/statistics.texi | 146 ++++++++++++++++++++------------- src/data/dictionary.c | 11 ++- src/data/dictionary.h | 2 + src/data/variable.c | 17 ++-- src/language/stats/ctables.c | 23 ++++-- 10 files changed, 135 insertions(+), 76 deletions(-) diff --git a/doc/pspp-figures/ctables18.sps b/doc/pspp-figures/ctables18.sps index 71248785cc..52938dc9e5 100644 --- a/doc/pspp-figures/ctables18.sps +++ b/doc/pspp-figures/ctables18.sps @@ -1,4 +1,4 @@ -DATA LIST LIST NOTABLE/x y z. +DATA LIST LIST NOTABLE/x (F8.0) y z (F8.2). BEGIN DATA. 1 . 40 1 10 50 diff --git a/doc/pspp-figures/ctables19.sps b/doc/pspp-figures/ctables19.sps index ada8823bcc..ee9187af8f 100644 --- a/doc/pspp-figures/ctables19.sps +++ b/doc/pspp-figures/ctables19.sps @@ -1,4 +1,4 @@ -DATA LIST LIST NOTABLE/x y z. +DATA LIST LIST NOTABLE/x (F8.0) y z (F8.2). BEGIN DATA. 1 . 40 1 10 50 diff --git a/doc/pspp-figures/ctables20.sps b/doc/pspp-figures/ctables20.sps index 6814a274e5..fe0d6fc039 100644 --- a/doc/pspp-figures/ctables20.sps +++ b/doc/pspp-figures/ctables20.sps @@ -1,4 +1,4 @@ -DATA LIST LIST NOTABLE/x y z. +DATA LIST LIST NOTABLE/x (F8.0) y z (F8.2). BEGIN DATA. 1 . 40 1 10 50 diff --git a/doc/pspp-figures/ctables21.sps b/doc/pspp-figures/ctables21.sps index 8fb18d8b2b..1efa83e116 100644 --- a/doc/pspp-figures/ctables21.sps +++ b/doc/pspp-figures/ctables21.sps @@ -1,4 +1,4 @@ -DATA LIST LIST NOTABLE/x y z. +DATA LIST LIST NOTABLE/x (F8.0) y z (F8.2). BEGIN DATA. 1 . 
40 1 10 50 diff --git a/doc/pspp-figures/ctables9.sps b/doc/pspp-figures/ctables9.sps index 133d0d7d01..e0494444bb 100644 --- a/doc/pspp-figures/ctables9.sps +++ b/doc/pspp-figures/ctables9.sps @@ -1,2 +1,4 @@ GET FILE='nhtsa.sav'. -CTABLES /TABLE qn20 [C] BY qns3a. +CTABLES + /TABLE qn20 BY qns3a + /TABLE qn20 [C] BY qns3a. diff --git a/doc/statistics.texi b/doc/statistics.texi index 2e133dd651..1b0a294218 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -1027,7 +1027,8 @@ An axis expression that names a categorical variable divides the data into cells according to the values of that variable. When all the variables named on @code{TABLE} are categorical, by default each cell displays the number of cases that it contains, so specifying a single -variable yields a frequency table: +variable yields a frequency table, much like the output of the +@code{FREQUENCIES} command (@pxref{FREQUENCIES}): @example CTABLES /TABLE=AgeGroup. @@ -1036,7 +1037,8 @@ CTABLES /TABLE=AgeGroup. @noindent Specifying a row and a column categorical variable yields a -crosstabulation: +crosstabulation, much like the output of the @code{CROSSTABS} command +(@pxref{CROSSTABS}): @example CTABLES /TABLE=AgeGroup BY qns3a. @@ -1121,15 +1123,24 @@ decide whether to treat it as categorical or scalar. Variables assigned the nominal or ordinal measurement level are treated as categorical, and scalar variables are treated as scalar. -Use the @code{VARIABLE LEVEL} command to change a variable's -measurement level (@pxref{VARIABLE LEVEL}). To treat a variable as -categorical or scalar only for one use on @code{CTABLES}, add -@samp{[C]} or @samp{[S]}, respectively, after the variable name. The -following example shows how to analyze the scalar variable @code{qn20} -as categorical: +When @pspp{} reads data from a file in an external format, such as a +text file, variables' measurement levels are often unknown. 
If +@code{CTABLES} runs when a variable has an unknown measurement level, +it makes an initial pass through the data to guess measurement levels +using the rules described earlier in this manual (@pxref{Measurement +Level}). Use the @code{VARIABLE LEVEL} command to set or change a +variable's measurement level (@pxref{VARIABLE LEVEL}). + +To treat a variable as categorical or scalar only for one use on +@code{CTABLES}, add @samp{[C]} or @samp{[S]}, respectively, after the +variable name. The following example shows the output when variable +@code{qn20} is analyzed as scalar (the default for its measurement +level) and as categorical: @example -CTABLES /TABLE qn20 [C] BY qns3a. +CTABLES + /TABLE qn20 BY qns3a + /TABLE qn20 [C] BY qns3a. @end example @psppoutput {ctables9} @@ -1144,15 +1155,20 @@ sets. @node CTABLES Data Summarization @subsection Data Summarization +@c TODO Summary function default formats + The @code{CTABLES} command allows the user to control how the data are -summarized with summary specifications, which are enclosed in square -brackets following a variable name on the @code{TABLE} subcommand. -When all the variables are categorical, summary specifications can be -given for the innermost nested variables on any one axis. When a -scalar variable is present, only the scalar variable may have summary -specifications. The following example includes a summary -specification for column and row percentages for categorical -variables, and mean and median for a scalar variable: +summarized with @dfn{summary specifications}, syntax that lists one or +more summary function names, optionally separated by commas, and which +are enclosed in square brackets following a variable name on the +@code{TABLE} subcommand. When all the variables are categorical, +summary specifications can be given for the innermost nested variables +on any one axis. When a scalar variable is present, only the scalar +variable may have summary specifications. 
+ +The following example includes a summary specification for column and +row percentages for categorical variables, and mean and median for a +scalar variable: @example CTABLES @@ -1500,6 +1516,8 @@ column variable category labels, respectively, to the layer axis. Only one axis's labels may be moved, whether to the opposite axis or to the layer axis. +@c TODO Moving category labels for stacked variables + @subsubheading Effect on Summary Statistics @code{CLABELS} primarily affects the appearance of tables, not the @@ -1550,7 +1568,7 @@ variables. @code{CATEGORIES} applies to the table produced by the @code{CATEGORIES} does not apply to scalar variables. -@t{VARIABLES} is required. List the variables for the subcommand +@t{VARIABLES} is required and must list the variables for the subcommand to affect. There are two way to specify the Categories to include and their sort @@ -1597,7 +1615,7 @@ A computed category name (@pxref{CTABLES Computed Categories}). Additional forms, described later, allow for subtotals. If multiple elements of the list cover a given category, the last one -in the list is considered to be a match. +in the list takes precedence. @item Implicit categories. Without an explicit list of categories, @pspp{} sorts @@ -1606,13 +1624,15 @@ categories automatically. The @code{KEY} setting specifies the sort key. By default, or with @code{KEY=VALUE}, categories are sorted by default. Categories may also be sorted by value label, with @code{KEY=LABEL}, or by the value -of a summary function, e.g.@: @code{KEY=COUNT}. For summary -functions, a variable name may be specified in parentheses, e.g.@: -@code{KEY=MAXIUM(qnd1)}, and this is required for functions that apply -only to scalar variables. The @code{PTILE} function also requires a -percentage argument, e.g.@: @code{KEY=PTILE(qnd1, 90)}. Only summary -functions used in the table may be used, except that @code{COUNT} is -always allowed. +of a summary function, e.g.@: @code{KEY=COUNT}. 
+@ignore @c Not yet implemented +For summary functions, a variable name may be specified in +parentheses, e.g.@: @code{KEY=MAXIMUM(qnd1)}, and this is required for +functions that apply only to scalar variables. The @code{PTILE} +function also requires a percentage argument, e.g.@: +@code{KEY=PTILE(qnd1, 90)}. Only summary functions used in the table +may be used, except that @code{COUNT} is always allowed. +@end ignore By default, or with @code{ORDER=A}, categories are sorted in ascending order. Specify @code{ORDER=D} to sort in descending order. @@ -1625,9 +1645,10 @@ user-missing values. The system-missing value is always excluded. @subsubheading Totals and Subtotals @code{CATEGORIES} also controls display of totals and subtotals. -Totals are not displayed by default, or with @code{TOTAL=NO}. Specify -@code{TOTAL=YES} to display a total. By default, the total is labeled -``Total''; use @code{LABEL="@i{label}"} to override it. +Totals are not displayed with @code{TOTAL=NO}, which is also the +default. Specify @code{TOTAL=YES} to display a total. By default, +the total is labeled ``Total''; use @code{LABEL="@i{label}"} to +override it. Subtotals are also not displayed by default. To add one or more subtotals, use an explicit category list and insert @code{SUBTOTAL} or @@ -1638,16 +1659,18 @@ categories that make up the subtotal. Either way, the default label is ``Subtotal'', use @code{SUBTOTAL="@i{label}"} or @code{HSUBTOTAL="@i{label}"} to specify a custom label. -By default, or with @code{POSITION=AFTER}, totals come after the last -category and subtotals apply to categories that precede them. With -@code{POSITION=BEFORE}, totals come before the first category and -subtotals apply to categories that follow them. +By default, or with @code{POSITION=AFTER}, totals are displayed in the +output after the last category and subtotals apply to categories that +precede them. 
With @code{POSITION=BEFORE}, totals come before the +first category and subtotals apply to categories that follow them. Only categorical variables may have totals and subtotals. Scalar variables may be ``totaled'' indirectly by enabling totals and subtotals on a categorical variable within which the scalar variable is summarized. +@c TODO Specifying summaries for totals and subtotals + @subsubheading Categories Without Values Some categories might not be included in the data set being analyzed. @@ -1657,9 +1680,10 @@ younger'' age group. By default, or with @code{EMPTY=INCLUDE}, them, specify @code{EMPTY=EXCLUDE}. For implicit categories, empty categories potentially include all the -values with labels for a given variable; for explicit categories, they -include all the values listed individually and all labeled values -covered by ranges or @code{MISSING} or @code{OTHERNM}. +values with value labels for a given variable; for explicit +categories, they include all the values listed individually and all +values with value labels that are covered by ranges or @code{MISSING} +or @code{OTHERNM}. @node CTABLES Titles @subsection Titles @@ -1671,11 +1695,14 @@ covered by ranges or @code{MISSING} or @code{OTHERNM}. [@t{CORNER=}@i{string}@dots{}] @end display +@c TODO Describe substitution variables + The @code{TITLES} subcommand sets the title, caption, and corner text for the table output for the previous @code{TABLE} subcommand. The title appears above the table, the caption below the table, and the corner text appears in the table's upper left corner. By default, the title is ``Custom Tables'' and the caption and corner text are empty. +With some table output styles, the corner text is not displayed. @node CTABLES Table Formatting @subsection Table Formatting @@ -1694,13 +1721,13 @@ The @code{FORMAT} subcommand, which must precede the first tables. @code{FORMAT} and all of its settings are optional. 
Use @code{MINCOLWIDTH} and @code{MAXCOLWIDTH} to control the minimum -or maximum width of columns in output tables. By default, or with +or maximum width of columns in output tables. By default, with @code{DEFAULT}, column width varies based on content. Otherwise, specify a number for either or both of these settings. If both are -specified, @code{MAXCOLWIDTH} must be bigger than @code{MINCOLWIDTH}. -The default unit, or with @code{UNITS=POINTS}, is points (1/72 inch), -but specify @code{UNITS=INCHES} to use inches or @code{UNITS=CM} for -centimeters. +specified, @code{MAXCOLWIDTH} must be greater than or equal to +@code{MINCOLWIDTH}. The default unit, or with @code{UNITS=POINTS}, is +points (1/72 inch), or specify @code{UNITS=INCHES} to use inches or +@code{UNITS=CM} for centimeters. By default, or with @code{EMPTY=ZERO}, zero values are displayed in their usual format. Use @code{EMPTY=BLANK} to use an empty cell @@ -1730,7 +1757,7 @@ variables listed on @code{VARIABLES}. The supported values are: @table @code @item DEFAULT -Uses the setting from @ref{SET TVARS}. +Use the setting from @code{SET TVARS} (@pxref{SET TVARS}). @item NAME Show only a variable name. @@ -1829,30 +1856,33 @@ in @code{EXPR(@dots{})}. A postcompute expression consists of: This form evaluates to the summary statistic for @i{category}, e.g.@: @code{[1]} evaluates to the value of the summary statistic associated with category 1. The @i{category} may be a number, a quoted string, -or a quoted time or date value, and all of the categories for a given -postcompute must have the same form. +or a quoted time or date value. All of the categories for a given +postcompute must have the same form. The category must appear in all +the @code{CATEGORIES} lists in which the postcompute is used. 
@item [@i{min} THRU @i{max}] @itemx [LO THRU @i{max}] @itemx [@i{min} THRU HI] @itemx MISSING @itemx OTHERNM -These forms evaluate to the summary statistics for categories matching -the given syntax, as described in previous sections (@pxref{CTABLES -Explicit Category List}). If more than one category matches, their -values are summed. +These forms evaluate to the summary statistics for a category +specified with the same syntax, as described in a previous section +(@pxref{CTABLES Explicit Category List}). The category must appear in +all the @code{CATEGORIES} lists in which the postcompute is used. @item SUBTOTAL The summary statistic for the subtotal category. This form is allowed -only for variables with exactly one subtotal. +only if the @code{CATEGORIES} lists that include this postcompute have +exactly one subtotal. @item SUBTOTAL[@i{index}] The summary statistic for subtotal category @i{index}, where 1 is the first subtotal, 2 is the second, and so on. This form may be used for -any number of subtotals. +@code{CATEGORIES} lists with any number of subtotals. @item TOTAL -The summary statistic for the total. +The summary statistic for the total. The @code{CATEGORIES} lists that +include this postcompute must have a total enabled. @item @i{a} + @i{b} @itemx @i{a} - @i{b} @@ -1919,14 +1949,16 @@ by computed categories are displayed like other categories. Use The @code{WEIGHT} subcommand is optional and must appear before @code{TABLE}. If it appears, it must name a numeric variable, known as the @dfn{effective base weight} or @dfn{adjustment weight}. The -effective base weight variable is used for the @code{ECOUNT}, -@code{ETOTALN}, and @code{EVALIDN} summary functions. - -Cases with zero, missing, or negative effective base weight are -excluded from all analysis. +effective base weight variable stands in for the dictionary's weight +variable (@pxref{WEIGHT}), if any, in most calculations in +@code{CTABLES}. 
The only exceptions are the @code{COUNT}, +@code{TOTALN}, and @code{VALIDN} summary functions, which use the +dictionary weight instead. Weights obtained from the @pspp{} dictionary are rounded to the -nearest integer. Effective base weights are not rounded. +nearest integer at the case level. Effective base weights are not +rounded. Regardless of the weighting source, @pspp{} does not analyze +cases with zero, missing, or negative effective weights. @node CTABLES Hiding Small Counts @subsection Hiding Small Counts diff --git a/src/data/dictionary.c b/src/data/dictionary.c index d04a4f70fa..a2e3fb8fc1 100644 --- a/src/data/dictionary.c +++ b/src/data/dictionary.c @@ -1262,7 +1262,7 @@ dict_get_weight (const struct dictionary *d) } /* Returns the value of D's weighting variable in case C, except - that a negative weight is returned as 0. Returns 1 if the + that a negative or missing weight is returned as 0. Returns 1 if the dictionary is unweighted. Will warn about missing, negative, or zero values if *WARN_ON_INVALID is true. The function will set *WARN_ON_INVALID to false if an invalid weight is @@ -1283,6 +1283,15 @@ dict_get_case_weight (const struct dictionary *d, const struct ccase *c, } } +/* Like dict_get_case_weight(), but additionally rounds each weight to the + nearest integer. */ +double +dict_get_rounded_case_weight (const struct dictionary *d, + const struct ccase *c, bool *warn_on_invalid) +{ + return floor (dict_get_case_weight (d, c, warn_on_invalid) + 0.5); +} + /* Returns the format to use for weights. */ const struct fmt_spec * dict_get_weight_format (const struct dictionary *d) diff --git a/src/data/dictionary.h b/src/data/dictionary.h index 47317a22cb..3e874d8707 100644 --- a/src/data/dictionary.h +++ b/src/data/dictionary.h @@ -98,6 +98,8 @@ void dict_set_names_must_be_ids (struct dictionary *, bool); /* Weight variable. 
*/ double dict_get_case_weight (const struct dictionary *, const struct ccase *, bool *); +double dict_get_rounded_case_weight (const struct dictionary *, + const struct ccase *, bool *); struct variable *dict_get_weight (const struct dictionary *); void dict_set_weight (struct dictionary *, struct variable *); const struct fmt_spec *dict_get_weight_format (const struct dictionary *); diff --git a/src/data/variable.c b/src/data/variable.c index 2e584fe86f..87e7d07823 100644 --- a/src/data/variable.c +++ b/src/data/variable.c @@ -1357,15 +1357,16 @@ var_clear_vardict (struct variable *v) double var_force_valid_weight (const struct variable *wv, double w, bool *warn_on_invalid) { - if (w < 0.0 || (wv && var_is_num_missing (wv, w))) - w = 0.0; - - if (w == 0.0 && warn_on_invalid != NULL && *warn_on_invalid) + if (w <= 0.0 || (wv ? var_is_num_missing (wv, w) : w == SYSMIS)) { - *warn_on_invalid = false; - msg (SW, _("At least one case in the data file had a weight value " - "that was user-missing, system-missing, zero, or " - "negative. These case(s) were ignored.")); + w = 0.0; + if (warn_on_invalid != NULL && *warn_on_invalid) + { + *warn_on_invalid = false; + msg (SW, _("At least one case in the data file had a weight value " + "that was user-missing, system-missing, zero, or " + "negative. These case(s) were ignored.")); + } } return w; diff --git a/src/language/stats/ctables.c b/src/language/stats/ctables.c index b2274f3e44..bb7f5c37aa 100644 --- a/src/language/stats/ctables.c +++ b/src/language/stats/ctables.c @@ -2655,24 +2655,37 @@ ctables_summary_add (union ctables_summary *s, switch (ss->function) { case CTSF_TOTALN: - case CTSF_areaPCT_TOTALN: s->count += ss->weighted ? d_weight : 1.0; break; + case CTSF_areaPCT_TOTALN: + s->count += ss->weighted ? e_weight : 1.0; + break; + case CTSF_COUNT: - case CTSF_areaPCT_COUNT: if (is_scale || !excluded_missing) s->count += ss->weighted ? 
d_weight : 1.0; break; + case CTSF_areaPCT_COUNT: + if (is_scale || !excluded_missing) + s->count += ss->weighted ? e_weight : 1.0; + break; + case CTSF_VALIDN: - case CTSF_areaPCT_VALIDN: if (is_scale ? !is_scale_missing : !is_missing) s->count += ss->weighted ? d_weight : 1.0; break; + case CTSF_areaPCT_VALIDN: + if (is_scale + ? !is_scale_missing + : !is_missing) + s->count += ss->weighted ? e_weight : 1.0; + break; + case CTSF_areaID: break; @@ -2680,7 +2693,7 @@ ctables_summary_add (union ctables_summary *s, if (is_scale ? is_scale_missing : is_missing) - s->count += ss->weighted ? d_weight : 1.0; + s->count += ss->weighted ? e_weight : 1.0; break; case CTSF_ECOUNT: @@ -5264,7 +5277,7 @@ ctables_execute (struct dataset *ds, struct casereader *input, for (struct ccase *c = casereader_read (group); c; case_unref (c), c = casereader_read (group)) { - double d_weight = dict_get_case_weight (dict, c, &warn_on_invalid); + double d_weight = dict_get_rounded_case_weight (dict, c, &warn_on_invalid); double e_weight = (ct->e_weight ? var_force_valid_weight (ct->e_weight, case_num (c, ct->e_weight), -- 2.30.2