From: Ben Pfaff Date: Sun, 20 Mar 2016 00:19:45 +0000 (-0700) Subject: Implement new command SORT VARIABLES. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=76c790067446eb993f615813069936887fbb4fc8 Implement new command SORT VARIABLES. --- diff --git a/NEWS b/NEWS index d280e7f719..acf621be10 100644 --- a/NEWS +++ b/NEWS @@ -38,6 +38,10 @@ Changes from 0.8.5 to 0.9.0: discourage its use PSPP and PSPPIRE do not directly read or write this format. + * New commands: + + - SORT VARIABLES. + * The following functions for transformation expressions are new: - REPLACE, for search-and-replace of one string with another. diff --git a/doc/variables.texi b/doc/variables.texi index 68a996681d..6a6a77b7fe 100644 --- a/doc/variables.texi +++ b/doc/variables.texi @@ -16,6 +16,7 @@ several utility functions for examining and adjusting them. * NUMERIC:: Create new numeric variables. * PRINT FORMATS:: Set variable print formats. * RENAME VARIABLES:: Rename variables. +* SORT VARIABLES:: Reorder variables. * VALUE LABELS:: Set value labels for variables. * STRING:: Create new string variables. * VARIABLE ATTRIBUTE:: Set custom attributes on variables. @@ -443,6 +444,82 @@ to be read. @cmd{RENAME VARIABLES} may not be specified following @cmd{TEMPORARY} (@pxref{TEMPORARY}). +@node SORT VARIABLES +@section SORT VARIABLES +@vindex SORT VARIABLES + +@display +SORT VARIABLES [BY] + (NAME | TYPE | FORMAT | LABEL | VALUES | MISSING | MEASURE + | ROLE | COLUMNS | ALIGNMENT | ATTRIBUTE @var{name}) + [(D)]. +@end display + +@cmd{SORT VARIABLES} reorders the variables in the active dataset. +The main specification is one of the following identifiers, which +determines how the variables are sorted: + +@table @asis +@item NAME +Sorts the variables according to their names, in a case-insensitive +fashion. However, when variable names differ only in a number at the +end, they are sorted numerically. 
For example, @code{VAR5} is sorted +before @code{VAR400} even though @samp{4} precedes @samp{5}. + +@item TYPE +Sorts numeric variables before string variables, and shorter string +variables before longer ones. + +@item FORMAT +Groups variables by print format; within a format, sorts narrower +formats before wider ones; with the same format and width, sorts fewer +decimal places before more decimal places. +@xref{FORMATS}. + +@item LABEL +Sorts variables without a variable label before those with one, and +sorts labeled variables by their labels, case-insensitively. +@xref{VARIABLE LABELS}. + +@item VALUES +Sorts variables without value labels before those with some. +@xref{VALUE LABELS}. + +@item MISSING +Sorts variables without missing values before those with some. +@xref{MISSING VALUES}. + +@item MEASURE +Sorts nominal variables first, followed by ordinal variables, followed +by scale variables. @xref{VARIABLE LEVEL}. + +@item ROLE +Groups variables according to their role. @xref{VARIABLE ROLE}. + +@item COLUMNS +Sorts variables in ascending display width. @xref{VARIABLE WIDTH}. + +@item ALIGNMENT +Sorts variables according to their alignment, first left-aligned, then +right-aligned, then centered. @xref{VARIABLE ALIGNMENT}. + +@item ATTRIBUTE @var{name} +Sorts variables according to the first value of their @var{name} +attribute. Variables without the attribute are sorted first. +@xref{VARIABLE ATTRIBUTE}. +@end table + +Only one sort criterion can be specified. The sort is ``stable,'' so +to sort on multiple criteria one may perform multiple sorts. For +example, the following will sort primarily based on alignment, with +variables that have the same alignment ordered based on display width: + +@example +SORT VARIABLES BY COLUMNS. +SORT VARIABLES BY ALIGNMENT. +@end example + +Specify @code{(D)} to reverse the sort order. 
+ @node VALUE LABELS @section VALUE LABELS @vindex VALUE LABELS diff --git a/src/data/attributes.c b/src/data/attributes.c index c4ae97c285..f516dc61a1 100644 --- a/src/data/attributes.c +++ b/src/data/attributes.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2008, 2009, 2011, 2012, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -220,14 +220,14 @@ attrset_count (const struct attrset *set) case-insensitively, or a null pointer if SET does not contain an attribute with that name. */ struct attribute * -attrset_lookup (struct attrset *set, const char *name) +attrset_lookup (const struct attrset *set, const char *name) { - struct attribute *attr; + const struct attribute *attr; HMAP_FOR_EACH_WITH_HASH (attr, struct attribute, node, utf8_hash_case_string (name, 0), &set->map) if (!utf8_strcasecmp (attribute_get_name (attr), name)) break; - return attr; + return CONST_CAST (struct attribute *, attr); } /* Adds ATTR to SET, which must not already contain an attribute diff --git a/src/data/attributes.h b/src/data/attributes.h index ab7b12e301..a5cebc5bcc 100644 --- a/src/data/attributes.h +++ b/src/data/attributes.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2008, 2011, 2012, 2016 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -52,7 +52,7 @@ void attrset_destroy (struct attrset *); size_t attrset_count (const struct attrset *); -struct attribute *attrset_lookup (struct attrset *, const char *); +struct attribute *attrset_lookup (const struct attrset *, const char *); void attrset_add (struct attrset *, struct attribute *); void attrset_delete (struct attrset *, const char *); void attrset_clear (struct attrset *); diff --git a/src/language/command.def b/src/language/command.def index 2ba0f2673d..49becdf922 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2009, 2010, 2011, 2013 Free Software Foundation, Inc. + Copyright (C) 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -140,6 +140,7 @@ DEF_CMD (S_DATA, 0, "SAMPLE", cmd_sample) DEF_CMD (S_DATA, 0, "SAVE", cmd_save) DEF_CMD (S_DATA, 0, "SAVE TRANSLATE", cmd_save_translate) DEF_CMD (S_DATA, 0, "SORT CASES", cmd_sort_cases) +DEF_CMD (S_DATA, 0, "SORT VARIABLES", cmd_sort_variables) DEF_CMD (S_DATA, 0, "T-TEST", cmd_t_test) DEF_CMD (S_DATA, 0, "TEMPORARY", cmd_temporary) DEF_CMD (S_DATA, 0, "USE", cmd_use) diff --git a/src/language/dictionary/automake.mk b/src/language/dictionary/automake.mk index 0891828081..b1fe0bf6c7 100644 --- a/src/language/dictionary/automake.mk +++ b/src/language/dictionary/automake.mk @@ -10,6 +10,7 @@ language_dictionary_sources = \ src/language/dictionary/mrsets.c \ src/language/dictionary/numeric.c \ src/language/dictionary/rename-variables.c \ + src/language/dictionary/sort-variables.c \ src/language/dictionary/split-file.c \ src/language/dictionary/split-file.h \ src/language/dictionary/sys-file-info.c \ diff --git 
a/src/language/dictionary/sort-variables.c b/src/language/dictionary/sort-variables.c new file mode 100644 index 0000000000..8c653af724 --- /dev/null +++ b/src/language/dictionary/sort-variables.c @@ -0,0 +1,272 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2016 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include + +#include "data/attributes.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/variable.h" +#include "language/command.h" +#include "language/lexer/lexer.h" +#include "libpspp/array.h" +#include "libpspp/assertion.h" +#include "libpspp/i18n.h" +#include "libpspp/message.h" +#include "libpspp/str.h" + +#include "gl/xalloc.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) + +enum key + { + K_NAME, + K_TYPE, + K_FORMAT, + K_VAR_LABEL, + K_VALUE_LABELS, + K_MISSING_VALUES, + K_MEASURE, + K_ROLE, + K_COLUMNS, + K_ALIGNMENT, + K_ATTRIBUTE, + }; + +struct criterion + { + enum key key; + char *attr_name; + bool descending; + }; + +static int +compare_ints (int a, int b) +{ + return a < b ? 
-1 : a > b; +} + +static int +compare_formats (const struct fmt_spec *a, const struct fmt_spec *b) +{ + int retval = compare_ints (fmt_to_io (a->type), fmt_to_io (b->type)); + if (!retval) + retval = compare_ints (a->w, b->w); + if (!retval) + retval = compare_ints (a->d, b->d); + return retval; +} + +static int +compare_var_labels (const struct variable *a, const struct variable *b) +{ + const char *a_label = var_get_label (a); + const char *b_label = var_get_label (b); + return utf8_strcasecmp (a_label ? a_label : "", + b_label ? b_label : ""); +} + +static int +map_measure (enum measure m) +{ + return (m == MEASURE_NOMINAL ? 0 + : m == MEASURE_ORDINAL ? 1 + : 2); +} + +static int +map_role (enum var_role r) +{ + return (r == ROLE_INPUT ? 0 + : r == ROLE_TARGET ? 1 + : r == ROLE_BOTH ? 2 + : r == ROLE_NONE ? 3 + : r == ROLE_PARTITION ? 4 + : 5); +} + +static const char * +get_attribute (const struct variable *v, const char *name) +{ + const struct attrset *set = var_get_attributes (v); + const struct attribute *attr = attrset_lookup (set, name); + const char *value = attr ? attribute_get_value (attr, 0) : NULL; + return value ? value : ""; +} + +static int +map_alignment (enum alignment a) +{ + return (a == ALIGN_LEFT ? 0 + : a == ALIGN_RIGHT ? 
1 + : 2); +} + +static int +compare_vars (const void *a_, const void *b_, const void *c_) +{ + const struct variable *const *ap = a_; + const struct variable *const *bp = b_; + const struct variable *a = *ap; + const struct variable *b = *bp; + const struct criterion *c = c_; + + int retval; + switch (c->key) + { + case K_NAME: + retval = utf8_strverscasecmp (var_get_name (a), var_get_name (b)); + break; + + case K_TYPE: + retval = compare_ints (var_get_width (a), var_get_width (b)); + break; + + case K_FORMAT: + retval = compare_formats (var_get_print_format (a), + var_get_print_format (b)); + break; + + case K_VAR_LABEL: + retval = compare_var_labels (a, b); + break; + + case K_VALUE_LABELS: + retval = compare_ints (var_has_value_labels (a), + var_has_value_labels (b)); + break; + + case K_MISSING_VALUES: + retval = compare_ints (var_has_missing_values (a), + var_has_missing_values (b)); + break; + + case K_MEASURE: + retval = compare_ints (map_measure (var_get_measure (a)), + map_measure (var_get_measure (b))); + break; + + case K_ROLE: + retval = compare_ints (map_role (var_get_role (a)), + map_role (var_get_role (b))); + break; + + case K_COLUMNS: + retval = compare_ints (var_get_display_width (a), + var_get_display_width (b)); + break; + + case K_ALIGNMENT: + retval = compare_ints (map_alignment (var_get_alignment (a)), + map_alignment (var_get_alignment (b))); + break; + + case K_ATTRIBUTE: + retval = utf8_strcasecmp (get_attribute (a, c->attr_name), + get_attribute (b, c->attr_name)); + break; + + default: + NOT_REACHED (); + } + + /* Make this a stable sort. */ + if (!retval) + retval = a < b ? -1 : a > b; + + if (c->descending) + retval = -retval; + + return retval; +} + +/* Performs SORT VARIABLES command. */ +int +cmd_sort_variables (struct lexer *lexer, struct dataset *ds) +{ + enum cmd_result result = CMD_FAILURE; + + lex_match (lexer, T_BY); + + /* Parse sort key. 
*/ + struct criterion c = { .attr_name = NULL }; + if (lex_match_id (lexer, "NAME")) + c.key = K_NAME; + else if (lex_match_id (lexer, "TYPE")) + c.key = K_TYPE; + else if (lex_match_id (lexer, "FORMAT")) + c.key = K_FORMAT; + else if (lex_match_id (lexer, "LABEL")) + c.key = K_VAR_LABEL; + else if (lex_match_id (lexer, "VALUES")) + c.key = K_VALUE_LABELS; + else if (lex_match_id (lexer, "MISSING")) + c.key = K_MISSING_VALUES; + else if (lex_match_id (lexer, "MEASURE")) + c.key = K_MEASURE; + else if (lex_match_id (lexer, "ROLE")) + c.key = K_ROLE; + else if (lex_match_id (lexer, "COLUMNS")) + c.key = K_COLUMNS; + else if (lex_match_id (lexer, "ALIGNMENT")) + c.key = K_ALIGNMENT; + else if (lex_match_id (lexer, "ATTRIBUTE")) + { + if (!lex_force_id (lexer)) + goto exit; + c.key = K_ATTRIBUTE; + c.attr_name = xstrdup (lex_tokcstr (lexer)); + lex_get (lexer); + } + + /* Parse sort direction. */ + if (lex_match (lexer, T_LPAREN)) + { + if (lex_match_id (lexer, "A") || lex_match_id (lexer, "UP")) + c.descending = false; + else if (lex_match_id (lexer, "D") || lex_match_id (lexer, "DOWN")) + c.descending = true; + else + { + lex_error (lexer, NULL); + goto exit; + } + if (!lex_force_match (lexer, T_RPAREN)) + goto exit; + } + else + c.descending = false; + + /* Sort variables. */ + struct dictionary *d = dataset_dict (ds); + struct variable **vars; + size_t n_vars; + dict_get_vars_mutable (d, &vars, &n_vars, 0); + sort (vars, n_vars, sizeof *vars, compare_vars, &c); + dict_reorder_vars (d, CONST_CAST (struct variable *const *, vars), n_vars); + free (vars); + + result = CMD_SUCCESS; + +exit: + free (c.attr_name); + return result; +} diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index dad224d34d..55b2d67fab 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc. 
+ Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -37,6 +37,7 @@ #include "libpspp/str.h" #include "libpspp/version.h" +#include "gl/c-ctype.h" #include "gl/c-strcase.h" #include "gl/localcharset.h" #include "gl/minmax.h" @@ -845,6 +846,80 @@ utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn) return result; } +static bool +is_all_digits (const uint8_t *s, size_t len) +{ + for (size_t i = 0; i < len; i++) + if (!c_isdigit (s[i])) + return false; + return true; +} + +/* Compares UTF-8 strings A and B case-insensitively. If the strings end in a + number, then they are compared numerically. Returns a negative value if A < + B, zero if A == B, positive if A > B. */ +int +utf8_strverscasecmp (const char *a, const char *b) +{ + /* Normalize A. */ + uint8_t a_stub[64]; + size_t a_len = sizeof a_stub; + uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL, + UNINORM_NFKD, a_stub, &a_len); + + /* Normalize B. */ + uint8_t b_stub[64]; + size_t b_len = sizeof b_stub; + uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL, + UNINORM_NFKD, b_stub, &b_len); + + int result; + if (!a_norm || !b_norm) + { + result = strcmp (a, b); + goto exit; + } + + size_t len = MIN (a_len, b_len); + for (size_t i = 0; i < len; i++) + if (a_norm[i] != b_norm[i]) + { + /* If both strings end in digits, compare them numerically. */ + if (is_all_digits (&a_norm[i], a_len - i) + && is_all_digits (&b_norm[i], b_len - i)) + { + /* Start by stripping leading zeros, since those don't matter for + numerical comparison. */ + size_t ap, bp; + for (ap = i; ap < a_len; ap++) + if (a_norm[ap] != '0') + break; + for (bp = i; bp < b_len; bp++) + if (b_norm[bp] != '0') + break; + + /* The number with more digits, if there is one, is larger. 
*/ + size_t a_digits = a_len - ap; + size_t b_digits = b_len - bp; + if (a_digits != b_digits) + result = a_digits > b_digits ? 1 : -1; + else + result = memcmp (&a_norm[ap], &b_norm[bp], a_digits); + } + else + result = a_norm[i] > b_norm[i] ? 1 : -1; + goto exit; + } + result = a_len < b_len ? -1 : a_len > b_len; + +exit: + if (a_norm != a_stub) + free (a_norm); + if (b_norm != b_stub) + free (b_norm); + return result; +} + static char * utf8_casemap (const char *s, uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t, diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h index 54717bcaa9..5170c50f0d 100644 --- a/src/libpspp/i18n.h +++ b/src/libpspp/i18n.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010, 2011, 2012, 2014 Free Software Foundation, Inc. + Copyright (C) 2006, 2010, 2011, 2012, 2014, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -75,6 +75,7 @@ unsigned int utf8_hash_case_bytes (const char *, size_t n, unsigned int basis); unsigned int utf8_hash_case_string (const char *, unsigned int basis); int utf8_strcasecmp (const char *, const char *); int utf8_strncasecmp (const char *, size_t, const char *, size_t); +int utf8_strverscasecmp (const char *, const char *); char *utf8_to_upper (const char *); char *utf8_to_lower (const char *); diff --git a/tests/automake.mk b/tests/automake.mk index 561f23bfce..9284b93d13 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -314,6 +314,7 @@ TESTSUITE_AT = \ tests/language/dictionary/missing-values.at \ tests/language/dictionary/mrsets.at \ tests/language/dictionary/rename-variables.at \ + tests/language/dictionary/sort-variables.at \ tests/language/dictionary/split-file.at \ tests/language/dictionary/sys-file-info.at \ tests/language/dictionary/value-labels.at \