#include "libpspp/i18n.h"
#include "libpspp/message.h"
+#include "libpspp/string-array.h"
#include "output/pivot-table.h"
#include "output/spv/light-binary-parser.h"
#include "output/spv/spv.h"
pivot_table_unref (out);
return error;
}
+\f
+/* collect_spvlb_strings */
+
+/* Appends S to STRINGS with string_array_append(), unless S is null or the
+   empty string, in which case this function does nothing. */
+static void
+add_if_nonempty (struct string_array *strings, const char *s)
+{
+  if (s == NULL || *s == '\0')
+    return;
+
+  string_array_append (strings, s);
+}
+
+/* Adds to STRINGS the nonempty strings attached to VM: the ID of its template
+   string, if it has one, and the typeface of its font style, if it has a
+   style pair that carries a font style. */
+static void
+collect_value_mod_strings (struct string_array *strings,
+                           const struct spvlb_value_mod *vm)
+{
+  if (vm->template_string != NULL)
+    add_if_nonempty (strings, vm->template_string->id);
+
+  if (vm->style_pair != NULL)
+    {
+      if (vm->style_pair->font_style != NULL)
+        add_if_nonempty (strings, vm->style_pair->font_style->typeface);
+    }
+}
+
+/* Adds to STRINGS every nonempty string reachable from VALUE: the strings in
+   its value modifier, the string members that go with its type, and, for a
+   template value (type -1), the strings of each of its arguments,
+   recursively.  Does nothing if VALUE is null. */
+static void
+collect_value_strings (struct string_array *strings,
+                       const struct spvlb_value *value)
+{
+  if (value == NULL)
+    return;
+
+  switch (value->type)
+    {
+    case 1:
+      collect_value_mod_strings (strings, value->type_01.value_mod);
+      break;
+
+    case 2:
+      collect_value_mod_strings (strings, value->type_02.value_mod);
+      add_if_nonempty (strings, value->type_02.var_name);
+      add_if_nonempty (strings, value->type_02.value_label);
+      break;
+
+    case 3:
+      collect_value_mod_strings (strings, value->type_03.value_mod);
+      add_if_nonempty (strings, value->type_03.local);
+      add_if_nonempty (strings, value->type_03.id);
+      add_if_nonempty (strings, value->type_03.c);
+      break;
+
+    case 4:
+      collect_value_mod_strings (strings, value->type_04.value_mod);
+      add_if_nonempty (strings, value->type_04.value_label);
+      add_if_nonempty (strings, value->type_04.var_name);
+      add_if_nonempty (strings, value->type_04.s);
+      break;
+
+    case 5:
+      collect_value_mod_strings (strings, value->type_05.value_mod);
+      add_if_nonempty (strings, value->type_05.var_name);
+      add_if_nonempty (strings, value->type_05.var_label);
+      break;
+
+    case 6:
+      collect_value_mod_strings (strings, value->type_06.value_mod);
+      add_if_nonempty (strings, value->type_06.local);
+      add_if_nonempty (strings, value->type_06.id);
+      add_if_nonempty (strings, value->type_06.c);
+      break;
+
+    case -1:
+      collect_value_mod_strings (strings, value->type_else.value_mod);
+      add_if_nonempty (strings, value->type_else.template);
+      for (size_t arg_idx = 0; arg_idx < value->type_else.n_args; arg_idx++)
+        {
+          const struct spvlb_argument *arg = value->type_else.args[arg_idx];
+
+          /* Visit the argument's single value (skipped if null) and then
+             each element of its value array. */
+          collect_value_strings (strings, arg->value);
+          for (size_t k = 0; k < arg->n_values; k++)
+            collect_value_strings (strings, arg->values[k]);
+        }
+      break;
+
+    default:
+      /* Any other type contributes no strings. */
+      break;
+    }
+}
+
+/* Adds to STRINGS the strings in CAT's name and, if CAT has a group, the
+   strings of each of the group's subcategories, recursively. */
+static void
+collect_category_strings (struct string_array *strings,
+                          const struct spvlb_category *cat)
+{
+  collect_value_strings (strings, cat->name);
+
+  if (cat->group == NULL)
+    return;
+
+  for (size_t k = 0; k < cat->group->n_subcategories; k++)
+    collect_category_strings (strings, cat->group->subcategories[k]);
+}
+
+/* Adds all of the character strings in TABLE to STRINGS.  Null and empty
+   strings are skipped. */
+void
+collect_spvlb_strings (const struct spvlb_table *table,
+                       struct string_array *strings)
+{
+  /* Table-level settings. */
+  add_if_nonempty (strings, table->ts->notes);
+  add_if_nonempty (strings, table->ts->table_look);
+  add_if_nonempty (strings, table->ps->continuation_string);
+
+  /* Custom currency strings, if the table has any. */
+  const struct spvlb_custom_currency *currency
+    = table->formats->custom_currency;
+  if (currency != NULL)
+    {
+      for (int k = 0; k < currency->n_ccs; k++)
+        add_if_nonempty (strings, currency->ccs[k]);
+    }
+
+  /* The Y1 record is reached through X0 when that is present, otherwise
+     through X3; a table might have neither. */
+  const struct spvlb_y1 *y1 = NULL;
+  if (table->formats->x0)
+    y1 = table->formats->x0->y1;
+  else if (table->formats->x3)
+    y1 = table->formats->x3->y1;
+
+  if (y1 != NULL)
+    {
+      add_if_nonempty (strings, y1->command_local);
+      add_if_nonempty (strings, y1->command);
+      add_if_nonempty (strings, y1->language);
+      add_if_nonempty (strings, y1->charset);
+      add_if_nonempty (strings, y1->locale);
+    }
+
+  const struct spvlb_x3 *x3 = table->formats->x3;
+  if (x3 != NULL)
+    {
+      /* Skip a dataset name that is absent, empty, or starts with byte 4. */
+      const char *dataset = x3->dataset;
+      if (dataset != NULL && dataset[0] != '\0' && dataset[0] != 4)
+        add_if_nonempty (strings, dataset);
+
+      add_if_nonempty (strings, x3->datafile);
+    }
+
+  /* Footnote text and markers. */
+  for (size_t k = 0; k < table->footnotes->n_footnotes; k++)
+    {
+      const struct spvlb_footnote *footnote = table->footnotes->footnotes[k];
+      collect_value_strings (strings, footnote->text);
+      collect_value_strings (strings, footnote->marker);
+    }
+
+  /* Titles. */
+  collect_value_strings (strings, table->titles->user_title);
+  collect_value_strings (strings, table->titles->subtype);
+  collect_value_strings (strings, table->titles->corner_text);
+  collect_value_strings (strings, table->titles->caption);
+
+  /* Typeface of each area. */
+  for (size_t k = 0; k < PIVOT_N_AREAS; k++)
+    add_if_nonempty (strings, table->areas->areas[k]->typeface);
+
+  /* Dimension names and their category trees. */
+  for (size_t k = 0; k < table->dimensions->n_dims; k++)
+    {
+      const struct spvlb_dimension *dim = table->dimensions->dims[k];
+      collect_value_strings (strings, dim->name);
+      for (size_t c = 0; c < dim->n_categories; c++)
+        collect_category_strings (strings, dim->categories[c]);
+    }
+
+  /* Cell values. */
+  for (size_t k = 0; k < table->cells->n_cells; k++)
+    collect_value_strings (strings, table->cells->cells[k]->value);
+}
+\f
+/* Returns the encoding that TABLE declares to be in use for its strings.
+   (Watch out, it's not always correct.)
+
+   If TABLE has a Y1 record (reached through X0 when present, otherwise
+   through X3), the result is that record's charset.  Otherwise it is whatever
+   follows the first '.' in the formats' locale string, or "windows-1252" if
+   the locale contains no '.'. */
+const char *
+spvlb_table_get_encoding (const struct spvlb_table *table)
+{
+  const struct spvlb_y1 *y1 = NULL;
+  if (table->formats->x0)
+    y1 = table->formats->x0->y1;
+  else if (table->formats->x3)
+    y1 = table->formats->x3->y1;
+
+  if (y1 != NULL)
+    return y1->charset;
+
+  const char *dot = strchr (table->formats->locale, '.');
+  if (dot == NULL)
+    return "windows-1252";
+
+  return dot + 1;
+}
#include <limits.h>
#include <stdlib.h>
#include <unistd.h>
+#include <unistr.h>
#include "data/file-handle-def.h"
#include "data/settings.h"
+#include "libpspp/encoding-guesser.h"
#include "libpspp/i18n.h"
#include "libpspp/message.h"
#include "libpspp/string-map.h"
#include "output/pivot-table.h"
#include "output/spv/light-binary-parser.h"
#include "output/spv/spv-legacy-data.h"
+#include "output/spv/spv-light-decoder.h"
#include "output/spv/spv-output.h"
#include "output/spv/spv-select.h"
#include "output/spv/spv-table-look.h"
/* --sort: Sort members under dump-light-table, to make comparisons easier. */
static bool sort;
-/* --raw: Dump raw binary data in dump-light-table. */
+/* --raw: Dump raw binary data in "dump-light-table"; dump all strings in
+ "strings". */
static bool raw;
+/* --no-ascii-only: Drop all-ASCII strings in "strings".  Only consulted when
+ --raw is also given. */
+static bool exclude_ascii_only;
+
+/* --utf8-only: Only print non-ASCII strings that are valid UTF-8 in
+ * "strings".  Only consulted when --raw is also given. */
+static bool include_utf8_only;
+
/* -f, --force: Keep output file even on error. */
static bool force;
exit (is_legacy ? EXIT_SUCCESS : EXIT_FAILURE);
}
+/* Returns true if every byte of S satisfies encoding_guess_is_ascii_text(),
+   false as soon as one does not.  The empty string yields true. */
+static bool
+is_all_ascii (const char *s)
+{
+  while (*s != '\0')
+    {
+      if (!encoding_guess_is_ascii_text (*s))
+        return false;
+      s++;
+    }
+
+  return true;
+}
+
+/* Sorts STRINGS and drops duplicates, then reports on them on stdout.
+
+   Without --raw, prints a one-line summary labeled with ENCODING: the number
+   of unique strings, how many of them are not all-ASCII, and how many of
+   those pass u8_check().
+
+   With --raw, prints the strings themselves, one per line.  If
+   --no-ascii-only or --utf8-only was given, all-ASCII strings are first
+   removed from STRINGS (and freed); with --utf8-only, so are strings that
+   fail u8_check(). */
+static void
+dump_strings (const char *encoding, struct string_array *strings)
+{
+  string_array_sort (strings);
+  string_array_uniq (strings);
+
+  if (!raw)
+    {
+      size_t n_nonascii = 0;
+      size_t n_utf8 = 0;
+      for (size_t k = 0; k < strings->n; k++)
+        {
+          const char *s = strings->strings[k];
+          if (is_all_ascii (s))
+            continue;
+
+          n_nonascii++;
+          /* u8_check() returns null when the whole string is well formed. */
+          if (!u8_check (CHAR_CAST (uint8_t *, s), strlen (s)))
+            n_utf8++;
+        }
+      printf ("%s: %zu unique strings, %zu non-ASCII, %zu UTF-8.\n",
+              encoding, strings->n, n_nonascii, n_utf8);
+      return;
+    }
+
+  if (exclude_ascii_only || include_utf8_only)
+    {
+      /* Compact the array in place, keeping only the strings that pass the
+         filter and freeing the rest. */
+      size_t n_kept = 0;
+      for (size_t src = 0; src < strings->n; src++)
+        {
+          char *s = strings->strings[src];
+          bool is_ascii = is_all_ascii (s);
+          bool is_utf8 = !u8_check (CHAR_CAST (uint8_t *, s), strlen (s));
+          bool keep = !is_ascii && (!include_utf8_only || is_utf8);
+          if (keep)
+            strings->strings[n_kept++] = s;
+          else
+            free (s);
+        }
+      strings->n = n_kept;
+    }
+
+  for (size_t k = 0; k < strings->n; k++)
+    puts (strings->strings[k]);
+}
+
+/* Implements the "strings" command.  Opens the SPV file named by ARGV[1],
+   selects items according to the global selection criteria, and collects the
+   strings of every selected light table, grouped by the encoding that
+   spvlb_table_get_encoding() reports for the table.  Each group is then
+   handed to dump_strings().
+
+   Exits with an error if the file cannot be opened.  A table that cannot be
+   read is reported with msg() and skipped. */
+static void
+run_strings (int argc UNUSED, char **argv)
+{
+  struct spv_reader *spv;
+  char *open_error = spv_open (argv[1], &spv);
+  if (open_error != NULL)
+    error (1, 0, "%s", open_error);
+
+  /* Strings collected so far, one group per distinct table encoding. */
+  struct encoded_strings
+    {
+      char *encoding;
+      struct string_array strings;
+    };
+  struct encoded_strings *es = NULL;
+  size_t n_es = 0;
+  size_t allocated_es = 0;
+
+  struct spv_item **items;
+  size_t n_items;
+  spv_select (spv, criteria, n_criteria, &items, &n_items);
+  for (size_t item_idx = 0; item_idx < n_items; item_idx++)
+    {
+      if (!spv_item_is_light_table (items[item_idx]))
+        continue;
+
+      struct spvlb_table *table;
+      char *table_error = spv_item_get_light_table (items[item_idx], &table);
+      if (table_error != NULL)
+        {
+          msg (ME, "%s", table_error);
+          free (table_error);
+          continue;
+        }
+
+      /* Find the group for this table's encoding, if one exists already. */
+      const char *table_encoding = spvlb_table_get_encoding (table);
+      size_t group = 0;
+      while (group < n_es && strcmp (es[group].encoding, table_encoding) != 0)
+        group++;
+
+      if (group == n_es)
+        {
+          /* No such group yet: append a new, empty one, which lands at
+             index GROUP. */
+          if (n_es >= allocated_es)
+            es = x2nrealloc (es, &allocated_es, sizeof *es);
+          es[n_es++] = (struct encoded_strings) {
+            .encoding = xstrdup (table_encoding),
+            .strings = STRING_ARRAY_INITIALIZER,
+          };
+        }
+
+      /* NOTE(review): TABLE is not released anywhere in this function;
+         confirm who owns it. */
+      collect_spvlb_strings (table, &es[group].strings);
+    }
+  free (items);
+
+  for (size_t k = 0; k < n_es; k++)
+    {
+      dump_strings (es[k].encoding, &es[k].strings);
+      free (es[k].encoding);
+      string_array_destroy (&es[k].strings);
+    }
+  free (es);
+
+  spv_close (spv);
+}
+
struct command
{
const char *name;
{ "dump-legacy-table", 1, INT_MAX, run_dump_legacy_table },
{ "dump-structure", 1, INT_MAX, run_dump_structure },
{ "is-legacy", 1, 1, run_is_legacy },
+ { "strings", 1, 1, run_strings },
};
static const int n_commands = sizeof commands / sizeof *commands;
OPT_OR,
OPT_SORT,
OPT_RAW,
+ OPT_NO_ASCII_ONLY,
+ OPT_UTF8_ONLY,
OPT_TABLE_LOOK,
OPT_HELP_DEVELOPER,
};
{ "sort", no_argument, NULL, OPT_SORT },
{ "raw", no_argument, NULL, OPT_RAW },
+ /* "strings" command options. */
+ { "no-ascii-only", no_argument, NULL, OPT_NO_ASCII_ONLY },
+ { "utf8-only", no_argument, NULL, OPT_UTF8_ONLY },
+
{ "help", no_argument, NULL, 'h' },
{ "help-developer", no_argument, NULL, OPT_HELP_DEVELOPER },
{ "version", no_argument, NULL, 'v' },
parse_table_look (optarg);
break;
+ case OPT_NO_ASCII_ONLY:
+ exclude_ascii_only = true;
+ break;
+
+ case OPT_UTF8_ONLY:
+ include_utf8_only = true;
+ break;
+
case 'f':
force = true;
break;
dump-legacy-table FILE [XPATH]... Dump legacy table XML\n\
dump-structure FILE [XPATH]... Dump structure XML\n\
is-legacy FILE Exit with status 0 if any legacy table selected\n\
+ strings FILE Dump analysis of strings\n\
\n\
Additional input selection options:\n\
--members=MEMBER... include only objects with these Zip member names\n\
Additional options for \"dir\" command:\n\
--member-names show Zip member names with objects\n\
\n\
+Options for the \"strings\" command:\n\
+ --raw Dump all (unique) strings\n\
+ --raw --no-ascii-only Dump all strings that contain non-ASCII characters\n\
+ --raw --utf8-only Dump all non-ASCII strings that are valid UTF-8\n\
+\n\
Other options:\n\
--raw print raw binary data instead of a parsed version\n\
--sort sort borders and areas for shorter \"diff\" output\n");