@item string
@itemx bestring
A 32-bit unsigned integer, in little-endian or big-endian byte order,
-respectively, followed by the specified number of bytes of UTF-8
-encoded character data.
+respectively, followed by the specified number of bytes of character
+data. (The encoding is indicated by the Formats nonterminal.)
@item @var{x}?
@var{x} is optional, e.g.@: 00? is an optional zero byte.
column widths as manually adjusted by the user.
@code{locale} is a locale including an encoding, such as
-@code{en_US.windows-1252} or @code{it_IT.windows-1252}. The encoding
-string (like other strings in the member) is encoded in UTF-8.
+@code{en_US.windows-1252} or @code{it_IT.windows-1252}.
+(@code{locale} is often duplicated in Y1, described below.)
@code{epoch} is the year that starts the epoch. A 2-digit year is
interpreted as belonging to the 100 years beginning at the epoch. The
the procedure: for example, NPAR TESTS becomes ``Nonparametric
Tests.'' @code{command-local} is the procedure's name, translated
into the output language; it is often empty and, when it is not,
-sometimes the same as @code{command}.q
+sometimes the same as @code{command}.
@code{missing} is the character used to indicate that a cell contains
a missing value. It is always observed as @samp{.}.
@code{lang} may indicate the language in use. Some values seem to be
0: @t{en}, 1: @t{de}, 2: @t{es}, 3: @t{it}, 5: @t{ko}, 6: @t{pl}, 8:
-@t{zh-tw}, 10: @t{pt_BR}, 11: @t{fr}. The @code{locale} in Formats
-and the @code{language}, @code{charset}, and @code{locale} in X0 are
-more likely to be useful in practice.
+@t{zh-tw}, 10: @t{pt_BR}, 11: @t{fr}.
@code{show-variables} determines how variables are displayed by
default. A value of 1 means to display variable names, 2 to display
A writer may safely use 4 for @code{x21} and omit @code{x22} and the
other optional bytes at the end.
+@subsubheading Encoding
+
+Formats contains several indications of character encoding:
+
+@itemize @bullet
+@item
+@code{locale} in Formats itself.
+
+@item
+@code{locale} in Y1 (in version 1, Y1 is optionally nested inside X0;
+in version 3, Y1 is nested inside X3).
+
+@item
+@code{charset} in version 3, in Y1.
+
+@item
+@code{lang} in X1, in version 3.
+@end itemize
+
+@code{charset}, if present, is a good indication of character
+encoding. In its absence, use the encoding suffix on @code{locale} in
+Formats instead.
+
+@code{locale} in Y1 can be disregarded: it is normally the same as
+@code{locale} in Formats, and it is only present if @code{charset} is
+also present.
+
+@code{lang} is not helpful and should be ignored for character
+encoding purposes.
+
+However, the corpus contains many examples of light members whose
+strings are encoded in UTF-8 despite declaring some other character
+set. Furthermore, the corpus contains several examples of light
+members in which some strings are encoded in UTF-8 (and contain
+multibyte characters) and other strings are encoded in another
+character set (and contain non-ASCII characters). PSPP treats any
+valid UTF-8 string as UTF-8 and only falls back to the declared
+encoding for strings that are not valid UTF-8.
+
+The @command{pspp-output} program's @command{strings} command can help
+analyze the encoding in an SPV light member. Use @code{pspp-output
+--help-dev} to see its usage.
+
@node SPV Light Member Dimensions
@subsection Dimensions
#include <limits.h>
#include <stdlib.h>
#include <string.h>
+#include <unistr.h>
#include "libpspp/i18n.h"
#include "libpspp/message.h"
#include "gl/xalloc.h"
#include "gl/xsize.h"
+/* Returns a copy of S converted to UTF-8. S is copied as-is if it is already
+ valid UTF-8; otherwise it is recoded from ENCODING (files mix the two). */
static char *
-xstrdup_if_nonempty (const char *s)
+to_utf8 (const char *s, const char *encoding)
{
- return s && s[0] ? xstrdup (s) : NULL;
+ size_t length = strlen (s);
+ return (u8_check (CHAR_CAST (const uint8_t *, s), length)
+ ? recode_string ("UTF-8", encoding, s, length)
+ : xstrdup (s));
+}
+
+static char *
+to_utf8_if_nonempty (const char *s, const char *encoding)
+{
+ return s && s[0] ? to_utf8 (s, encoding) : NULL;
}
static void
static char * WARN_UNUSED_RESULT
decode_spvlb_font_style (const struct spvlb_font_style *in,
- struct font_style **outp)
+ const char *encoding, struct font_style **outp)
{
if (!in)
{
.underline = in->underline,
.fg = { fg, fg },
.bg = { bg, bg },
- .typeface = xstrdup (in->typeface),
+ .typeface = to_utf8 (in->typeface, encoding),
.size = in->size / 1.33,
};
return NULL;
static char *decode_spvlb_value (
const struct pivot_table *, const struct spvlb_value *,
- struct pivot_value **) WARN_UNUSED_RESULT;
+ const char *encoding, struct pivot_value **) WARN_UNUSED_RESULT;
static char * WARN_UNUSED_RESULT
decode_spvlb_argument (const struct pivot_table *table,
const struct spvlb_argument *in,
- struct pivot_argument *out)
+ const char *encoding, struct pivot_argument *out)
{
if (in->value)
{
struct pivot_value *value;
- char *error = decode_spvlb_value (table, in->value, &value);
+ char *error = decode_spvlb_value (table, in->value, encoding, &value);
if (error)
return error;
out->values = xnmalloc (in->n_values, sizeof *out->values);
for (size_t i = 0; i < in->n_values; i++)
{
- char *error = decode_spvlb_value (table, in->values[i],
+ char *error = decode_spvlb_value (table, in->values[i], encoding,
&out->values[i]);
if (error)
{
static char * WARN_UNUSED_RESULT
decode_spvlb_value (const struct pivot_table *table,
- const struct spvlb_value *in, struct pivot_value **outp)
+ const struct spvlb_value *in,
+ const char *encoding, struct pivot_value **outp)
{
*outp = NULL;
error = decode_spvlb_value_show (in->type_02.show, &out->numeric.show);
if (error)
return NULL;
- out->numeric.var_name = xstrdup_if_nonempty (in->type_02.var_name);
- out->numeric.value_label = xstrdup_if_nonempty (in->type_02.value_label);
+ out->numeric.var_name = to_utf8_if_nonempty (in->type_02.var_name,
+ encoding);
+ out->numeric.value_label = to_utf8_if_nonempty (in->type_02.value_label,
+ encoding);
break;
case 3:
vm = in->type_03.value_mod;
out->type = PIVOT_VALUE_TEXT;
- out->text.local = xstrdup (in->type_03.local);
- out->text.c = xstrdup (in->type_03.c);
- out->text.id = xstrdup (in->type_03.id);
+ out->text.local = to_utf8 (in->type_03.local, encoding);
+ out->text.c = to_utf8 (in->type_03.c, encoding);
+ out->text.id = to_utf8 (in->type_03.id, encoding);
out->text.user_provided = !in->type_03.fixed;
break;
error = decode_spvlb_value_show (in->type_04.show, &out->string.show);
if (error)
return NULL;
- out->string.s = xstrdup (in->type_04.s);
+ out->string.s = to_utf8 (in->type_04.s, encoding);
out->string.hex = (in->type_04.format >> 16) == fmt_to_io (FMT_AHEX);
- out->string.var_name = xstrdup (in->type_04.var_name);
- out->string.value_label = xstrdup_if_nonempty (in->type_04.value_label);
+ out->string.var_name = to_utf8 (in->type_04.var_name, encoding);
+ out->string.value_label = to_utf8_if_nonempty (in->type_04.value_label,
+ encoding);
break;
case 5:
error = decode_spvlb_value_show (in->type_05.show, &out->variable.show);
if (error)
return error;
- out->variable.var_name = xstrdup (in->type_05.var_name);
- out->variable.var_label = xstrdup_if_nonempty (in->type_05.var_label);
+ out->variable.var_name = to_utf8 (in->type_05.var_name, encoding);
+ out->variable.var_label = to_utf8_if_nonempty (in->type_05.var_label,
+ encoding);
break;
case 6:
vm = in->type_06.value_mod;
out->type = PIVOT_VALUE_TEXT;
- out->text.local = xstrdup (in->type_06.local);
- out->text.c = xstrdup (in->type_06.c);
- out->text.id = xstrdup (in->type_06.id);
+ out->text.local = to_utf8 (in->type_06.local, encoding);
+ out->text.c = to_utf8 (in->type_06.c, encoding);
+ out->text.id = to_utf8 (in->type_06.id, encoding);
out->text.user_provided = false;
break;
case -1:
vm = in->type_else.value_mod;
out->type = PIVOT_VALUE_TEMPLATE;
- out->template.local = xstrdup (in->type_else.template);
+ out->template.local = to_utf8 (in->type_else.template, encoding);
out->template.id = out->template.local;
out->template.n_args = 0;
out->template.args = xnmalloc (in->type_else.n_args,
for (size_t i = 0; i < in->type_else.n_args; i++)
{
error = decode_spvlb_argument (table, in->type_else.args[i],
- &out->template.args[i]);
+ encoding, &out->template.args[i]);
if (error)
{
pivot_value_destroy (out);
out->subscripts = xnmalloc (vm->n_subscripts,
sizeof *out->subscripts);
for (size_t i = 0; i < vm->n_subscripts; i++)
- out->subscripts[i] = xstrdup (vm->subscripts[i]);
+ out->subscripts[i] = to_utf8 (vm->subscripts[i], encoding);
}
if (vm->n_refs)
if (vm->style_pair)
{
error = decode_spvlb_font_style (vm->style_pair->font_style,
- &out->font_style);
+ encoding, &out->font_style);
if (!error)
error = decode_spvlb_cell_style (vm->style_pair->cell_style,
&out->cell_style);
&& vm->template_string->id
&& vm->template_string->id[0]
&& out->type == PIVOT_VALUE_TEMPLATE)
- out->template.id = xstrdup (vm->template_string->id);
+ out->template.id = to_utf8 (vm->template_string->id, encoding);
}
*outp = out;
}
static char * WARN_UNUSED_RESULT
-decode_spvlb_area (const struct spvlb_area *in, struct table_area_style *out)
+decode_spvlb_area (const struct spvlb_area *in, struct table_area_style *out,
+ const char *encoding)
{
char *error;
.underline = in->underline,
.fg = { fg0, in->alternate ? fg1 : fg0 },
.bg = { bg0, in->alternate ? bg1 : bg0 },
- .typeface = xstrdup (in->typeface),
+ .typeface = to_utf8 (in->typeface, encoding),
.size = in->size / 1.33,
},
.cell_style = {
size_t n_categories,
bool show_label,
struct pivot_category *parent,
- struct pivot_dimension *);
+ struct pivot_dimension *,
+ const char *encoding);
static char * WARN_UNUSED_RESULT
decode_spvlb_categories (const struct pivot_table *table,
struct spvlb_category **categories,
size_t n_categories,
struct pivot_category *parent,
- struct pivot_dimension *dimension)
+ struct pivot_dimension *dimension,
+ const char *encoding)
{
for (size_t i = 0; i < n_categories; i++)
{
{
char *error = decode_spvlb_categories (
table, in->group->subcategories, in->group->n_subcategories,
- parent, dimension);
+ parent, dimension, encoding);
if (error)
return error;
}
struct pivot_value *name;
- char *error = decode_spvlb_value (table, in->name, &name);
+ char *error = decode_spvlb_value (table, in->name, encoding, &name);
if (error)
return error;
{
char *error = decode_spvlb_group (table, in->group->subcategories,
in->group->n_subcategories,
- true, out, dimension);
+ true, out, dimension, encoding);
if (error)
{
pivot_category_destroy (out);
struct spvlb_category **categories,
size_t n_categories, bool show_label,
struct pivot_category *category,
- struct pivot_dimension *dimension)
+ struct pivot_dimension *dimension,
+ const char *encoding)
{
category->subs = XCALLOC (n_categories, struct pivot_category *);
category->n_subs = 0;
category->show_label = show_label;
return decode_spvlb_categories (table, categories, n_categories, category,
- dimension);
+ dimension, encoding);
}
static char * WARN_UNUSED_RESULT
static char * WARN_UNUSED_RESULT
decode_spvlb_dimension (const struct pivot_table *table,
const struct spvlb_dimension *in,
- size_t idx, struct pivot_dimension **outp)
+ size_t idx, const char *encoding,
+ struct pivot_dimension **outp)
{
/* Convert most of the dimension. */
struct pivot_value *name;
- char *error = decode_spvlb_value (table, in->name, &name);
+ char *error = decode_spvlb_value (table, in->name, encoding, &name);
if (error)
return error;
};
error = decode_spvlb_group (table, in->categories, in->n_categories,
!in->props->hide_dim_label, out->root,
- out);
+ out, encoding);
if (error)
goto error;
static char * WARN_UNUSED_RESULT
decode_spvlb_cells (struct spvlb_cell **in, size_t n_in,
- struct pivot_table *table)
+ struct pivot_table *table, const char *encoding)
{
if (!table->n_dimensions)
return NULL;
struct pivot_value *value;
char *error = decode_data_index (in[i]->index, table, dindexes);
if (!error)
- error = decode_spvlb_value (table, in[i]->value, &value);
+ error = decode_spvlb_value (table, in[i]->value, encoding, &value);
if (error)
{
free (dindexes);
}
static char * WARN_UNUSED_RESULT
-decode_spvlb_footnote (const struct spvlb_footnote *in,
+decode_spvlb_footnote (const struct spvlb_footnote *in, const char *encoding,
size_t idx, struct pivot_table *table)
{
struct pivot_value *content;
- char *error = decode_spvlb_value (table, in->text, &content);
+ char *error = decode_spvlb_value (table, in->text, encoding, &content);
if (error)
return error;
struct pivot_value *marker = NULL;
if (in->marker)
{
- error = decode_spvlb_value (table, in->marker, &marker);
+ error = decode_spvlb_value (table, in->marker, encoding, &marker);
if (error)
{
pivot_value_destroy (content);
out->look = pivot_table_look_new_builtin_default ();
out->settings = (struct fmt_settings) FMT_SETTINGS_INIT;
+ const struct spvlb_y1 *y1 = (in->formats->x0 ? in->formats->x0->y1
+ : in->formats->x3 ? in->formats->x3->y1
+ : NULL);
+ const char *encoding = spvlb_table_get_encoding (in);
+
/* Display settings. */
out->look->show_numeric_markers = !in->ts->show_alphabetic_markers;
out->rotate_inner_column_labels = in->header->rotate_inner_column_labels;
&out->sizing[TABLE_HORZ].keeps,
&out->sizing[TABLE_HORZ].n_keeps);
- out->notes = xstrdup_if_nonempty (in->ts->notes);
- out->look->name = xstrdup_if_nonempty (in->ts->table_look);
+ out->notes = to_utf8_if_nonempty (in->ts->notes, encoding);
+ out->look->name = to_utf8_if_nonempty (in->ts->table_look, encoding);
/* Print settings. */
out->look->print_all_layers = in->ps->all_layers;
out->look->shrink_to_fit[TABLE_VERT] = in->ps->fit_length;
out->look->top_continuation = in->ps->top_continuation;
out->look->bottom_continuation = in->ps->bottom_continuation;
- out->look->continuation = xstrdup (in->ps->continuation_string);
+ out->look->continuation = to_utf8 (in->ps->continuation_string, encoding);
out->look->n_orphan_lines = in->ps->n_orphan_lines;
/* Format settings. */
out->small = in->formats->x3 ? in->formats->x3->small : 0;
/* Command information. */
- const struct spvlb_y1 *y1 = (in->formats->x0 ? in->formats->x0->y1
- : in->formats->x3 ? in->formats->x3->y1
- : NULL);
if (y1)
{
- out->command_local = xstrdup (y1->command_local);
- out->command_c = xstrdup (y1->command);
- out->language = xstrdup (y1->language);
+ out->command_local = to_utf8 (y1->command_local, encoding);
+ out->command_c = to_utf8 (y1->command, encoding);
+ out->language = to_utf8 (y1->language, encoding);
/* charset? */
- out->locale = xstrdup (y1->locale);
+ out->locale = to_utf8 (y1->locale, encoding);
}
/* Source information. */
if (x3)
{
if (x3->dataset && x3->dataset[0] && x3->dataset[0] != 4)
- out->dataset = xstrdup (x3->dataset);
- out->datafile = xstrdup_if_nonempty (x3->datafile);
+ out->dataset = to_utf8 (x3->dataset, encoding);
+ out->datafile = to_utf8_if_nonempty (x3->datafile, encoding);
out->date = x3->date;
}
pivot_table_create_footnote__ (out, fn->n_footnotes - 1, NULL, NULL);
for (size_t i = 0; i < fn->n_footnotes; i++)
{
- error = decode_spvlb_footnote (in->footnotes->footnotes[i], i, out);
+ error = decode_spvlb_footnote (in->footnotes->footnotes[i],
+ encoding, i, out);
if (error)
goto error;
}
}
/* Title and caption. */
- error = decode_spvlb_value (out, in->titles->user_title, &out->title);
+ error = decode_spvlb_value (out, in->titles->user_title, encoding,
+ &out->title);
if (error)
goto error;
- error = decode_spvlb_value (out, in->titles->subtype, &out->subtype);
+ error = decode_spvlb_value (out, in->titles->subtype, encoding,
+ &out->subtype);
if (error)
goto error;
if (in->titles->corner_text)
{
error = decode_spvlb_value (out, in->titles->corner_text,
- &out->corner_text);
+ encoding, &out->corner_text);
if (error)
goto error;
}
if (in->titles->caption)
{
- error = decode_spvlb_value (out, in->titles->caption, &out->caption);
+ error = decode_spvlb_value (out, in->titles->caption, encoding,
+ &out->caption);
if (error)
goto error;
}
/* Styles. */
for (size_t i = 0; i < PIVOT_N_AREAS; i++)
{
- error = decode_spvlb_area (in->areas->areas[i], &out->look->areas[i]);
+ error = decode_spvlb_area (in->areas->areas[i], &out->look->areas[i],
+ encoding);
if (error)
goto error;
}
for (size_t i = 0; i < out->n_dimensions; i++)
{
error = decode_spvlb_dimension (out, in->dimensions->dims[i],
- i, &out->dimensions[i]);
+ i, encoding, &out->dimensions[i]);
if (error)
goto error;
}
goto error;
/* Data. */
- error = decode_spvlb_cells (in->cells->cells, in->cells->n_cells, out);
+ error = decode_spvlb_cells (in->cells->cells, in->cells->n_cells, out,
+ encoding);
*outp = out;
return NULL;