{
off_t pos;
int width;
- char name[8];
+ char name[9];
int print_format;
int write_format;
int missing_value_code;
size_t n_lines;
};
+/* A multiple response set exactly as parse_mrsets() found it in an MRSETS
+   extension record.  The strings are stored as parsed, without recoding;
+   decode_mrsets() later recodes them and looks the variables up in a
+   dictionary. */
+struct sfm_mrset
+ {
+ const char *name; /* Name. */
+ const char *label; /* Human-readable label for group. */
+ enum mrset_type type; /* Group type. */
+ const char **vars; /* Constituent variables' names. */
+ size_t n_vars; /* Number of constituent variables. */
+
+ /* MRSET_MD only. */
+ enum mrset_md_cat_source cat_source; /* Source of category labels. */
+ bool label_from_var_label; /* 'label' taken from variable label? */
+ const char *counted; /* Counted value, as string. */
+ };
+
+/* One extension record, as kept in a reader's 'extensions' array. */
struct sfm_extension_record
{
int subtype; /* Record subtype. */
off_t pos; /* Starting offset in file. */
- size_t size; /* Size of data elements. */
- size_t count; /* Number of data elements. */
+ unsigned int size; /* Size of data elements. */
+ unsigned int count; /* Number of data elements. */
void *data; /* Contents. */
};
struct sfm_value_label_record *labels;
size_t n_labels;
struct sfm_document_record *document;
+ struct sfm_mrset *mrsets;
+ size_t n_mrsets;
struct sfm_extension_record *extensions[32];
/* File state. */
struct sfm_read_info *);
static void parse_mrsets (struct sfm_reader *,
const struct sfm_extension_record *,
- struct dictionary *);
+ size_t *allocated_mrsets);
+static void decode_mrsets (struct sfm_reader *, struct dictionary *);
static void parse_long_var_name_map (struct sfm_reader *,
const struct sfm_extension_record *,
struct dictionary *);
struct sfm_reader *
sfm_open (struct file_handle *fh)
{
+ size_t allocated_mrsets = 0;
struct sfm_reader *r;
/* Create and initialize reader. */
if (!read_dictionary (r))
goto error;
+ /* Collect the multiple response sets from both MRSETS record subtypes
+    into r->mrsets.  'allocated_mrsets' carries the capacity of that array
+    from the first call to the second, so both calls append to one array. */
+ if (r->extensions[EXT_MRSETS] != NULL)
+ parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
+
+ if (r->extensions[EXT_MRSETS2] != NULL)
+ parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
+
return r;
error:
sfm_close (r);
return NULL;
}
+/* Accumulator used by sfm_get_strings() and its helpers: three parallel
+   arrays, allocated from 'pool', that always grow together. */
+struct get_strings_aux
+ {
+ struct pool *pool; /* Pool for the arrays and the string copies. */
+ char **titles; /* titles[i] describes strings[i]. */
+ char **strings; /* Copies of the retrieved strings. */
+ bool *ids; /* ids[i] is true if strings[i] was added by add_id(). */
+ size_t allocated; /* Allocated elements in each array. */
+ size_t n; /* Elements in use in each array. */
+ };
+
+/* Appends one entry to AUX: TITLE, which is stored as-is rather than
+   copied, a copy of STRING allocated from AUX's pool, and the ID flag.
+   Enlarges all three of AUX's arrays first if they are full. */
+static void
+add_string__ (struct get_strings_aux *aux,
+              const char *string, bool id, char *title)
+{
+  size_t slot = aux->n;
+
+  if (slot >= aux->allocated)
+    {
+      size_t capacity = 2 * (aux->allocated + 1);
+
+      aux->titles = pool_realloc (aux->pool, aux->titles,
+                                  capacity * sizeof *aux->titles);
+      aux->strings = pool_realloc (aux->pool, aux->strings,
+                                   capacity * sizeof *aux->strings);
+      aux->ids = pool_realloc (aux->pool, aux->ids,
+                               capacity * sizeof *aux->ids);
+      aux->allocated = capacity;
+    }
+
+  aux->titles[slot] = title;
+  aux->strings[slot] = pool_strdup (aux->pool, string);
+  aux->ids[slot] = id;
+  aux->n = slot + 1;
+}
+
+/* Adds free-form text STRING to AUX.  Its title is built from the
+   printf-style format TITLE and the arguments that follow it, and is
+   allocated from AUX's pool. */
+static void PRINTF_FORMAT (3, 4)
+add_string (struct get_strings_aux *aux,
+            const char *string, const char *title, ...)
+{
+  char *formatted_title;
+  va_list ap;
+
+  va_start (ap, title);
+  formatted_title = pool_vasprintf (aux->pool, title, ap);
+  va_end (ap);
+
+  add_string__ (aux, string, false, formatted_title);
+}
+
+/* Adds identifier ID to AUX, flagged as an identifier rather than free-form
+   text.  Its title is built from the printf-style format TITLE and the
+   arguments that follow it, and is allocated from AUX's pool. */
+static void PRINTF_FORMAT (3, 4)
+add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
+{
+  char *formatted_title;
+  va_list ap;
+
+  va_start (ap, title);
+  formatted_title = pool_vasprintf (aux->pool, title, ap);
+  va_end (ap);
+
+  add_string__ (aux, id, true, formatted_title);
+}
+
+/* Retrieves significant string data from R in its raw format, to allow the
+   caller to try to detect the encoding in use.
+
+   Returns the number of strings retrieved N.  Sets each of *TITLESP, *IDSP,
+   and *STRINGSP to an array of N elements allocated from POOL.  For each I in
+   0...N-1, UTF-8 string (*TITLESP)[I] describes (*STRINGSP)[I], which is in
+   whatever encoding system file R uses.  (*IDSP)[I] is true if
+   (*STRINGSP)[I] must be a valid PSPP language identifier, false if
+   (*STRINGSP)[I] is free-form text.
+
+   Every number that appears in a title counts from 1. */
+size_t
+sfm_get_strings (const struct sfm_reader *r, struct pool *pool,
+                 char ***titlesp, bool **idsp, char ***stringsp)
+{
+  struct get_strings_aux aux;
+  size_t n_value_labels;
+  size_t var_idx;
+  size_t i, j;
+
+  aux.pool = pool;
+  aux.titles = NULL;
+  aux.strings = NULL;
+  aux.ids = NULL;
+  aux.allocated = 0;
+  aux.n = 0;
+
+  /* Variable names.  Records whose width is -1 are neither numbered nor
+     reported (presumably they are string continuation records; confirm
+     against the variable record reader). */
+  var_idx = 0;
+  for (i = 0; i < r->n_vars; i++)
+    if (r->vars[i].width != -1)
+      add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
+
+  /* Variable labels, numbered the same way as the names above so that a
+     label's number matches its variable's number. */
+  var_idx = 0;
+  for (i = 0; i < r->n_vars; i++)
+    if (r->vars[i].width != -1)
+      {
+        var_idx++;
+        if (r->vars[i].label)
+          add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
+                      var_idx);
+      }
+
+  /* Value labels, numbered consecutively across all value label records,
+     starting from 1 like every other title. */
+  n_value_labels = 0;
+  for (i = 0; i < r->n_labels; i++)
+    for (j = 0; j < r->labels[i].n_labels; j++)
+      add_string (&aux, r->labels[i].labels[j].label,
+                  _("Value Label %zu"), ++n_value_labels);
+
+  /* File header strings. */
+  add_string (&aux, r->header.creation_date, _("Creation Date"));
+  add_string (&aux, r->header.creation_time, _("Creation Time"));
+  add_string (&aux, r->header.eye_catcher, _("Product"));
+  add_string (&aux, r->header.file_label, _("File Label"));
+
+  if (r->extensions[EXT_PRODUCT_INFO])
+    add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
+                _("Extra Product Info"));
+
+  /* Document lines are stored back to back, 80 bytes each, without null
+     terminators, so each one is copied into a terminated buffer first. */
+  if (r->document)
+    for (i = 0; i < r->document->n_lines; i++)
+      {
+        char line[81];
+
+        memcpy (line, r->document->documents + i * 80, 80);
+        line[80] = '\0';
+
+        add_string (&aux, line, _("Document Line %zu"), i + 1);
+      }
+
+  /* Multiple response sets.  Indexing, rather than comparing against an
+     end pointer, avoids pointer arithmetic on r->mrsets when it is null
+     because the file has no MRSETS records. */
+  for (i = 0; i < r->n_mrsets; i++)
+    {
+      const struct sfm_mrset *mrset = &r->mrsets[i];
+      size_t mrset_idx = i + 1;
+
+      add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
+      if (mrset->label[0])
+        add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
+
+      /* Skip the variables because they ought to be duplicates. */
+
+      if (mrset->counted)
+        add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
+                    mrset_idx);
+    }
+
+  /* Not yet retrieved: data file attributes, variable attributes, the long
+     variable name map, long string value labels, and long string missing
+     values. */
+
+  *titlesp = aux.titles;
+  *idsp = aux.ids;
+  *stringsp = aux.strings;
+  return aux.n;
+}
+
/* Decodes the dictionary read from R, saving it into *DICT. Character
strings in R are decoded using ENCODING, or an encoding obtained from R if
ENCODING is null, or the locale encoding if R specifies no encoding.
{
encoding = sfm_get_encoding (r);
if (encoding == NULL)
- encoding = locale_charset ();
+ {
+ sys_warn (r, -1, _("This system file does not indicate its own "
+ "character encoding. Using default encoding "
+ "%s. For best results, specify an encoding "
+ "explicitly. Use SYSFILE INFO with "
+ "ENCODING=\"DETECT\" to analyze the possible "
+ "encodings."),
+ locale_charset ());
+ encoding = locale_charset ();
+ }
}
dict = dict_create (encoding);
/* The following records use short names, so they need to be parsed before
parse_long_var_name_map() changes short names to long names. */
- if (r->extensions[EXT_MRSETS] != NULL)
- parse_mrsets (r, r->extensions[EXT_MRSETS], dict);
-
- if (r->extensions[EXT_MRSETS2] != NULL)
- parse_mrsets (r, r->extensions[EXT_MRSETS2], dict);
+ decode_mrsets (r, dict);
if (r->extensions[EXT_LONG_STRINGS] != NULL
&& !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
|| !read_int (r, &record->missing_value_code)
|| !read_int (r, &record->print_format)
|| !read_int (r, &record->write_format)
- || !read_bytes (r, record->name, sizeof record->name))
+ || !read_string (r, record->name, sizeof record->name))
return false;
if (has_variable_label == 1)
record->pos = r->pos;
if (!read_uint (r, &record->n_labels))
return false;
- if (record->n_labels > SIZE_MAX / sizeof *record->labels)
+ if (record->n_labels > UINT_MAX / sizeof *record->labels)
{
sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
record->n_labels);
if (record->n_vars < 1 || record->n_vars > r->n_vars)
{
sys_error (r, r->pos - 4,
- _("Number of variables associated with a value label (%zu) "
+ _("Number of variables associated with a value label (%u) "
"is not between 1 and the number of variables (%zu)."),
record->n_vars, r->n_vars);
return false;
size_t i;
name = recode_string_pool ("UTF-8", dict_encoding,
- rec->name, 8, r->pool);
+ rec->name, -1, r->pool);
name[strcspn (name, " ")] = '\0';
if (!dict_id_is_valid (dict, name, false)
/* Parses record type 7, subtype 7 or 19. */
+/* Each set found in RECORD is appended to R's 'mrsets' array without
+   recoding or validating its strings.  *ALLOCATED_MRSETS is the allocated
+   capacity of that array and is updated whenever the array grows. */
static void
parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
- struct dictionary *dict)
+ size_t *allocated_mrsets)
{
struct text_record *text;
- struct mrset *mrset;
text = open_text_record (r, record, false);
for (;;)
{
- const char *counted = NULL;
- const char *name;
- const char *label;
- struct stringi_set var_names;
+ struct sfm_mrset *mrset;
size_t allocated_vars;
char delimiter;
- int width;
/* Skip extra line feeds if present. */
while (text_match (text, '\n'))
continue;
- mrset = xzalloc (sizeof *mrset);
+ /* Start a zeroed entry just past the last counted one, growing the array
+    first if it is full.  The entry is only counted by the r->n_mrsets++
+    at the bottom of this loop, so leaving the loop early abandons it. */
+ if (r->n_mrsets >= *allocated_mrsets)
+ r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
+ sizeof *r->mrsets);
+ mrset = &r->mrsets[r->n_mrsets];
+ memset(mrset, 0, sizeof *mrset);
- name = text_get_token (text, ss_cstr ("="), NULL);
- if (name == NULL)
+ mrset->name = text_get_token (text, ss_cstr ("="), NULL);
+ if (mrset->name == NULL)
break;
- mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
-
- if (mrset->name[0] != '$')
- {
- sys_warn (r, record->pos,
- _("`%s' does not begin with `$' at offset %zu "
- "in MRSETS record."), mrset->name, text_pos (text));
- break;
- }
if (text_match (text, 'C'))
{
mrset->label_from_var_label = true;
else if (strcmp (number, "1"))
sys_warn (r, record->pos,
- _("Unexpected label source value `%s' following `E' "
+ _("Unexpected label source value following `E' "
"at offset %zu in MRSETS record."),
- number, text_pos (text));
+ text_pos (text));
}
else
{
if (mrset->type == MRSET_MD)
{
- counted = text_parse_counted_string (r, text);
- if (counted == NULL)
+ mrset->counted = text_parse_counted_string (r, text);
+ if (mrset->counted == NULL)
break;
}
- label = text_parse_counted_string (r, text);
- if (label == NULL)
+ mrset->label = text_parse_counted_string (r, text);
+ if (mrset->label == NULL)
break;
- if (label[0] != '\0')
- mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
- stringi_set_init (&var_names);
allocated_vars = 0;
- width = INT_MAX;
do
{
- const char *raw_var_name;
- struct variable *var;
- char *var_name;
+ const char *var;
- raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
- if (raw_var_name == NULL)
+ var = text_get_token (text, ss_cstr (" \n"), &delimiter);
+ if (var == NULL)
{
if (delimiter != '\n')
sys_warn (r, record->pos,
text_pos (text));
break;
}
- var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
+
+ /* Only the variable's name is recorded here; looking it up in a
+    dictionary is left to decode_mrsets(). */
+ if (mrset->n_vars >= allocated_vars)
+ mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
+ &allocated_vars,
+ sizeof *mrset->vars);
+ mrset->vars[mrset->n_vars++] = var;
+ }
+ while (delimiter != '\n');
+
+ r->n_mrsets++;
+ }
+ close_text_record (r, text);
+}
+
+/* Converts each multiple response set that parse_mrsets() stored in R into
+   a 'struct mrset' and adds it to DICT.  Strings are recoded with
+   recode_string ("UTF-8", r->encoding, ...) and variable names are looked
+   up in DICT.  A set whose name does not begin with `$', or that ends up
+   with fewer than two variables, gets a warning and is skipped. */
+static void
+decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
+{
+ const struct sfm_mrset *s;
+
+ for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
+ {
+ struct stringi_set var_names;
+ struct mrset *mrset;
+ char *name;
+ int width;
+ size_t i;
+
+ name = recode_string ("UTF-8", r->encoding, s->name, -1);
+ if (name[0] != '$')
+ {
+ sys_warn (r, -1, _("Multiple response set name `%s' does not begin "
+ "with `$'."),
+ name);
+ free (name);
+ continue;
+ }
+
+ mrset = xzalloc (sizeof *mrset);
+ mrset->name = name;
+ mrset->type = s->type;
+ mrset->cat_source = s->cat_source;
+ mrset->label_from_var_label = s->label_from_var_label;
+ if (s->label[0] != '\0')
+ mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
+
+ stringi_set_init (&var_names);
+ /* Room for every named variable; the loop below may accept fewer. */
+ mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
+ /* Tracks the smallest width among the accepted variables. */
+ width = INT_MAX;
+ for (i = 0; i < s->n_vars; i++)
+ {
+ struct variable *var;
+ char *var_name;
+
+ var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
var = dict_lookup_var (dict, var_name);
if (var == NULL)
}
if (!stringi_set_insert (&var_names, var_name))
{
- sys_warn (r, record->pos,
- _("Duplicate variable name %s "
- "at offset %zu in MRSETS record."),
- var_name, text_pos (text));
+ sys_warn (r, -1,
+ _("MRSET %s contains duplicate variable name %s."),
+ mrset->name, var_name);
free (var_name);
continue;
}
if (mrset->n_vars
&& var_get_type (var) != var_get_type (mrset->vars[0]))
{
- sys_warn (r, record->pos,
+ sys_warn (r, -1,
_("MRSET %s contains both string and "
- "numeric variables."), name);
+ "numeric variables."), mrset->name);
continue;
}
width = MIN (width, var_get_width (var));
- if (mrset->n_vars >= allocated_vars)
- mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
- sizeof *mrset->vars);
mrset->vars[mrset->n_vars++] = var;
}
- while (delimiter != '\n');
if (mrset->n_vars < 2)
{
- sys_warn (r, record->pos,
- _("MRSET %s has only %zu variables."), mrset->name,
- mrset->n_vars);
+ if (mrset->n_vars == 0)
+ sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
+ else
+ sys_warn (r, -1, _("MRSET %s has only one variable."),
+ mrset->name);
mrset_destroy (mrset);
stringi_set_destroy (&var_names);
continue;
mrset->width = width;
value_init (&mrset->counted, width);
if (width == 0)
- mrset->counted.f = c_strtod (counted, NULL);
+ mrset->counted.f = c_strtod (s->counted, NULL);
else
value_copy_str_rpad (&mrset->counted, width,
- (const uint8_t *) counted, ' ');
+ (const uint8_t *) s->counted, ' ');
}
dict_add_mrset (dict, mrset);
- mrset = NULL;
stringi_set_destroy (&var_names);
}
- mrset_destroy (mrset);
- close_text_record (r, text);
}
/* Read record type 7, subtype 11, which specifies how variables