X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=9bb1c775783488b96275b69a405ae63ae16b947f;hb=78cffcbf683cd66a6717d40854825d731fd4f01d;hp=3ddd633ce63e5ffe1f313a6e33fa18248ce622d5;hpb=9b43ed0de590acc1926e4787c74c86870577c65a;p=pspp diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 3ddd633ce6..9bb1c77578 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -110,7 +110,7 @@ struct sfm_var_record { off_t pos; int width; - char name[8]; + char name[9]; int print_format; int write_format; int missing_value_code; @@ -142,12 +142,26 @@ struct sfm_document_record size_t n_lines; }; +struct sfm_mrset + { + const char *name; /* Name. */ + const char *label; /* Human-readable label for group. */ + enum mrset_type type; /* Group type. */ + const char **vars; /* Constituent variables' names. */ + size_t n_vars; /* Number of constituent variables. */ + + /* MRSET_MD only. */ + enum mrset_md_cat_source cat_source; /* Source of category labels. */ + bool label_from_var_label; /* 'label' taken from variable label? */ + const char *counted; /* Counted value, as string. */ + }; + struct sfm_extension_record { int subtype; /* Record subtype. */ off_t pos; /* Starting offset in file. */ - size_t size; /* Size of data elements. */ - size_t count; /* Number of data elements. */ + unsigned int size; /* Size of data elements. */ + unsigned int count; /* Number of data elements. */ void *data; /* Contents. */ }; @@ -165,6 +179,8 @@ struct sfm_reader struct sfm_value_label_record *labels; size_t n_labels; struct sfm_document_record *document; + struct sfm_mrset *mrsets; + size_t n_mrsets; struct sfm_extension_record *extensions[32]; /* File state. */ @@ -320,7 +336,8 @@ static void parse_extra_product_info (struct sfm_reader *, struct sfm_read_info *); static void parse_mrsets (struct sfm_reader *, const struct sfm_extension_record *, - struct dictionary *); + size_t *allocated_mrsets); +static void decode_mrsets (struct sfm_reader *, struct dictionary *); static void parse_long_var_name_map (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); @@ -363,6 +380,7 @@ sfm_read_info_destroy (struct sfm_read_info *info) struct sfm_reader * sfm_open (struct file_handle *fh) { + size_t allocated_mrsets = 0; struct sfm_reader *r; /* Create and initialize reader. */ @@ -389,6 +407,12 @@ sfm_open (struct file_handle *fh) if (!read_dictionary (r)) goto error; + if (r->extensions[EXT_MRSETS] != NULL) + parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets); + + if (r->extensions[EXT_MRSETS2] != NULL) + parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets); + return r; error: sfm_close (r); @@ -543,6 +567,156 @@ sfm_get_encoding (const struct sfm_reader *r) return NULL; } +struct get_strings_aux + { + struct pool *pool; + char **titles; + char **strings; + bool *ids; + size_t allocated; + size_t n; + }; + +static void +add_string__ (struct get_strings_aux *aux, + const char *string, bool id, char *title) +{ + if (aux->n >= aux->allocated) + { + aux->allocated = 2 * (aux->allocated + 1); + aux->titles = pool_realloc (aux->pool, aux->titles, + aux->allocated * sizeof *aux->titles); + aux->strings = pool_realloc (aux->pool, aux->strings, + aux->allocated * sizeof *aux->strings); + aux->ids = pool_realloc (aux->pool, aux->ids, + aux->allocated * sizeof *aux->ids); + } + + aux->titles[aux->n] = title; + aux->strings[aux->n] = pool_strdup (aux->pool, string); + aux->ids[aux->n] = id; + aux->n++; +} + +static void PRINTF_FORMAT (3, 4) +add_string (struct get_strings_aux *aux, + const char *string, const char *title, ...) +{ + va_list args; + + va_start (args, title); + add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args)); + va_end (args); +} + +static void PRINTF_FORMAT (3, 4) +add_id (struct get_strings_aux *aux, const char *id, const char *title, ...) +{ + va_list args; + + va_start (args, title); + add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args)); + va_end (args); +} + +/* Retrieves significant string data from R in its raw format, to allow the + caller to try to detect the encoding in use. + + Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP, + and *STRINGSP to an array of N elements allocated from POOL. For each I in + 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in + whatever encoding system file R uses. *IDS[I] is true if *STRINGSP[I] must + be a valid PSPP language identifier, false if *STRINGSP[I] is free-form + text. */ +size_t +sfm_get_strings (const struct sfm_reader *r, struct pool *pool, + char ***titlesp, bool **idsp, char ***stringsp) +{ + const struct sfm_mrset *mrset; + struct get_strings_aux aux; + size_t var_idx; + size_t i, j, k; + + aux.pool = pool; + aux.titles = NULL; + aux.strings = NULL; + aux.ids = NULL; + aux.allocated = 0; + aux.n = 0; + + var_idx = 0; + for (i = 0; i < r->n_vars; i++) + if (r->vars[i].width != -1) + add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx); + + var_idx = 0; + for (i = 0; i < r->n_vars; i++) + if (r->vars[i].width != -1) + { + var_idx++; + if (r->vars[i].label) + add_string (&aux, r->vars[i].label, _("Variable %zu Label"), + var_idx); + } + + k = 0; + for (i = 0; i < r->n_labels; i++) + for (j = 0; j < r->labels[i].n_labels; j++) + add_string (&aux, r->labels[i].labels[j].label, + _("Value Label %zu"), k++); + + add_string (&aux, r->header.creation_date, _("Creation Date")); + add_string (&aux, r->header.creation_time, _("Creation Time")); + add_string (&aux, r->header.eye_catcher, _("Product")); + add_string (&aux, r->header.file_label, _("File Label")); + + if (r->extensions[EXT_PRODUCT_INFO]) + add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data, + _("Extra Product Info")); + + if (r->document) + { + size_t i; + + for (i = 0; i < r->document->n_lines; i++) + { + char line[81]; + + memcpy (line, r->document->documents + i * 80, 80); + line[80] = '\0'; + + add_string (&aux, line, _("Document Line %zu"), i + 1); + } + } + + for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++) + { + size_t mrset_idx = mrset - r->mrsets + 1; + + add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx); + if (mrset->label[0]) + add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx); + + /* Skip the variables because they ought to be duplicates. */ + + if (mrset->counted) + add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"), + mrset_idx); + } + + /* */ + /* data file attributes */ + /* variable attributes */ + /* long var map */ + /* long string value labels */ + /* long string missing values */ + + *titlesp = aux.titles; + *idsp = aux.ids; + *stringsp = aux.strings; + return aux.n; +} + /* Decodes the dictionary read from R, saving it into into *DICT. Character strings in R are decoded using ENCODING, or an encoding obtained from R if ENCODING is null, or the locale encoding if R specifies no encoding. @@ -564,7 +738,16 @@ sfm_decode (struct sfm_reader *r, const char *encoding, { encoding = sfm_get_encoding (r); if (encoding == NULL) - encoding = locale_charset (); + { + sys_warn (r, -1, _("This system file does not indicate its own " + "character encoding. Using default encoding " + "%s. For best results, specify an encoding " + "explicitly. Use SYSFILE INFO with " + "ENCODING=\"DETECT\" to analyze the possible " + "encodings."), + locale_charset ()); + encoding = locale_charset (); + } } dict = dict_create (encoding); @@ -621,11 +804,7 @@ sfm_decode (struct sfm_reader *r, const char *encoding, /* The following records use short names, so they need to be parsed before parse_long_var_name_map() changes short names to long names. */ - if (r->extensions[EXT_MRSETS] != NULL) - parse_mrsets (r, r->extensions[EXT_MRSETS], dict); - - if (r->extensions[EXT_MRSETS2] != NULL) - parse_mrsets (r, r->extensions[EXT_MRSETS2], dict); + decode_mrsets (r, dict); if (r->extensions[EXT_LONG_STRINGS] != NULL && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict)) @@ -886,12 +1065,12 @@ read_variable_record (struct sfm_reader *r, struct sfm_var_record *record) || !read_int (r, &record->missing_value_code) || !read_int (r, &record->print_format) || !read_int (r, &record->write_format) - || !read_bytes (r, record->name, sizeof record->name)) + || !read_string (r, record->name, sizeof record->name)) return false; if (has_variable_label == 1) { - enum { MAX_LABEL_LEN = 255 }; + enum { MAX_LABEL_LEN = 65536 }; unsigned int len, read_len; if (!read_uint (r, &len)) @@ -962,9 +1141,9 @@ read_value_label_record (struct sfm_reader *r, record->pos = r->pos; if (!read_uint (r, &record->n_labels)) return false; - if (record->n_labels > SIZE_MAX / sizeof *record->labels) + if (record->n_labels > UINT_MAX / sizeof *record->labels) { - sys_error (r, r->pos - 4, _("Invalid number of labels %zu."), + sys_error (r, r->pos - 4, _("Invalid number of labels %u."), record->n_labels); return false; } @@ -1009,7 +1188,7 @@ read_value_label_record (struct sfm_reader *r, if (record->n_vars < 1 || record->n_vars > r->n_vars) { sys_error (r, r->pos - 4, - _("Number of variables associated with a value label (%zu) " + _("Number of variables associated with a value label (%u) " "is not between 1 and the number of variables (%zu)."), record->n_vars, r->n_vars); return false; @@ -1127,11 +1306,11 @@ read_extension_record (struct sfm_reader *r, int subtype, { if (type->size > 0 && record->size != type->size) sys_warn (r, record->pos, - _("Record type 7, subtype %d has bad size %zu " + _("Record type 7, subtype %d has bad size %u " "(expected %d)."), subtype, record->size, type->size); else if (type->count > 0 && record->count != type->count) sys_warn (r, record->pos, - _("Record type 7, subtype %d has bad count %zu " + _("Record type 7, subtype %d has bad count %u " "(expected %d)."), subtype, record->count, type->count); else if (type->count == 0 && type->size == 0) { @@ -1222,7 +1401,7 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, size_t i; name = recode_string_pool ("UTF-8", dict_encoding, - rec->name, 8, r->pool); + rec->name, -1, r->pool); name[strcspn (name, " ")] = '\0'; if (!dict_id_is_valid (dict, name, false) @@ -1260,7 +1439,7 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, utf8_label = recode_string_pool ("UTF-8", dict_encoding, rec->label, -1, r->pool); - var_set_label (var, utf8_label, false); + var_set_label (var, utf8_label); } /* Set missing values. */ @@ -1500,40 +1679,30 @@ parse_extra_product_info (struct sfm_reader *r, /* Parses record type 7, subtype 7 or 19. */ static void parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, - struct dictionary *dict) + size_t *allocated_mrsets) { struct text_record *text; - struct mrset *mrset; text = open_text_record (r, record, false); for (;;) { - const char *counted = NULL; - const char *name; - const char *label; - struct stringi_set var_names; + struct sfm_mrset *mrset; size_t allocated_vars; char delimiter; - int width; /* Skip extra line feeds if present. */ while (text_match (text, '\n')) continue; - mrset = xzalloc (sizeof *mrset); + if (r->n_mrsets >= *allocated_mrsets) + r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets, + sizeof *r->mrsets); + mrset = &r->mrsets[r->n_mrsets]; + memset(mrset, 0, sizeof *mrset); - name = text_get_token (text, ss_cstr ("="), NULL); - if (name == NULL) + mrset->name = text_get_token (text, ss_cstr ("="), NULL); + if (mrset->name == NULL) break; - mrset->name = recode_string ("UTF-8", r->encoding, name, -1); - - if (mrset->name[0] != '$') - { - sys_warn (r, record->pos, - _("`%s' does not begin with `$' at offset %zu " - "in MRSETS record."), mrset->name, text_pos (text)); - break; - } if (text_match (text, 'C')) { @@ -1570,9 +1739,9 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, mrset->label_from_var_label = true; else if (strcmp (number, "1")) sys_warn (r, record->pos, - _("Unexpected label source value `%s' following `E' " + _("Unexpected label source value following `E' " "at offset %zu in MRSETS record."), - number, text_pos (text)); + text_pos (text)); } else { @@ -1585,28 +1754,22 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, if (mrset->type == MRSET_MD) { - counted = text_parse_counted_string (r, text); - if (counted == NULL) + mrset->counted = text_parse_counted_string (r, text); + if (mrset->counted == NULL) break; } - label = text_parse_counted_string (r, text); - if (label == NULL) + mrset->label = text_parse_counted_string (r, text); + if (mrset->label == NULL) break; - if (label[0] != '\0') - mrset->label = recode_string ("UTF-8", r->encoding, label, -1); - stringi_set_init (&var_names); allocated_vars = 0; - width = INT_MAX; do { - const char *raw_var_name; - struct variable *var; - char *var_name; + const char *var; - raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter); - if (raw_var_name == NULL) + var = text_get_token (text, ss_cstr (" \n"), &delimiter); + if (var == NULL) { if (delimiter != '\n') sys_warn (r, record->pos, @@ -1615,7 +1778,60 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, text_pos (text)); break; } - var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1); + + if (mrset->n_vars >= allocated_vars) + mrset->vars = pool_2nrealloc (r->pool, mrset->vars, + &allocated_vars, + sizeof *mrset->vars); + mrset->vars[mrset->n_vars++] = var; + } + while (delimiter != '\n'); + + r->n_mrsets++; + } + close_text_record (r, text); +} + +static void +decode_mrsets (struct sfm_reader *r, struct dictionary *dict) +{ + const struct sfm_mrset *s; + + for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++) + { + struct stringi_set var_names; + struct mrset *mrset; + char *name; + int width; + size_t i; + + name = recode_string ("UTF-8", r->encoding, s->name, -1); + if (name[0] != '$') + { + sys_warn (r, -1, _("Multiple response set name `%s' does not begin " + "with `$'."), + name); + free (name); + continue; + } + + mrset = xzalloc (sizeof *mrset); + mrset->name = name; + mrset->type = s->type; + mrset->cat_source = s->cat_source; + mrset->label_from_var_label = s->label_from_var_label; + if (s->label[0] != '\0') + mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1); + + stringi_set_init (&var_names); + mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars); + width = INT_MAX; + for (i = 0; i < s->n_vars; i++) + { + struct variable *var; + char *var_name; + + var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1); var = dict_lookup_var (dict, var_name); if (var == NULL) @@ -1625,10 +1841,9 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, } if (!stringi_set_insert (&var_names, var_name)) { - sys_warn (r, record->pos, - _("Duplicate variable name %s " - "at offset %zu in MRSETS record."), - var_name, text_pos (text)); + sys_warn (r, -1, + _("MRSET %s contains duplicate variable name %s."), + mrset->name, var_name); free (var_name); continue; } @@ -1641,25 +1856,23 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, if (mrset->n_vars && var_get_type (var) != var_get_type (mrset->vars[0])) { - sys_warn (r, record->pos, + sys_warn (r, -1, _("MRSET %s contains both string and " - "numeric variables."), name); + "numeric variables."), mrset->name); continue; } width = MIN (width, var_get_width (var)); - if (mrset->n_vars >= allocated_vars) - mrset->vars = x2nrealloc (mrset->vars, &allocated_vars, - sizeof *mrset->vars); mrset->vars[mrset->n_vars++] = var; } - while (delimiter != '\n'); if (mrset->n_vars < 2) { - sys_warn (r, record->pos, - _("MRSET %s has only %zu variables."), mrset->name, - mrset->n_vars); + if (mrset->n_vars == 0) + sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name); + else + sys_warn (r, -1, _("MRSET %s has only one variable."), + mrset->name); mrset_destroy (mrset); stringi_set_destroy (&var_names); continue; @@ -1670,18 +1883,15 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, mrset->width = width; value_init (&mrset->counted, width); if (width == 0) - mrset->counted.f = c_strtod (counted, NULL); + mrset->counted.f = c_strtod (s->counted, NULL); else value_copy_str_rpad (&mrset->counted, width, - (const uint8_t *) counted, ' '); + (const uint8_t *) s->counted, ' '); } dict_add_mrset (dict, mrset); - mrset = NULL; stringi_set_destroy (&var_names); } - mrset_destroy (mrset); - close_text_record (r, text); } /* Read record type 7, subtype 11, which specifies how variables @@ -1705,7 +1915,7 @@ parse_display_parameters (struct sfm_reader *r, else { sys_warn (r, record->pos, - _("Extension 11 has bad count %zu (for %zu variables)."), + _("Extension 11 has bad count %u (for %zu variables)."), record->count, n_vars); return; }