X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fdata-in.c;h=de19d670484fd8344de1351497684386ca0d3f16;hb=38993354cabb6fc37bb882be92f9a49e9aeb4c88;hp=b9907c2d7303069348a670320957ff099d10260a;hpb=d0371553a98cd169353bf6d211e375e5ffc3a3bd;p=pspp diff --git a/src/data/data-in.c b/src/data/data-in.c index b9907c2d73..de19d67048 100644 --- a/src/data/data-in.c +++ b/src/data/data-in.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,30 +20,34 @@ #include #include +#include #include #include +#include #include #include #include #include -#include #include "calendar.h" +#include "dictionary.h" +#include "format.h" #include "identifier.h" +#include "libpspp/assertion.h" +#include "libpspp/compiler.h" +#include "libpspp/i18n.h" +#include "libpspp/integer-format.h" +#include "libpspp/legacy-encoding.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "libpspp/str.h" #include "settings.h" #include "value.h" -#include -#include -#include -#include -#include -#include -#include -#include "c-ctype.h" -#include "c-strtod.h" -#include "minmax.h" -#include "xalloc.h" +#include "gl/c-ctype.h" +#include "gl/c-strtod.h" +#include "gl/minmax.h" +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -51,10 +55,8 @@ /* Information about parsing one data field. */ struct data_in { - enum legacy_encoding encoding;/* Encoding of source. */ struct substring input; /* Source. */ enum fmt_type format; /* Input format. */ - int implied_decimals; /* Number of implied decimal places. */ union value *output; /* Destination. */ int width; /* Output width. */ @@ -63,44 +65,30 @@ struct data_in int last_column; /* Last column. */ }; -/* Integer format used for IB and PIB input. */ -static enum integer_format input_integer_format = INTEGER_NATIVE; - -/* Floating-point format used for RB and RBHEX input. */ -static enum float_format input_float_format = FLOAT_NATIVE_DOUBLE; - typedef bool data_in_parser_func (struct data_in *); #define FMT(NAME, METHOD, IMIN, OMIN, IO, CATEGORY) \ static data_in_parser_func parse_##METHOD; #include "format.def" -static void vdata_warning (const struct data_in *, const char *, va_list) - PRINTF_FORMAT (2, 0); static void data_warning (const struct data_in *, const char *, ...) PRINTF_FORMAT (2, 3); -static void apply_implied_decimals (struct data_in *); static void default_result (struct data_in *); static bool trim_spaces_and_check_missing (struct data_in *); static int hexit_value (int c); /* Parses the characters in INPUT, which are encoded in the given - ENCODING, according to FORMAT. Stores the parsed - representation in OUTPUT, which has the given WIDTH (0 for - a numeric field, otherwise the string width). + INPUT_ENCODING, according to FORMAT. - If no decimal point is included in a numeric format, then - IMPLIED_DECIMALS decimal places are implied. Specify 0 if no - decimal places should be implied. - - If FIRST_COLUMN is nonzero, then it should be the 1-based - column number of the first character in INPUT, used in error - messages. */ + Stores the parsed representation in OUTPUT, which the caller must have + initialized with the given WIDTH (0 for a numeric field, otherwise the + string width). If FORMAT is FMT_A, then OUTPUT_ENCODING must specify the + correct encoding for OUTPUT (normally obtained via dict_get_encoding()). */ bool -data_in (struct substring input, enum legacy_encoding encoding, - enum fmt_type format, int implied_decimals, - int first_column, union value *output, int width) +data_in (struct substring input, const char *input_encoding, + enum fmt_type format, int first_column, int last_column, + union value *output, int width, const char *output_encoding) { static data_in_parser_func *const handlers[FMT_NUMBER_OF_FORMATS] = { @@ -109,81 +97,170 @@ data_in (struct substring input, enum legacy_encoding encoding, }; struct data_in i; - void *copy = NULL; + + enum fmt_category cat; + const char *dest_encoding; + char *s; bool ok; assert ((width != 0) == fmt_is_string (format)); - if (encoding == LEGACY_NATIVE - || fmt_get_category (format) & (FMT_CAT_BINARY | FMT_CAT_STRING)) - { - i.input = input; - i.encoding = encoding; - } - else - { - ss_alloc_uninit (&i.input, ss_length (input)); - legacy_recode (encoding, ss_data (input), LEGACY_NATIVE, - ss_data (i.input), ss_length (input)); - i.encoding = LEGACY_NATIVE; - copy = ss_data (i.input); - } i.format = format; - i.implied_decimals = implied_decimals; i.output = output; i.width = width; i.first_column = first_column; - i.last_column = first_column + ss_length (input) - 1; + i.last_column = last_column; - if (!ss_is_empty (i.input)) + if (ss_is_empty (input)) { - ok = handlers[i.format] (&i); - if (!ok) - default_result (&i); + default_result (&i); + return true; + } + + cat = fmt_get_category (format); + if (cat & (FMT_CAT_BASIC | FMT_CAT_HEXADECIMAL + | FMT_CAT_DATE | FMT_CAT_TIME | FMT_CAT_DATE_COMPONENT)) + { + /* We're going to parse these into numbers. For this purpose we want to + deal with them in the local "C" encoding. Any character not in that + encoding wouldn't be valid anyhow. */ + dest_encoding = LEGACY_NATIVE; + } + else if (cat & (FMT_CAT_BINARY | FMT_CAT_LEGACY)) + { + /* Don't recode these binary formats at all, since they are not text. */ + dest_encoding = NULL; } else { - default_result (&i); - ok = true; + assert (cat == FMT_CAT_STRING); + if (format == FMT_AHEX) + { + /* We want the hex digits in the local "C" encoding, even though the + result may not be in that encoding. */ + dest_encoding = LEGACY_NATIVE; + } + else + { + /* Use the final output encoding. */ + dest_encoding = output_encoding; + } } - if (copy) - free (copy); + if (dest_encoding != NULL) + { + i.input = recode_substring_pool (dest_encoding, input_encoding, input, + NULL); + s = i.input.string; + } + else + { + i.input = input; + s = NULL; + } + + ok = handlers[i.format] (&i); + if (!ok) + default_result (&i); + + free (s); return ok; } -/* Returns the integer format used for IB and PIB input. */ -enum integer_format -data_in_get_integer_format (void) +static bool +number_has_implied_decimals (const char *s, enum fmt_type type) { - return input_integer_format; -} + int decimal = settings_get_style (type)->decimal; + bool got_digit = false; + for (;;) + { + switch (*s) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + got_digit = true; + break; -/* Sets the integer format used for IB and PIB input to - FORMAT. */ -void -data_in_set_integer_format (enum integer_format format) -{ - input_integer_format = format; + case '+': case '-': + if (got_digit) + return false; + break; + + case 'e': case 'E': case 'd': case 'D': + return false; + + case '.': case ',': + if (*s == decimal) + return false; + break; + + case '\0': + return true; + + default: + break; + } + + s++; + } } -/* Returns the floating-point format used for RB and RBHEX - input. */ -enum float_format -data_in_get_float_format (void) +static bool +has_implied_decimals (struct substring input, const char *input_encoding, + enum fmt_type format) { - return input_float_format; + bool retval; + char *s; + + switch (format) + { + case FMT_F: + case FMT_COMMA: + case FMT_DOT: + case FMT_DOLLAR: + case FMT_PCT: + case FMT_E: + case FMT_Z: + break; + + case FMT_N: + case FMT_IB: + case FMT_PIB: + case FMT_P: + case FMT_PK: + return true; + + default: + return false; + } + + s = recode_string (LEGACY_NATIVE, input_encoding, + ss_data (input), ss_length (input)); + retval = (format == FMT_Z + ? strchr (s, '.') == NULL + : number_has_implied_decimals (s, format)); + free (s); + + return retval; } -/* Sets the floating-point format used for RB and RBHEX input to - FORMAT. */ +/* In some cases, when no decimal point is explicitly included in numeric + input, its position is implied by the number of decimal places in the input + format. In such a case, this function may be called just after data_in(). + Its arguments are a subset of that function's arguments plus D, the number + of decimal places associated with FORMAT. + + If it is appropriate, this function modifies the numeric value in OUTPUT. */ void -data_in_set_float_format (enum float_format format) +data_in_imply_decimals (struct substring input, const char *input_encoding, + enum fmt_type format, int d, union value *output) { - input_float_format = format; + if (d > 0 && output->f != SYSMIS + && has_implied_decimals (input, input_encoding, format)) + output->f /= pow (10., d); } /* Format parsers. */ @@ -192,7 +269,8 @@ data_in_set_float_format (enum float_format format) static bool parse_number (struct data_in *i) { - const struct fmt_number_style *style = fmt_get_style (i->format); + const struct fmt_number_style *style = + settings_get_style (i->format); struct string tmp; @@ -200,7 +278,10 @@ parse_number (struct data_in *i) int save_errno; char *tail; - assert (fmt_get_category (i->format) != FMT_CAT_CUSTOM); + if (fmt_get_category (i->format) == FMT_CAT_CUSTOM) + { + style = settings_get_style (FMT_F); + } /* Trim spaces and check for missing value representation. */ if (trim_spaces_and_check_missing (i)) @@ -318,8 +399,6 @@ parse_number (struct data_in *i) else { errno = save_errno; - if (!explicit_decimals) - apply_implied_decimals (i); } ds_destroy (&tmp); @@ -343,7 +422,6 @@ parse_N (struct data_in *i) i->output->f = i->output->f * 10.0 + (c - '0'); } - apply_implied_decimals (i); return true; } @@ -499,11 +577,7 @@ parse_Z (struct data_in *i) } } else - { - errno = save_errno; - if (!got_dot) - apply_implied_decimals (i); - } + errno = save_errno; ds_destroy (&tmp); return true; @@ -518,7 +592,7 @@ parse_IB (struct data_in *i) uint64_t sign_bit; bytes = MIN (8, ss_length (i->input)); - value = integer_get (input_integer_format, ss_data (i->input), bytes); + value = integer_get (settings_get_input_integer_format (), ss_data (i->input), bytes); sign_bit = UINT64_C(1) << (8 * bytes - 1); if (!(value & sign_bit)) @@ -530,8 +604,6 @@ parse_IB (struct data_in *i) i->output->f = -(double) -value; } - apply_implied_decimals (i); - return true; } @@ -539,11 +611,9 @@ parse_IB (struct data_in *i) static bool parse_PIB (struct data_in *i) { - i->output->f = integer_get (input_integer_format, ss_data (i->input), + i->output->f = integer_get (settings_get_input_integer_format (), ss_data (i->input), MIN (8, ss_length (i->input))); - apply_implied_decimals (i); - return true; } @@ -583,8 +653,6 @@ parse_P (struct data_in *i) else if (low_nibble == 0xb || low_nibble == 0xd) i->output->f = -i->output->f; - apply_implied_decimals (i); - return true; } @@ -606,8 +674,6 @@ parse_PK (struct data_in *i) i->output->f = (100 * i->output->f) + (10 * high_nibble) + low_nibble; } - apply_implied_decimals (i); - return true; } @@ -615,9 +681,10 @@ parse_PK (struct data_in *i) static bool parse_RB (struct data_in *i) { - size_t size = float_get_size (input_float_format); + enum float_format ff = settings_get_input_float_format (); + size_t size = float_get_size (ff); if (ss_length (i->input) >= size) - float_convert (input_float_format, ss_data (i->input), + float_convert (ff, ss_data (i->input), FLOAT_NATIVE_DOUBLE, &i->output->f); else i->output->f = SYSMIS; @@ -631,12 +698,13 @@ parse_A (struct data_in *i) { /* This is equivalent to buf_copy_rpad, except that we posibly do a character set recoding in the middle. */ - char *dst = i->output->s; + uint8_t *dst = value_str_rw (i->output, i->width); size_t dst_size = i->width; const char *src = ss_data (i->input); size_t src_size = ss_length (i->input); - legacy_recode (i->encoding, src, LEGACY_NATIVE, dst, MIN (src_size, dst_size)); + memcpy (dst, src, MIN (src_size, dst_size)); + if (dst_size > src_size) memset (&dst[src_size], ' ', dst_size - src_size); @@ -647,6 +715,7 @@ parse_A (struct data_in *i) static bool parse_AHEX (struct data_in *i) { + uint8_t *s = value_str_rw (i->output, i->width); size_t j; for (j = 0; ; j++) @@ -661,11 +730,6 @@ parse_AHEX (struct data_in *i) return false; } - if (i->encoding != LEGACY_NATIVE) - { - hi = legacy_to_native (i->encoding, hi); - lo = legacy_to_native (i->encoding, lo); - } if (!c_isxdigit (hi) || !c_isxdigit (lo)) { data_warning (i, _("Field must contain only hex digits.")); @@ -673,10 +737,10 @@ parse_AHEX (struct data_in *i) } if (j < i->width) - i->output->s[j] = hexit_value (hi) * 16 + hexit_value (lo); + s[j] = hexit_value (hi) * 16 + hexit_value (lo); } - memset (i->output->s + j, ' ', i->width - j); + memset (&s[j], ' ', i->width - j); return true; } @@ -795,7 +859,7 @@ parse_name_token (struct data_in *i) exact matches (except for case) are allowed. Returns true if successful, false otherwise. */ static bool -match_name (struct substring token, const char **names, long *output) +match_name (struct substring token, const char *const *names, long *output) { int i; @@ -824,14 +888,14 @@ parse_month (struct data_in *i, long *month) } else { - static const char *english_names[] = + static const char *const english_names[] = { "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", NULL, }; - static const char *roman_names[] = + static const char *const roman_names[] = { "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", @@ -860,7 +924,7 @@ parse_year (struct data_in *i, long *year, size_t max_digits) if (*year >= 0 && *year <= 99) { - int epoch = get_epoch (); + int epoch = settings_get_epoch (); int epoch_century = ROUND_DOWN (epoch, 100); int epoch_offset = epoch - epoch_century; if (*year >= epoch_offset) @@ -883,7 +947,7 @@ parse_trailer (struct data_in *i) if (ss_is_empty (i->input)) return true; - data_warning (i, _("Trailing garbage \"%.*s\" following date."), + data_warning (i, _("Trailing garbage `%.*s' following date."), (int) ss_length (i->input), ss_data (i->input)); return false; } @@ -995,7 +1059,7 @@ parse_minute_second (struct data_in *i, double *time) cp = buf; while (c_isdigit (ss_first (i->input))) *cp++ = ss_get_char (&i->input); - if (ss_match_char (&i->input, fmt_decimal_char (FMT_F))) + if (ss_match_char (&i->input, settings_get_decimal_char (FMT_F))) *cp++ = '.'; while (c_isdigit (ss_first (i->input))) *cp++ = ss_get_char (&i->input); @@ -1012,7 +1076,7 @@ parse_minute_second (struct data_in *i, double *time) static bool parse_weekday (struct data_in *i, long *weekday) { - static const char *weekday_names[] = + static const char *const weekday_names[] = { "su", "mo", "tu", "we", "th", "fr", "sa", NULL, @@ -1029,19 +1093,6 @@ parse_weekday (struct data_in *i, long *weekday) /* Date & time formats. */ -/* Helper function for passing to - calendar_gregorian_to_offset. */ -static void -calendar_error (void *i_, const char *format, ...) -{ - struct data_in *i = i_; - va_list args; - - va_start (args, format); - vdata_warning (i, format, args); - va_end (args); -} - /* Parses WKDAY format. */ static bool parse_WKDAY (struct data_in *i) @@ -1172,10 +1223,16 @@ parse_date (struct data_in *i) if (year != INT_MIN) { - double ofs = calendar_gregorian_to_offset (year, month, day, - calendar_error, i); + char *error; + double ofs; + + ofs = calendar_gregorian_to_offset (year, month, day, &error); if (ofs == SYSMIS) - return false; + { + data_warning (i, "%s", error); + free (error); + return false; + } date = (yday - 1 + ofs) * 60. * 60. * 24.; } else @@ -1184,58 +1241,37 @@ parse_date (struct data_in *i) return true; } + /* Utility functions. */ -/* Outputs FORMAT with the given ARGS as a warning for input - I. */ +/* Outputs FORMAT with as a warning for input I. */ static void -vdata_warning (const struct data_in *i, const char *format, va_list args) +data_warning (const struct data_in *i, const char *format, ...) { + va_list args; struct msg m; struct string text; ds_init_empty (&text); ds_put_char (&text, '('); - if (i->first_column != 0) - { - if (i->first_column == i->last_column) - ds_put_format (&text, _("column %d"), i->first_column); - else - ds_put_format (&text, _("columns %d-%d"), - i->first_column, i->last_column); - ds_put_cstr (&text, ", "); - } ds_put_format (&text, _("%s field) "), fmt_name (i->format)); + + va_start (args, format); ds_put_vformat (&text, format, args); + va_end (args); - m.category = MSG_DATA; - m.severity = MSG_WARNING; + m.category = MSG_C_DATA; + m.severity = MSG_S_WARNING; m.text = ds_cstr (&text); + m.where.file_name = NULL; + m.where.line_number = 0; + m.where.first_column = i->first_column; + m.where.last_column = i->last_column; msg_emit (&m); } -/* Outputs FORMAT with the given ARGS as a warning for input - I. */ -static void -data_warning (const struct data_in *i, const char *format, ...) -{ - va_list args; - - va_start (args, format); - vdata_warning (i, format, args); - va_end (args); -} - -/* Apply implied decimal places to output. */ -static void -apply_implied_decimals (struct data_in *i) -{ - if (i->implied_decimals > 0) - i->output->f /= pow (10., i->implied_decimals); -} - /* Sets the default result for I. For a numeric format, this is the value set on SET BLANKS (typically system-missing); for a string format, it is all @@ -1244,9 +1280,9 @@ static void default_result (struct data_in *i) { if (fmt_is_string (i->format)) - memset (i->output->s, ' ', i->width); + memset (value_str_rw (i->output, i->width), ' ', i->width); else - i->output->f = get_blanks (); + i->output->f = settings_get_blanks (); } /* Trims leading and trailing spaces from I.