X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fformat-guesser.c;h=a9953a532cce8442362ba0193566773014f5b8ec;hb=142f8f8814423f76523825f8df060e2fa9d2a2b6;hp=78af8dfa269ed3ee70ff35782799cdddcc973d93;hpb=3e3d825afe59ad43699664a74ca04e2e1b836786;p=pspp diff --git a/src/data/format-guesser.c b/src/data/format-guesser.c index 78af8dfa26..a9953a532c 100644 --- a/src/data/format-guesser.c +++ b/src/data/format-guesser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,19 +16,19 @@ #include -#include "format-guesser.h" +#include "data/format-guesser.h" #include #include -#include "c-ctype.h" -#include "minmax.h" -#include "xalloc.h" +#include "data/format.h" +#include "data/settings.h" +#include "libpspp/assertion.h" +#include "libpspp/str.h" -#include -#include -#include -#include +#include "gl/c-ctype.h" +#include "gl/minmax.h" +#include "gl/xalloc.h" /* A token in which potential date or time fields are broken. @@ -70,7 +70,7 @@ struct date_syntax { enum fmt_type format; /* Format type. */ #define MAX_TOKENS 11 - size_t token_cnt; /* Number of tokens. */ + size_t n_tokens; /* Number of tokens. */ enum date_token tokens[MAX_TOKENS]; /* Tokens. 
*/ }; @@ -120,6 +120,15 @@ static struct date_syntax syntax[] = 11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} }, + /* yyyy-mmm-dd HH:MM */ + { FMT_YMDHMS, + 9, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR, + DT_COLON, DT_MINUTE} }, + /* yyyy-mmm-dd HH:MM:SS */ + { FMT_YMDHMS, + 11, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR, + DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} }, + /* HH:MM */ { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} }, /* HH:MM:SS */ @@ -135,8 +144,13 @@ static struct date_syntax syntax[] = /* www */ { FMT_WKDAY, 1, {DT_WEEKDAY} }, - /* mmm */ - { FMT_MONTH, 1, {DT_MONTH} }, + /* mmm + + We require a spelled-out English month so that + single-character Roman numerals like "i" and "x" don't get + detected as months. The latter is particularly common in + the password field of /etc/passwd-like files. */ + { FMT_MONTH, 1, {DT_ENGLISH_MONTH} }, }; /* Number of recognized date syntax formats. */ @@ -299,12 +313,12 @@ add_numeric (struct fmt_guesser *g, struct substring s) int c; /* Skip leading "$" and optional following white space. */ - has_dollar = ss_match_char (&s, '$'); + has_dollar = ss_match_byte (&s, '$'); if (has_dollar) ss_ltrim (&s, ss_cstr (CC_SPACES)); /* Skip optional sign. */ - ss_match_char_in (&s, ss_cstr ("+-")); + ss_match_byte_in (&s, ss_cstr ("+-")); /* Skip digits punctuated by commas and dots. We don't know whether the decimal point is a comma or a dot, so for now we @@ -320,7 +334,7 @@ add_numeric (struct fmt_guesser *g, struct substring s) if (dots || commas) delim_digits++; } - else if (c == '.' ) + else if (c == '.') { dots++; prev_delim = c; @@ -343,10 +357,10 @@ add_numeric (struct fmt_guesser *g, struct substring s) } /* Skip the optional exponent. 
*/ - has_exp = ss_match_char_in (&s, ss_cstr ("eEdD")) != EOF; - has_exp_sign = ss_match_char_in (&s, ss_cstr ("-+")) != EOF; + has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF; + has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF; if (has_exp_sign) - ss_match_char (&s, ' '); + ss_match_byte (&s, ' '); exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS)); if ((has_exp || has_exp_sign) && !exp_digits) { @@ -356,7 +370,7 @@ add_numeric (struct fmt_guesser *g, struct substring s) } /* Skip optional '%'. */ - has_percent = ss_match_char (&s, '%'); + has_percent = ss_match_byte (&s, '%'); if (has_dollar && has_percent) { /* A valid number cannot have both '$' and '%'. */ @@ -390,7 +404,7 @@ add_numeric (struct fmt_guesser *g, struct substring s) can't tell whether the ',' or '.' is a grouping or decimal character. Assume that the decimal character from the settings is in use. */ - if (prev_delim == settings_get_decimal_char (FMT_F)) + if (prev_delim == settings_get_fmt_settings ()->decimal) { decimal = prev_delim; precision = delim_digits; @@ -434,7 +448,7 @@ add_numeric (struct fmt_guesser *g, struct substring s) static void guess_numeric (struct fmt_guesser *g, struct fmt_spec *f) { - int decimal_char = settings_get_decimal_char (FMT_COMMA); + int decimal_char = settings_get_fmt_settings ()->decimal; f->d = g->decimals / g->count; if (g->pct) @@ -452,11 +466,15 @@ guess_numeric (struct fmt_guesser *g, struct fmt_spec *f) } /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR, - MOYR, WKYR, or DATETIME), time (TIME or DTIME), or date - component (WKDAY or MONTH) format. If successful, increments - G's any_date counter and the counter or counters for the - specific format(s) that S matches. On failure, does not - modify G. + MOYR, WKYR, DATETIME, or YMDHMS), time (TIME or DTIME), or + date component (WKDAY or MONTH) format. If successful, + increments G's any_date counter and the counter or counters + for the specific format(s) that S matches. 
On failure, does + not modify G. + + XXX How can we distinguish MTIME from TIME? One way might be + that TIME can have three parts (HH:MM:SS) but MTIME only ever + has two (MM:SS). Does not attempt to recognize JDATE format: it looks just like F format and will thus be caught by the numeric parser. @@ -473,27 +491,27 @@ add_date_time (struct fmt_guesser *g, struct substring s) enum date_token token; enum date_token tokens[MAX_TOKENS]; enum date_token tokens_seen; - size_t token_cnt; + size_t n_tokens; int decimals; bool is_date; int i; /* Break S into tokens. */ - token_cnt = 0; + n_tokens = 0; tokens_seen = 0; decimals = 0; while (!ss_is_empty (s)) { - if (token_cnt >= MAX_TOKENS) + if (n_tokens >= MAX_TOKENS) return; token = parse_date_token (&s, tokens_seen, &decimals); if (token == 0) return; - tokens[token_cnt++] = token; + tokens[n_tokens++] = token; tokens_seen |= token; } - if (token_cnt == 0) + if (n_tokens == 0) return; /* Find matching date formats, if any, and increment the @@ -502,7 +520,7 @@ add_date_time (struct fmt_guesser *g, struct substring s) for (i = 0; i < DATE_SYNTAX_CNT; i++) { struct date_syntax *s = &syntax[i]; - if (match_date_syntax (tokens, token_cnt, s->tokens, s->token_cnt)) + if (match_date_syntax (tokens, n_tokens, s->tokens, s->n_tokens)) { is_date = true; g->date[i]++; @@ -566,12 +584,12 @@ guess_date_time (struct fmt_guesser *g, struct fmt_spec *f) (We use the minimum input width, but an output width would be equally appropriate, since all the time formats have the same minimum widths for input and output.) 
*/ - if (f->type == FMT_DATETIME || f->type == FMT_TIME - || f->type == FMT_DTIME) + if (f->type == FMT_DATETIME || f->type == FMT_YMDHMS + || f->type == FMT_MTIME || f->type == FMT_TIME || f->type == FMT_DTIME) { for (i = 0; i < DATE_SYNTAX_CNT; i++) if (g->date[i] - && syntax[i].tokens[syntax[i].token_cnt - 1] == DT_SECOND) + && syntax[i].tokens[syntax[i].n_tokens - 1] == DT_SECOND) { f->d = g->decimals / g->count; f->w = MAX (f->w, fmt_min_input_width (f->type) + 3); @@ -630,7 +648,7 @@ parse_date_token (struct substring *s, enum date_token tokens_seen, ss_advance (s, 1); token = recognize_identifier_token (s); if (token) - ss_match_char_in (s, ss_cstr (CC_SPACES)); + ss_match_byte_in (s, ss_cstr (CC_SPACES)); else token = DT_DELIM | DT_SPACE; return token; @@ -658,10 +676,10 @@ parse_date_number (struct substring *s, enum date_token tokens_seen, int *decimals) { long int value; - size_t digit_cnt = ss_get_long (s, &value); + size_t n_digits = ss_get_long (s, &value); enum date_token token = 0; - if (ss_match_char (s, settings_get_decimal_char (FMT_F)) + if (ss_match_byte (s, settings_get_fmt_settings ()->decimal) && tokens_seen & DT_COLON && value <= 59) { @@ -685,13 +703,13 @@ parse_date_number (struct substring *s, enum date_token tokens_seen, else token = DT_DAY_COUNT; - if (digit_cnt == 2) + if (n_digits == 2) { token |= DT_YEAR; if (value <= 59) token |= DT_MINUTE | DT_SECOND; } - else if (digit_cnt == 4) + else if (n_digits == 4) token |= DT_YEAR; }