X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fformat-guesser.c;h=f2f5eb6aee6eea864f1a15a2e8e5f21b5e38c27c;hb=2053ecc53ff35591cdba70cf20d43d16ad680367;hp=78af8dfa269ed3ee70ff35782799cdddcc973d93;hpb=3e3d825afe59ad43699664a74ca04e2e1b836786;p=pspp diff --git a/src/data/format-guesser.c b/src/data/format-guesser.c index 78af8dfa26..f2f5eb6aee 100644 --- a/src/data/format-guesser.c +++ b/src/data/format-guesser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,19 +16,19 @@ #include -#include "format-guesser.h" +#include "data/format-guesser.h" #include #include -#include "c-ctype.h" -#include "minmax.h" -#include "xalloc.h" +#include "data/format.h" +#include "data/settings.h" +#include "libpspp/assertion.h" +#include "libpspp/str.h" -#include -#include -#include -#include +#include "gl/c-ctype.h" +#include "gl/minmax.h" +#include "gl/xalloc.h" /* A token in which potential date or time fields are broken. 
@@ -120,6 +120,15 @@ static struct date_syntax syntax[] = 11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} }, + /* yyyy-mmm-dd HH:MM */ + { FMT_YMDHMS, + 9, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR, + DT_COLON, DT_MINUTE} }, + /* yyyy-mmm-dd HH:MM:SS */ + { FMT_YMDHMS, + 11, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR, + DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} }, + /* HH:MM */ { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} }, /* HH:MM:SS */ @@ -135,8 +144,13 @@ static struct date_syntax syntax[] = /* www */ { FMT_WKDAY, 1, {DT_WEEKDAY} }, - /* mmm */ - { FMT_MONTH, 1, {DT_MONTH} }, + /* mmm + + We require a spelled-out English month so that + single-character Roman numerals like "i" and "x" don't get + detected as months. The latter is particularly common in + the password field of /etc/passwd-like files. */ + { FMT_MONTH, 1, {DT_ENGLISH_MONTH} }, }; /* Number of recognized date syntax formats. */ @@ -299,12 +313,12 @@ add_numeric (struct fmt_guesser *g, struct substring s) int c; /* Skip leading "$" and optional following white space. */ - has_dollar = ss_match_char (&s, '$'); + has_dollar = ss_match_byte (&s, '$'); if (has_dollar) ss_ltrim (&s, ss_cstr (CC_SPACES)); /* Skip optional sign. */ - ss_match_char_in (&s, ss_cstr ("+-")); + ss_match_byte_in (&s, ss_cstr ("+-")); /* Skip digits punctuated by commas and dots. We don't know whether the decimal point is a comma or a dot, so for now we @@ -343,10 +357,10 @@ add_numeric (struct fmt_guesser *g, struct substring s) } /* Skip the optional exponent. 
*/ - has_exp = ss_match_char_in (&s, ss_cstr ("eEdD")) != EOF; - has_exp_sign = ss_match_char_in (&s, ss_cstr ("-+")) != EOF; + has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF; + has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF; if (has_exp_sign) - ss_match_char (&s, ' '); + ss_match_byte (&s, ' '); exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS)); if ((has_exp || has_exp_sign) && !exp_digits) { @@ -356,7 +370,7 @@ add_numeric (struct fmt_guesser *g, struct substring s) } /* Skip optional '%'. */ - has_percent = ss_match_char (&s, '%'); + has_percent = ss_match_byte (&s, '%'); if (has_dollar && has_percent) { /* A valid number cannot have both '$' and '%'. */ @@ -452,11 +466,15 @@ guess_numeric (struct fmt_guesser *g, struct fmt_spec *f) } /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR, - MOYR, WKYR, or DATETIME), time (TIME or DTIME), or date - component (WKDAY or MONTH) format. If successful, increments - G's any_date counter and the counter or counters for the - specific format(s) that S matches. On failure, does not - modify G. + MOYR, WKYR, DATETIME, or YMDHMS), time (TIME or DTIME), or + date component (WKDAY or MONTH) format. If successful, + increments G's any_date counter and the counter or counters + for the specific format(s) that S matches. On failure, does + not modify G. + + XXX How can we distinguish MTIME from TIME? One way might be + that TIME can have three parts (HH:MM:SS) but MTIME only ever + has two (MM:SS). Does not attempt to recognize JDATE format: it looks just like F format and will thus be caught by the numeric parser. @@ -566,8 +584,8 @@ guess_date_time (struct fmt_guesser *g, struct fmt_spec *f) (We use the minimum input width, but an output width would be equally appropriate, since all the time formats have the same minimum widths for input and output.) 
*/ - if (f->type == FMT_DATETIME || f->type == FMT_TIME - || f->type == FMT_DTIME) + if (f->type == FMT_DATETIME || f->type == FMT_YMDHMS + || f->type == FMT_MTIME || f->type == FMT_TIME || f->type == FMT_DTIME) { for (i = 0; i < DATE_SYNTAX_CNT; i++) if (g->date[i] @@ -630,7 +648,7 @@ parse_date_token (struct substring *s, enum date_token tokens_seen, ss_advance (s, 1); token = recognize_identifier_token (s); if (token) - ss_match_char_in (s, ss_cstr (CC_SPACES)); + ss_match_byte_in (s, ss_cstr (CC_SPACES)); else token = DT_DELIM | DT_SPACE; return token; @@ -661,7 +679,7 @@ parse_date_number (struct substring *s, enum date_token tokens_seen, size_t digit_cnt = ss_get_long (s, &value); enum date_token token = 0; - if (ss_match_char (s, settings_get_decimal_char (FMT_F)) + if (ss_match_byte (s, settings_get_decimal_char (FMT_F)) && tokens_seen & DT_COLON && value <= 59) {