1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/format-guesser.h"
24 #include "data/format.h"
25 #include "data/settings.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/str.h"
29 #include "gl/c-ctype.h"
30 #include "gl/minmax.h"
31 #include "gl/xalloc.h"
33 /* A token in which potential date or time fields are broken.
35 The token type is actually a bit-map. This allows a single
36 token to represent multiple roles, as often happens in parsing
37 adate or a time. For example, the number "1" can be a quarter
38 of the year, month, hour, day of the month, week of the year,
39 or a count of days. Such ambiguities are resolved on the
40 higher-level bases of multiple tokens and multiple full
44 DT_DAY = 1 << 0, /* dd: Day of the month. */
45 DT_MONTH = 1 << 1, /* mm: Month. */
46 DT_ENGLISH_MONTH = 1 << 2, /* mmm: Spelled-out month, e.g. "jan". */
47 DT_YEAR = 1 << 3, /* yy: Year. */
49 DT_HOUR = 1 << 4, /* HH: Hour. */
50 DT_MINUTE = 1 << 5, /* MM: Minute. */
51 DT_SECOND = 1 << 6, /* SS: Second. */
53 DT_WEEKDAY = 1 << 7, /* www: Day of the week. */
55 DT_DAY_COUNT = 1 << 8, /* D: Number of days. */
56 DT_WEEK = 1 << 9, /* ww: Week of the year. */
57 DT_QUARTER = 1 << 10, /* q: Quarter of the year. */
59 DT_Q = 1 << 11, /* Literal "Q". */
60 DT_WK = 1 << 12, /* Literal "WK". */
62 DT_DELIM = 1 << 13, /* One of -/., or white space. */
63 DT_SPACE = 1 << 14, /* Any white space. */
64 DT_COLON = 1 << 15, /* : */
67 /* Syntax of a date format, in terms of the date tokens that
71 enum fmt_type format; /* Format type. */
73 size_t n_tokens; /* Number of tokens. */
74 enum date_token tokens[MAX_TOKENS]; /* Tokens. */
77 /* Syntax of all the data formats that we can parse.
79 The order in the array can make a difference in the final
80 choice of formats: in the case of a tie between the number of
81 times each format is seen, the syntax earlier in the array
82 takes precedence. The most important cases are the ordering
83 of DATE before EDATE, so that spelled-out months in input
84 yield DATE format (that produces spelled-out months in output,
85 and the ordering of EDATE before ADATE, so that ambiguous
86 dates such as "1/1/99" yield the more sensible European date
87 format instead of American format.
89 When a given date format has more than one syntax, they must
90 be in adjacent array elements. */
91 static struct date_syntax syntax[] =
94 { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
97 { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
100 { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
103 { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
106 { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
109 { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
112 { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
114 /* dd-mmm-yyyy HH:MM */
116 9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
117 DT_COLON, DT_MINUTE} },
118 /* dd-mmm-yyyy HH:MM:SS */
120 11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
121 DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
123 /* yyyy-dd-mmm HH:MM */
125 9, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
126 DT_COLON, DT_MINUTE} },
127 /* yyyy-dd-mmm HH:MM:SS */
129 11, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
130 DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
133 { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
135 { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
138 { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
141 7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
145 { FMT_WKDAY, 1, {DT_WEEKDAY} },
149 We require a spelled-out English month so that
150 single-character Roman numerals like "i" and "x" don't get
151 detected as months. The latter is particularly common in
152 the password field of /etc/passwd-like files. */
153 { FMT_MONTH, 1, {DT_ENGLISH_MONTH} },
156 /* Number of recognized date syntax formats. */
157 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
159 /* A format guesser. */
162 /* Maximum observed input width. */
165 /* Sum of the digits after the decimal point in each input
166 (divide by count to obtain average decimal positions). */
167 unsigned int decimals;
169 /* Number of non-empty, non-missing input values.
171 count is the sum of any_numeric, any_date, and the number
172 of inputs that were not in any recognized format (hence,
173 treated as A format). */
176 /* Numeric input formats. */
177 unsigned int any_numeric; /* Sum of following counts. */
178 unsigned int f; /* Number of inputs in F format. */
179 unsigned int comma; /* Number of inputs in COMMA format. */
180 unsigned int dot; /* Number of inputs in DOT format. */
181 unsigned int dollar; /* Number of inputs in DOLLAR format. */
182 unsigned int pct; /* Number of inputs in PCT format. */
183 unsigned int e; /* Number of inputs in E format. */
185 /* Date or time input formats.
187 The sum of the values in the date array is at least
188 any_date, often higher because many example dates match
189 more than one date format. */
190 unsigned int any_date; /* Number of inputs in any date format. */
191 unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
195 static bool add_numeric (struct fmt_guesser *, struct substring);
196 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
197 static void add_date_time (struct fmt_guesser *, struct substring);
198 static bool match_date_syntax (const enum date_token a[], size_t a_len,
199 const enum date_token b[], size_t b_len);
200 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
201 static enum date_token parse_date_token (struct substring *,
202 enum date_token tokens_seen,
204 static enum date_token parse_date_number (struct substring *,
205 enum date_token tokens_seen,
207 static enum date_token recognize_identifier_token (struct substring *);
208 static enum date_token recognize_id2 (int s0, int s1, bool more);
209 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
211 /* Creates and returns a new format guesser. */
213 fmt_guesser_create (void)
215 struct fmt_guesser *g = xmalloc (sizeof *g);
216 fmt_guesser_clear (g);
220 /* Destroys format guesser G. */
222 fmt_guesser_destroy (struct fmt_guesser *g)
227 /* Clears the state of format guesser G, making it available for
228 guessing the format of a new input stream. */
230 fmt_guesser_clear (struct fmt_guesser *g)
232 memset (g, 0, sizeof *g);
235 /* Appends S to the stream of data items whose format G is
238 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
240 if (ss_length (s) > g->width)
241 g->width = ss_length (s);
242 ss_trim (&s, ss_cstr (CC_SPACES));
243 if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
245 /* Can't guess anything from an empty string or a missing value. */
250 if (!add_numeric (g, s))
251 add_date_time (g, s);
254 /* Returns a guess about the format of the input previously added to G using
255 fmt_guesser_add(). The guessed format may not actually a valid input or
256 output format, in that its width and number of decimal places may be outside
257 the valid range for the guessed format type. The caller must therefore
258 adjust the format to make it valid, e.g. by calling fmt_fix(). */
260 fmt_guesser_guess (struct fmt_guesser *g)
264 /* Set defaults. The guesser functions typically override
265 the width and type. */
266 struct fmt_spec f = { .type = FMT_A, .w = g->width };
268 if (g->any_numeric > g->count / 2)
269 guess_numeric (g, &f);
270 else if (g->any_date > g->count / 2)
271 guess_date_time (g, &f);
277 /* No data at all. Use fallback default. */
278 return fmt_default_for_width (0);
282 /* Numeric formats. */
284 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
285 E) format. If successful, increments G's any_numeric counter
286 and the counter for the specific format S that S matches and
287 returns true. On failure, returns false without modifying G.
289 This function is intended to match exactly the same set of
290 strings that the actual numeric value parsers used by the
291 data_in function would match. */
293 add_numeric (struct fmt_guesser *g, struct substring s)
295 bool has_dollar; /* '$' appeared at start of S? */
296 bool has_percent; /* '%' appeared at end of S? */
297 int digits; /* Number of digits in S (before exponent). */
298 int dots; /* Number of '.' in S. */
299 int commas; /* Number of ',' in S. */
300 bool has_exp; /* [eEdD] appeared introducing exponent? */
301 bool has_exp_sign; /* '+' or '-' appeared in exponent? */
302 int exp_digits; /* Number of digits in exponent. */
304 int prev_delim; /* Initially 0, then ',' or '.' as delimiters seen. */
305 int delim_digits; /* Number of digits since last delimiter. */
307 int decimal; /* Decimal point character: '.', ',',
308 or 0 if unknown or no decimal point in S. */
309 int precision; /* Digits of precision after decimal point. */
313 /* Skip leading "$" and optional following white space. */
314 has_dollar = ss_match_byte (&s, '$');
316 ss_ltrim (&s, ss_cstr (CC_SPACES));
318 /* Skip optional sign. */
319 ss_match_byte_in (&s, ss_cstr ("+-"));
321 /* Skip digits punctuated by commas and dots. We don't know
322 whether the decimal point is a comma or a dot, so for now we
324 digits = dots = commas = 0;
327 for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
329 if (c >= '0' && c <= '9')
350 if (digits == 0 || (dots > 1 && commas > 1))
352 /* A valid number has at least one digit and can't have
353 more than one decimal point. */
357 /* Skip the optional exponent. */
358 has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF;
359 has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF;
361 ss_match_byte (&s, ' ');
362 exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
363 if ((has_exp || has_exp_sign) && !exp_digits)
365 /* Can't have the E or sign that leads in the exponent
366 without actually having an exponent. */
370 /* Skip optional '%'. */
371 has_percent = ss_match_byte (&s, '%');
372 if (has_dollar && has_percent)
374 /* A valid number cannot have both '$' and '%'. */
378 /* Make sure there's no trailing garbage. */
379 if (!ss_is_empty (s))
382 /* Figure out the decimal point (and therefore grouping)
383 character and the number of digits following the decimal
384 point. Sometimes the answer is ambiguous. */
385 if (dots > 1 && prev_delim == '.')
387 /* Can't have multiple decimal points, so '.' must really
388 be the grouping character, with a precision of 0. */
392 else if (commas > 1 && prev_delim == ',')
394 /* Can't have multiple decimal points, so ',' must really
395 be the grouping character, with a precision of 0. */
399 else if (delim_digits == 3 && (!dots || !commas))
401 /* The input is something like "1.234" or "1,234" where we
402 can't tell whether the ',' or '.' is a grouping or
403 decimal character. Assume that the decimal character
404 from the settings is in use. */
405 if (prev_delim == settings_get_fmt_settings ()->decimal)
407 decimal = prev_delim;
408 precision = delim_digits;
412 decimal = prev_delim == '.' ? ',' : '.';
418 /* The final delimiter is a decimal point, and the digits
419 following it are decimals. */
420 decimal = prev_delim;
421 precision = delim_digits;
424 /* Decide the most likely format. */
426 g->decimals += precision;
429 else if (has_percent)
431 else if (commas && decimal == '.')
433 else if (dots && decimal == ',')
435 else if (has_exp || has_exp_sign)
443 /* Guess which numeric format is most likely represented by G,
444 and store it in F's type and d members. (f->w is already
447 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
449 int decimal_char = settings_get_fmt_settings ()->decimal;
451 f->d = g->decimals / g->count;
455 f->type = FMT_DOLLAR;
456 else if (g->comma > g->dot)
457 f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
458 else if (g->dot > g->comma)
459 f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
460 else if (g->e > g->any_numeric / 2)
466 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
467 MOYR, WKYR, DATETIME, or YMDHMS), time (TIME or DTIME), or
468 date component (WKDAY or MONTH) format. If successful,
469 increments G's any_date counter and the counter or counters
470 for the specific format(s) that S matches. On failure, does
473 XXX How can we distinguish MTIME from TIME? One way might be
474 that TIME can have three parts (HH:MM:SS) but MTIME only ever
477 Does not attempt to recognize JDATE format: it looks just like
478 F format and will thus be caught by the numeric parser.
480 This function is intended to match a set of strings close to
481 those that actual date and time parsers used by the data_in
482 function would match, but somewhat pickier. In particular,
483 minutes and seconds are only recognized when they have exactly
484 two digits: "1:02:03" is a valid time, but "1:2:3" is
487 add_date_time (struct fmt_guesser *g, struct substring s)
489 enum date_token token;
490 enum date_token tokens[MAX_TOKENS];
491 enum date_token tokens_seen;
497 /* Break S into tokens. */
501 while (!ss_is_empty (s))
503 if (n_tokens >= MAX_TOKENS)
506 token = parse_date_token (&s, tokens_seen, &decimals);
509 tokens[n_tokens++] = token;
510 tokens_seen |= token;
515 /* Find matching date formats, if any, and increment the
516 counter for each one of them. */
518 for (i = 0; i < DATE_SYNTAX_CNT; i++)
520 struct date_syntax *s = &syntax[i];
521 if (match_date_syntax (tokens, n_tokens, s->tokens, s->n_tokens))
530 g->decimals += decimals;
534 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
535 in B[], false otherwise. */
537 match_date_syntax (const enum date_token a[], size_t a_len,
538 const enum date_token b[], size_t b_len)
545 for (i = 0; i < a_len; i++)
552 /* Guess which date or time format is most likely represented by
553 G, and store it in F's type and d members. (f->w is already
556 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
558 unsigned int max = 0;
561 /* Choose the date format matched by the most inputs. Break
562 ties by choosing the earliest in the array. */
563 for (i = 0; i < DATE_SYNTAX_CNT; i = j)
565 unsigned int sum = g->date[i];
566 for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
568 if (syntax[i].format != syntax[j].format)
574 f->type = syntax[i].format;
579 /* Formats that include a time have an optional seconds field.
580 If we saw a seconds field in any of the inputs, make sure
581 that the field width is large enough to include for them.
582 (We use the minimum input width, but an output width would
583 be equally appropriate, since all the time formats have the
584 same minimum widths for input and output.) */
585 if (f->type == FMT_DATETIME || f->type == FMT_YMDHMS
586 || f->type == FMT_MTIME || f->type == FMT_TIME || f->type == FMT_DTIME)
588 for (i = 0; i < DATE_SYNTAX_CNT; i++)
590 && syntax[i].tokens[syntax[i].n_tokens - 1] == DT_SECOND)
592 f->d = g->decimals / g->count;
593 f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
598 /* Extracts the next date token from the string represented by S,
599 which must not be an empty string, and advances *S past the
600 end of the token. Returns the parsed token, or 0 if no valid
603 TOKENS_SEEN should be a bitmap representing all the tokens
604 already seen in this input; this is used to resolve some
605 otherwise ambiguous parsing situation. If a count of seconds
606 is parsed, *DECIMALS is set to the number of digits after the
608 static enum date_token
609 parse_date_token (struct substring *s, enum date_token tokens_seen,
612 int c = ss_first (*s);
616 case '0': case '1': case '2': case '3': case '4':
617 case '5': case '6': case '7': case '8': case '9':
618 return parse_date_number (s, tokens_seen, decimals);
622 /* '+' or '-' at the start of a string, or following a
623 space, could be the sign that optionally introduces a
624 time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
625 format, or "1/1/1978 +1:00" in DATETIME format. */
626 if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
629 ss_ltrim (s, ss_cstr (CC_DIGITS));
630 return DT_DAY_COUNT | DT_HOUR;
635 case '/': case '.': case ',':
643 case ' ': case '\t': case '\v': case '\r': case '\n':
645 enum date_token token;
647 token = recognize_identifier_token (s);
649 ss_match_byte_in (s, ss_cstr (CC_SPACES));
651 token = DT_DELIM | DT_SPACE;
656 return recognize_identifier_token (s);
663 /* Parses a digit sequence found in a date token. Advances *S
664 past the end of the token. Returns the parsed token, or 0 if
665 no valid token was found.
667 TOKENS_SEEN should be a bitmap representing all the tokens
668 already seen in this input; this is used to resolve some
669 otherwise ambiguous parsing situation. If a count of seconds
670 is parsed, *DECIMALS is set to the number of digits after the
672 static enum date_token
673 parse_date_number (struct substring *s, enum date_token tokens_seen,
677 size_t n_digits = ss_get_long (s, &value);
678 enum date_token token = 0;
680 if (ss_match_byte (s, settings_get_fmt_settings ()->decimal)
681 && tokens_seen & DT_COLON
684 /* Parse digits after the decimal point. */
686 *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
691 token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
693 else if (value <= 12)
694 token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
695 else if (value <= 23)
696 token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
697 else if (value <= 31)
698 token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
699 else if (value <= 52)
700 token = DT_WEEK | DT_DAY_COUNT;
702 token = DT_DAY_COUNT;
708 token |= DT_MINUTE | DT_SECOND;
710 else if (n_digits == 4)
717 /* Attempts to parse an identifier found in a date at the
718 beginning of S. Advances *S past the end of the token.
719 Returns the parsed token, or 0 if no valid token was
721 static enum date_token
722 recognize_identifier_token (struct substring *s)
724 size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
725 enum date_token token = 0;
732 switch (c_tolower (s->string[0]))
748 int s0 = c_tolower ((unsigned char) s->string[0]);
749 int s1 = c_tolower ((unsigned char) s->string[1]);
750 token = recognize_id2 (s0, s1, false);
751 if (!token && s0 == 'w' && s1 == 'k')
758 int s0 = c_tolower ((unsigned char) s->string[0]);
759 int s1 = c_tolower ((unsigned char) s->string[1]);
760 int s2 = c_tolower ((unsigned char) s->string[2]);
761 token = recognize_id2 (s0, s1, true);
763 token = recognize_id3 (s0, s1, s2, length > 3);
764 if (!token && length == 4)
766 int s3 = c_tolower ((unsigned char) s->string[3]);
767 if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
774 ss_advance (s, length);
778 static enum date_token
779 recognize_id2 (int s0, int s1, bool more)
784 case 's': weekday = s1 == 'a' || s1 == 'u'; break;
785 case 'm': weekday = s1 == 'o'; break;
786 case 't': weekday = s1 == 'u' || s1 == 'h'; break;
787 case 'w': weekday = s1 == 'e'; break;
788 case 'f': weekday = s1 == 'r'; break;
789 default: weekday = false; break;
799 case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
800 case 'v': month = s1 == 'i'; break;
801 case 'x': month = s1 == 'i'; break;
802 default: month = false; break;
811 static enum date_token
812 recognize_id3 (int s0, int s1, int s2, bool more)
818 month = ((s1 == 'a' && s2 == 'n')
819 || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
822 month = s1 == 'e' && s2 == 'b';
825 month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
828 month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
831 month = s1 == 'e' && s2 == 'p';
834 month = s1 == 'c' && s2 == 't';
837 month = s1 == 'o' && s2 == 'v';
840 month = s1 == 'e' && s2 == 'c';
846 return DT_MONTH | DT_ENGLISH_MONTH;
850 bool roman_month = false;
855 roman_month = s1 == 'i' && s2 == 'i';
858 roman_month = s1 == 'i' && s2 == 'i';