pintos-os.org Git - pspp/blob - src/data/format-guesser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "data/format-guesser.h"
  20
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #include "data/format.h"
  25 #include "data/settings.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/str.h"
  28
  29 #include "gl/c-ctype.h"
  30 #include "gl/minmax.h"
  31 #include "gl/xalloc.h"
  32
  33 /* A token in which potential date or time fields are broken.
  34
  35    The token type is actually a bit-map.  This allows a single
  36    token to represent multiple roles, as often happens in parsing
  37    adate or a time.  For example, the number "1" can be a quarter
  38    of the year, month, hour, day of the month, week of the year,
  39    or a count of days.  Such ambiguities are resolved on the
  40    higher-level bases of multiple tokens and multiple full
  41    dates. */
  42 enum date_token
  43   {
  44     DT_DAY = 1 << 0,            /* dd: Day of the month. */
  45     DT_MONTH = 1 << 1,          /* mm: Month. */
  46     DT_ENGLISH_MONTH = 1 << 2,  /* mmm: Spelled-out month, e.g. "jan". */
  47     DT_YEAR = 1 << 3,           /* yy: Year. */
  48
  49     DT_HOUR = 1 << 4,           /* HH: Hour. */
  50     DT_MINUTE = 1 << 5,         /* MM: Minute. */
  51     DT_SECOND = 1 << 6,         /* SS: Second. */
  52
  53     DT_WEEKDAY = 1 << 7,        /* www: Day of the week. */
  54
  55     DT_DAY_COUNT = 1 << 8,      /* D: Number of days. */
  56     DT_WEEK = 1 << 9,           /* ww: Week of the year. */
  57     DT_QUARTER = 1 << 10,       /* q: Quarter of the year. */
  58
  59     DT_Q = 1 << 11,             /* Literal "Q". */
  60     DT_WK = 1 << 12,            /* Literal "WK". */
  61
  62     DT_DELIM = 1 << 13,         /* One of -/., or white space. */
  63     DT_SPACE = 1 << 14,         /* Any white space. */
  64     DT_COLON = 1 << 15,         /* : */
  65   };
  66
  67 /* Syntax of a date format, in terms of the date tokens that
  68    compose it.*/
  69 struct date_syntax
  70   {
  71     enum fmt_type format;       /* Format type. */
  72 #define MAX_TOKENS 11
  73     size_t n_tokens;           /* Number of tokens. */
  74     enum date_token tokens[MAX_TOKENS]; /* Tokens. */
  75   };
  76
  77 /* Syntax of all the data formats that we can parse.
  78
  79    The order in the array can make a difference in the final
  80    choice of formats: in the case of a tie between the number of
  81    times each format is seen, the syntax earlier in the array
  82    takes precedence.  The most important cases are the ordering
  83    of DATE before EDATE, so that spelled-out months in input
  84    yield DATE format (that produces spelled-out months in output,
  85    and the ordering of EDATE before ADATE, so that ambiguous
  86    dates such as "1/1/99" yield the more sensible European date
  87    format instead of American format.
  88
  89    When a given date format has more than one syntax, they must
  90    be in adjacent array elements. */
  91 static struct date_syntax syntax[] =
  92   {
  93     /* dd-mmm-yy */
  94     { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
  95
  96     /* dd.mm.yy */
  97     { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
  98
  99     /* mm/dd/yy */
 100     { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
 101
 102     /* yy/mm/dd */
 103     { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
 104
 105     /* mmm yy */
 106     { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
 107
 108     /* q Q yy */
 109     { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
 110
 111     /* ww WK yy */
 112     { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
 113
 114     /* dd-mmm-yyyy HH:MM */
 115     { FMT_DATETIME,
 116       9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 117           DT_COLON, DT_MINUTE} },
 118     /* dd-mmm-yyyy HH:MM:SS */
 119     { FMT_DATETIME,
 120       11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 121            DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 122
 123     /* yyyy-dd-mmm HH:MM */
 124     { FMT_YMDHMS,
 125       9, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
 126           DT_COLON, DT_MINUTE} },
 127     /* yyyy-dd-mmm HH:MM:SS */
 128     { FMT_YMDHMS,
 129       11, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
 130            DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 131
 132     /* HH:MM */
 133     { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
 134     /* HH:MM:SS */
 135     { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 136
 137     /* D HH:MM */
 138     { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
 139     /* D HH:MM:SS */
 140     { FMT_DTIME,
 141       7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
 142           DT_SECOND} },
 143
 144     /* www */
 145     { FMT_WKDAY, 1, {DT_WEEKDAY} },
 146
 147     /* mmm
 148
 149        We require a spelled-out English month so that
 150        single-character Roman numerals like "i" and "x" don't get
 151        detected as months.  The latter is particularly common in
 152        the password field of /etc/passwd-like files. */
 153     { FMT_MONTH, 1, {DT_ENGLISH_MONTH} },
 154   };
 155
 156 /* Number of recognized date syntax formats. */
 157 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
 158
 159 /* A format guesser. */
 160 struct fmt_guesser
 161   {
 162     /* Maximum observed input width. */
 163     unsigned int width;
 164
 165     /* Sum of the digits after the decimal point in each input
 166        (divide by count to obtain average decimal positions). */
 167     unsigned int decimals;
 168
 169     /* Number of non-empty, non-missing input values.
 170
 171        count is the sum of any_numeric, any_date, and the number
 172        of inputs that were not in any recognized format (hence,
 173        treated as A format). */
 174     unsigned int count;
 175
 176     /* Numeric input formats. */
 177     unsigned int any_numeric;   /* Sum of following counts. */
 178     unsigned int f;             /* Number of inputs in F format. */
 179     unsigned int comma;         /* Number of inputs in COMMA format. */
 180     unsigned int dot;           /* Number of inputs in DOT format. */
 181     unsigned int dollar;        /* Number of inputs in DOLLAR format. */
 182     unsigned int pct;           /* Number of inputs in PCT format. */
 183     unsigned int e;             /* Number of inputs in E format. */
 184
 185     /* Date or time input formats.
 186
 187        The sum of the values in the date array is at least
 188        any_date, often higher because many example dates match
 189        more than one date format. */
 190     unsigned int any_date;      /* Number of inputs in any date format. */
 191     unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
 192                                            format. */
 193   };
 194
 195 static bool add_numeric (struct fmt_guesser *, struct substring);
 196 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
 197 static void add_date_time (struct fmt_guesser *, struct substring);
 198 static bool match_date_syntax (const enum date_token a[], size_t a_len,
 199                                const enum date_token b[], size_t b_len);
 200 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
 201 static enum date_token parse_date_token (struct substring *,
 202                                          enum date_token tokens_seen,
 203                                          int *decimals);
 204 static enum date_token parse_date_number (struct substring *,
 205                                           enum date_token tokens_seen,
 206                                           int *decimals);
 207 static enum date_token recognize_identifier_token (struct substring *);
 208 static enum date_token recognize_id2 (int s0, int s1, bool more);
 209 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
 210
 211 /* Creates and returns a new format guesser. */
 212 struct fmt_guesser *
 213 fmt_guesser_create (void)
 214 {
 215   struct fmt_guesser *g = xmalloc (sizeof *g);
 216   fmt_guesser_clear (g);
 217   return g;
 218 }
 219
 220 /* Destroys format guesser G. */
 221 void
 222 fmt_guesser_destroy (struct fmt_guesser *g)
 223 {
 224   free (g);
 225 }
 226
 227 /* Clears the state of format guesser G, making it available for
 228    guessing the format of a new input stream.  */
 229 void
 230 fmt_guesser_clear (struct fmt_guesser *g)
 231 {
 232   memset (g, 0, sizeof *g);
 233 }
 234
 235 /* Appends S to the stream of data items whose format G is
 236    guessing. */
 237 void
 238 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
 239 {
 240   if (ss_length (s) > g->width)
 241     g->width = ss_length (s);
 242   ss_trim (&s, ss_cstr (CC_SPACES));
 243   if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
 244     {
 245       /* Can't guess anything from an empty string or a missing value. */
 246       return;
 247     }
 248
 249   g->count++;
 250   if (!add_numeric (g, s))
 251     add_date_time (g, s);
 252 }
 253
 254 /* Returns a guess about the format of the input previously added to G using
 255    fmt_guesser_add().  The guessed format may not actually a valid input or
 256    output format, in that its width and number of decimal places may be outside
 257    the valid range for the guessed format type.  The caller must therefore
 258    adjust the format to make it valid, e.g. by calling fmt_fix(). */
 259 struct fmt_spec
 260 fmt_guesser_guess (struct fmt_guesser *g)
 261 {
 262   if (g->count > 0)
 263     {
 264       /* Set defaults.  The guesser functions typically override
 265          the width and type. */
 266       struct fmt_spec f = { .type = FMT_A, .w = g->width };
 267
 268       if (g->any_numeric > g->count / 2)
 269         guess_numeric (g, &f);
 270       else if (g->any_date > g->count / 2)
 271         guess_date_time (g, &f);
 272
 273       return f;
 274     }
 275   else
 276     {
 277       /* No data at all.  Use fallback default. */
 278       return fmt_default_for_width (0);
 279     }
 280 }
 281 \f
 282 /* Numeric formats. */
 283
 284 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
 285    E) format.  If successful, increments G's any_numeric counter
 286    and the counter for the specific format S that S matches and
 287    returns true.  On failure, returns false without modifying G.
 288
 289    This function is intended to match exactly the same set of
 290    strings that the actual numeric value parsers used by the
 291    data_in function would match. */
 292 static bool
 293 add_numeric (struct fmt_guesser *g, struct substring s)
 294 {
 295   bool has_dollar;              /* '$' appeared at start of S? */
 296   bool has_percent;             /* '%' appeared at end of S? */
 297   int digits;                   /* Number of digits in S (before exponent). */
 298   int dots;                     /* Number of '.' in S. */
 299   int commas;                   /* Number of ',' in S. */
 300   bool has_exp;                 /* [eEdD] appeared introducing exponent? */
 301   bool has_exp_sign;            /* '+' or '-' appeared in exponent? */
 302   int exp_digits;               /* Number of digits in exponent. */
 303
 304   int prev_delim;       /* Initially 0, then ',' or '.' as delimiters seen. */
 305   int delim_digits;             /* Number of digits since last delimiter. */
 306
 307   int decimal;                  /* Decimal point character: '.', ',',
 308                                    or 0 if unknown or no decimal point in S. */
 309   int precision;                /* Digits of precision after decimal point. */
 310
 311   int c;
 312
 313   /* Skip leading "$" and optional following white space. */
 314   has_dollar = ss_match_byte (&s, '$');
 315   if (has_dollar)
 316     ss_ltrim (&s, ss_cstr (CC_SPACES));
 317
 318   /* Skip optional sign. */
 319   ss_match_byte_in (&s, ss_cstr ("+-"));
 320
 321   /* Skip digits punctuated by commas and dots.  We don't know
 322      whether the decimal point is a comma or a dot, so for now we
 323      just count them.  */
 324   digits = dots = commas = 0;
 325   delim_digits = 0;
 326   prev_delim = 0;
 327   for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
 328     {
 329       if (c >= '0' && c <= '9')
 330         {
 331           digits++;
 332           if (dots || commas)
 333             delim_digits++;
 334         }
 335       else if (c == '.')
 336         {
 337           dots++;
 338           prev_delim = c;
 339           delim_digits = 0;
 340         }
 341       else if (c == ',')
 342         {
 343           commas++;
 344           prev_delim = c;
 345           delim_digits = 0;
 346         }
 347       else
 348         break;
 349     }
 350   if (digits == 0 || (dots > 1 && commas > 1))
 351     {
 352       /* A valid number has at least one digit and can't have
 353          more than one decimal point. */
 354       return false;
 355     }
 356
 357   /* Skip the optional exponent. */
 358   has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF;
 359   has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF;
 360   if (has_exp_sign)
 361     ss_match_byte (&s, ' ');
 362   exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
 363   if ((has_exp || has_exp_sign) && !exp_digits)
 364     {
 365       /* Can't have the E or sign that leads in the exponent
 366          without actually having an exponent. */
 367       return false;
 368     }
 369
 370   /* Skip optional '%'. */
 371   has_percent = ss_match_byte (&s, '%');
 372   if (has_dollar && has_percent)
 373     {
 374       /* A valid number cannot have both '$' and '%'. */
 375       return false;
 376     }
 377
 378   /* Make sure there's no trailing garbage. */
 379   if (!ss_is_empty (s))
 380     return false;
 381
 382   /* Figure out the decimal point (and therefore grouping)
 383      character and the number of digits following the decimal
 384      point.  Sometimes the answer is ambiguous. */
 385   if (dots > 1 && prev_delim == '.')
 386     {
 387       /* Can't have multiple decimal points, so '.' must really
 388          be the grouping character, with a precision of 0. */
 389       decimal = ',';
 390       precision = 0;
 391     }
 392   else if (commas > 1 && prev_delim == ',')
 393     {
 394       /* Can't have multiple decimal points, so ',' must really
 395          be the grouping character, with a precision of 0. */
 396       decimal = '.';
 397       precision = 0;
 398     }
 399   else if (delim_digits == 3 && (!dots || !commas))
 400     {
 401       /* The input is something like "1.234" or "1,234" where we
 402          can't tell whether the ',' or '.' is a grouping or
 403          decimal character.  Assume that the decimal character
 404          from the settings is in use. */
 405       if (prev_delim == settings_get_fmt_settings ()->decimal)
 406         {
 407           decimal = prev_delim;
 408           precision = delim_digits;
 409         }
 410       else
 411         {
 412           decimal = prev_delim == '.' ? ',' : '.';
 413           precision = 0;
 414         }
 415     }
 416   else
 417     {
 418       /* The final delimiter is a decimal point, and the digits
 419          following it are decimals. */
 420       decimal = prev_delim;
 421       precision = delim_digits;
 422     }
 423
 424   /* Decide the most likely format. */
 425   g->any_numeric++;
 426   g->decimals += precision;
 427   if (has_dollar)
 428     g->dollar++;
 429   else if (has_percent)
 430     g->pct++;
 431   else if (commas && decimal == '.')
 432     g->comma++;
 433   else if (dots && decimal == ',')
 434     g->dot++;
 435   else if (has_exp || has_exp_sign)
 436     g->e++;
 437   else
 438     g->f++;
 439
 440   return true;
 441 }
 442
 443 /* Guess which numeric format is most likely represented by G,
 444    and store it in F's type and d members.  (f->w is already
 445    initialized.) */
 446 static void
 447 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
 448 {
 449   int decimal_char = settings_get_fmt_settings ()->decimal;
 450
 451   f->d = g->decimals / g->count;
 452   if (g->pct)
 453     f->type = FMT_PCT;
 454   else if (g->dollar)
 455     f->type = FMT_DOLLAR;
 456   else if (g->comma > g->dot)
 457     f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
 458   else if (g->dot > g->comma)
 459     f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
 460   else if (g->e > g->any_numeric / 2)
 461     f->type = FMT_E;
 462   else
 463     f->type = FMT_F;
 464 }
 465 \f
 466 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
 467    MOYR, WKYR, DATETIME, or YMDHMS), time (TIME or DTIME), or
 468    date component (WKDAY or MONTH) format.  If successful,
 469    increments G's any_date counter and the counter or counters
 470    for the specific format(s) that S matches.  On failure, does
 471    not modify G.
 472
 473    XXX How can we distinguish MTIME from TIME?  One way might be
 474    that TIME can have three parts (HH:MM:SS) but MTIME only ever
 475    has two (MM:SS).
 476
 477    Does not attempt to recognize JDATE format: it looks just like
 478    F format and will thus be caught by the numeric parser.
 479
 480    This function is intended to match a set of strings close to
 481    those that actual date and time parsers used by the data_in
 482    function would match, but somewhat pickier.  In particular,
 483    minutes and seconds are only recognized when they have exactly
 484    two digits: "1:02:03" is a valid time, but "1:2:3" is
 485    rejected.  */
 486 static void
 487 add_date_time (struct fmt_guesser *g, struct substring s)
 488 {
 489   enum date_token token;
 490   enum date_token tokens[MAX_TOKENS];
 491   enum date_token tokens_seen;
 492   size_t n_tokens;
 493   int decimals;
 494   bool is_date;
 495   int i;
 496
 497   /* Break S into tokens. */
 498   n_tokens = 0;
 499   tokens_seen = 0;
 500   decimals = 0;
 501   while (!ss_is_empty (s))
 502     {
 503       if (n_tokens >= MAX_TOKENS)
 504         return;
 505
 506       token = parse_date_token (&s, tokens_seen, &decimals);
 507       if (token == 0)
 508         return;
 509       tokens[n_tokens++] = token;
 510       tokens_seen |= token;
 511     }
 512   if (n_tokens == 0)
 513     return;
 514
 515   /* Find matching date formats, if any, and increment the
 516      counter for each one of them. */
 517   is_date = false;
 518   for (i = 0; i < DATE_SYNTAX_CNT; i++)
 519     {
 520       struct date_syntax *s = &syntax[i];
 521       if (match_date_syntax (tokens, n_tokens, s->tokens, s->n_tokens))
 522         {
 523           is_date = true;
 524           g->date[i]++;
 525         }
 526     }
 527   if (is_date)
 528     {
 529       g->any_date++;
 530       g->decimals += decimals;
 531     }
 532 }
 533
 534 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
 535    in B[], false otherwise. */
 536 static bool
 537 match_date_syntax (const enum date_token a[], size_t a_len,
 538                    const enum date_token b[], size_t b_len)
 539 {
 540   size_t i;
 541
 542   if (a_len != b_len)
 543     return false;
 544
 545   for (i = 0; i < a_len; i++)
 546     if (!(a[i] & b[i]))
 547       return false;
 548
 549   return true;
 550 }
 551
 552 /* Guess which date or time format is most likely represented by
 553    G, and store it in F's type and d members.  (f->w is already
 554    initialized.) */
 555 static void
 556 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
 557 {
 558   unsigned int max = 0;
 559   int i, j;
 560
 561   /* Choose the date format matched by the most inputs.  Break
 562      ties by choosing the earliest in the array. */
 563   for (i = 0; i < DATE_SYNTAX_CNT; i = j)
 564     {
 565       unsigned int sum = g->date[i];
 566       for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
 567         {
 568           if (syntax[i].format != syntax[j].format)
 569             break;
 570           sum += g->date[j];
 571         }
 572       if (sum > max)
 573         {
 574           f->type = syntax[i].format;
 575           max = sum;
 576         }
 577     }
 578
 579   /* Formats that include a time have an optional seconds field.
 580      If we saw a seconds field in any of the inputs, make sure
 581      that the field width is large enough to include for them.
 582      (We use the minimum input width, but an output width would
 583      be equally appropriate, since all the time formats have the
 584      same minimum widths for input and output.)  */
 585   if (f->type == FMT_DATETIME || f->type == FMT_YMDHMS
 586       || f->type == FMT_MTIME || f->type == FMT_TIME || f->type == FMT_DTIME)
 587     {
 588       for (i = 0; i < DATE_SYNTAX_CNT; i++)
 589         if (g->date[i]
 590             && syntax[i].tokens[syntax[i].n_tokens - 1] == DT_SECOND)
 591           {
 592             f->d = g->decimals / g->count;
 593             f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
 594           }
 595     }
 596 }
 597
 598 /* Extracts the next date token from the string represented by S,
 599    which must not be an empty string, and advances *S past the
 600    end of the token.  Returns the parsed token, or 0 if no valid
 601    token was found.
 602
 603    TOKENS_SEEN should be a bitmap representing all the tokens
 604    already seen in this input; this is used to resolve some
 605    otherwise ambiguous parsing situation.  If a count of seconds
 606    is parsed, *DECIMALS is set to the number of digits after the
 607    decimal point.  */
 608 static enum date_token
 609 parse_date_token (struct substring *s, enum date_token tokens_seen,
 610                   int *decimals)
 611 {
 612   int c = ss_first (*s);
 613
 614   switch (c)
 615     {
 616     case '0': case '1': case '2': case '3': case '4':
 617     case '5': case '6': case '7': case '8': case '9':
 618       return parse_date_number (s, tokens_seen, decimals);
 619
 620     case '+':
 621     case '-':
 622       /* '+' or '-' at the start of a string, or following a
 623          space, could be the sign that optionally introduces a
 624          time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
 625          format, or "1/1/1978 +1:00" in DATETIME format. */
 626       if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
 627         {
 628           ss_advance (s, 1);
 629           ss_ltrim (s, ss_cstr (CC_DIGITS));
 630           return DT_DAY_COUNT | DT_HOUR;
 631         }
 632       else if (c == '+')
 633         return 0;
 634       /* Fall through. */
 635     case '/': case '.': case ',':
 636       ss_advance (s, 1);
 637       return DT_DELIM;
 638
 639     case ':':
 640       ss_advance (s, 1);
 641       return DT_COLON;
 642
 643     case ' ': case '\t': case '\v': case '\r': case '\n':
 644       {
 645         enum date_token token;
 646         ss_advance (s, 1);
 647         token = recognize_identifier_token (s);
 648         if (token)
 649           ss_match_byte_in (s, ss_cstr (CC_SPACES));
 650         else
 651           token = DT_DELIM | DT_SPACE;
 652         return token;
 653       }
 654
 655     default:
 656       return recognize_identifier_token (s);
 657
 658     case EOF:
 659       NOT_REACHED ();
 660     }
 661 }
 662
 663 /* Parses a digit sequence found in a date token.  Advances *S
 664    past the end of the token.  Returns the parsed token, or 0 if
 665    no valid token was found.
 666
 667    TOKENS_SEEN should be a bitmap representing all the tokens
 668    already seen in this input; this is used to resolve some
 669    otherwise ambiguous parsing situation.  If a count of seconds
 670    is parsed, *DECIMALS is set to the number of digits after the
 671    decimal point.*/
 672 static enum date_token
 673 parse_date_number (struct substring *s, enum date_token tokens_seen,
 674                    int *decimals)
 675 {
 676   long int value;
 677   size_t n_digits = ss_get_long (s, &value);
 678   enum date_token token = 0;
 679
 680   if (ss_match_byte (s, settings_get_fmt_settings ()->decimal)
 681       && tokens_seen & DT_COLON
 682       && value <= 59)
 683     {
 684       /* Parse digits after the decimal point. */
 685       token = DT_SECOND;
 686       *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
 687     }
 688   else
 689     {
 690       if (value <= 4)
 691         token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
 692                  | DT_DAY_COUNT);
 693       else if (value <= 12)
 694         token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 695       else if (value <= 23)
 696         token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 697       else if (value <= 31)
 698         token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
 699       else if (value <= 52)
 700         token = DT_WEEK | DT_DAY_COUNT;
 701       else
 702         token = DT_DAY_COUNT;
 703
 704       if (n_digits == 2)
 705         {
 706           token |= DT_YEAR;
 707           if (value <= 59)
 708             token |= DT_MINUTE | DT_SECOND;
 709         }
 710       else if (n_digits == 4)
 711         token |= DT_YEAR;
 712     }
 713
 714   return token;
 715 }
 716
 717 /* Attempts to parse an identifier found in a date at the
 718    beginning of S.  Advances *S past the end of the token.
 719    Returns the parsed token, or 0 if no valid token was
 720    found.  */
 721 static enum date_token
 722 recognize_identifier_token (struct substring *s)
 723 {
 724   size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
 725   enum date_token token = 0;
 726   switch (length)
 727     {
 728     case 0:
 729       break;
 730
 731     case 1:
 732       switch (c_tolower (s->string[0]))
 733         {
 734         case 'i':
 735         case 'v':
 736         case 'x':
 737           token = DT_MONTH;
 738           break;
 739
 740         case 'q':
 741           token = DT_Q;
 742           break;
 743         }
 744       break;
 745
 746     case 2:
 747       {
 748         int s0 = c_tolower ((unsigned char) s->string[0]);
 749         int s1 = c_tolower ((unsigned char) s->string[1]);
 750         token = recognize_id2 (s0, s1, false);
 751         if (!token && s0 == 'w' && s1 == 'k')
 752           token = DT_WK;
 753       }
 754       break;
 755
 756     default:
 757       {
 758         int s0 = c_tolower ((unsigned char) s->string[0]);
 759         int s1 = c_tolower ((unsigned char) s->string[1]);
 760         int s2 = c_tolower ((unsigned char) s->string[2]);
 761         token = recognize_id2 (s0, s1, true);
 762         if (!token)
 763           token = recognize_id3 (s0, s1, s2, length > 3);
 764         if (!token && length == 4)
 765           {
 766             int s3 = c_tolower ((unsigned char) s->string[3]);
 767             if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
 768               token = DT_MONTH;
 769           }
 770       }
 771       break;
 772     }
 773   if (token)
 774     ss_advance (s, length);
 775   return token;
 776 }
 777
 778 static enum date_token
 779 recognize_id2 (int s0, int s1, bool more)
 780 {
 781   bool weekday;
 782   switch (s0)
 783     {
 784     case 's': weekday = s1 == 'a' || s1 == 'u'; break;
 785     case 'm': weekday = s1 == 'o'; break;
 786     case 't': weekday = s1 == 'u' || s1 == 'h'; break;
 787     case 'w': weekday = s1 == 'e'; break;
 788     case 'f': weekday = s1 == 'r'; break;
 789     default: weekday = false; break;
 790     }
 791   if (weekday)
 792     return DT_WEEKDAY;
 793
 794   if (!more)
 795     {
 796       bool month;
 797       switch (s0)
 798         {
 799         case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
 800         case 'v': month = s1 == 'i'; break;
 801         case 'x': month = s1 == 'i'; break;
 802         default: month = false; break;
 803         }
 804       if (month)
 805         return DT_MONTH;
 806     }
 807
 808   return 0;
 809 }
 810
 811 static enum date_token
 812 recognize_id3 (int s0, int s1, int s2, bool more)
 813 {
 814   bool month;
 815   switch (s0)
 816     {
 817     case 'j':
 818       month = ((s1 == 'a' && s2 == 'n')
 819                || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
 820       break;
 821     case 'f':
 822       month = s1 == 'e' && s2 == 'b';
 823       break;
 824     case 'm':
 825       month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
 826       break;
 827     case 'a':
 828       month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
 829       break;
 830     case 's':
 831       month = s1 == 'e' && s2 == 'p';
 832       break;
 833     case 'o':
 834       month = s1 == 'c' && s2 == 't';
 835       break;
 836     case 'n':
 837       month = s1 == 'o' && s2 == 'v';
 838       break;
 839     case 'd':
 840       month = s1 == 'e' && s2 == 'c';
 841       break;
 842     default:
 843       month = false;
 844     }
 845   if (month)
 846     return DT_MONTH | DT_ENGLISH_MONTH;
 847
 848   if (!more)
 849     {
 850       bool roman_month = false;
 851       switch (s0)
 852         {
 853         case 'i':
 854         case 'x':
 855           roman_month = s1 == 'i' && s2 == 'i';
 856           break;
 857         case 'v':
 858           roman_month = s1 == 'i' && s2 == 'i';
 859           break;
 860         }
 861       if (roman_month)
 862         return DT_MONTH;
 863     }
 864
 865   return 0;
 866 }
 867
 868
 869
 870