src/data/format-guesser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "data/format-guesser.h"
  20
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #include "data/format.h"
  25 #include "data/settings.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/str.h"
  28
  29 #include "gl/c-ctype.h"
  30 #include "gl/minmax.h"
  31 #include "gl/xalloc.h"
  32
  33 /* A token in which potential date or time fields are broken.
  34
  35    The token type is actually a bit-map.  This allows a single
  36    token to represent multiple roles, as often happens in parsing
  37    adate or a time.  For example, the number "1" can be a quarter
  38    of the year, month, hour, day of the month, week of the year,
  39    or a count of days.  Such ambiguities are resolved on the
  40    higher-level bases of multiple tokens and multiple full
  41    dates. */
  42 enum date_token
  43   {
  44     DT_DAY = 1 << 0,            /* dd: Day of the month. */
  45     DT_MONTH = 1 << 1,          /* mm: Month. */
  46     DT_ENGLISH_MONTH = 1 << 2,  /* mmm: Spelled-out month, e.g. "jan". */
  47     DT_YEAR = 1 << 3,           /* yy: Year. */
  48
  49     DT_HOUR = 1 << 4,           /* HH: Hour. */
  50     DT_MINUTE = 1 << 5,         /* MM: Minute. */
  51     DT_SECOND = 1 << 6,         /* SS: Second. */
  52
  53     DT_WEEKDAY = 1 << 7,        /* www: Day of the week. */
  54
  55     DT_DAY_COUNT = 1 << 8,      /* D: Number of days. */
  56     DT_WEEK = 1 << 9,           /* ww: Week of the year. */
  57     DT_QUARTER = 1 << 10,       /* q: Quarter of the year. */
  58
  59     DT_Q = 1 << 11,             /* Literal "Q". */
  60     DT_WK = 1 << 12,            /* Literal "WK". */
  61
  62     DT_DELIM = 1 << 13,         /* One of -/., or white space. */
  63     DT_SPACE = 1 << 14,         /* Any white space. */
  64     DT_COLON = 1 << 15,         /* : */
  65   };
  66
  67 /* Syntax of a date format, in terms of the date tokens that
  68    compose it.*/
  69 struct date_syntax
  70   {
  71     enum fmt_type format;       /* Format type. */
  72 #define MAX_TOKENS 11
  73     size_t token_cnt;           /* Number of tokens. */
  74     enum date_token tokens[MAX_TOKENS]; /* Tokens. */
  75   };
  76
  77 /* Syntax of all the data formats that we can parse.
  78
  79    The order in the array can make a difference in the final
  80    choice of formats: in the case of a tie between the number of
  81    times each format is seen, the syntax earlier in the array
  82    takes precedence.  The most important cases are the ordering
  83    of DATE before EDATE, so that spelled-out months in input
  84    yield DATE format (that produces spelled-out months in output,
  85    and the ordering of EDATE before ADATE, so that ambiguous
  86    dates such as "1/1/99" yield the more sensible European date
  87    format instead of American format.
  88
  89    When a given date format has more than one syntax, they must
  90    be in adjacent array elements. */
  91 static struct date_syntax syntax[] =
  92   {
  93     /* dd-mmm-yy */
  94     { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
  95
  96     /* dd.mm.yy */
  97     { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
  98
  99     /* mm/dd/yy */
 100     { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
 101
 102     /* yy/mm/dd */
 103     { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
 104
 105     /* mmm yy */
 106     { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
 107
 108     /* q Q yy */
 109     { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
 110
 111     /* ww WK yy */
 112     { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
 113
 114     /* dd-mmm-yyyy HH:MM */
 115     { FMT_DATETIME,
 116       9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 117           DT_COLON, DT_MINUTE} },
 118     /* dd-mmm-yyyy HH:MM:SS */
 119     { FMT_DATETIME,
 120       11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 121            DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 122
 123     /* HH:MM */
 124     { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
 125     /* HH:MM:SS */
 126     { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 127
 128     /* D HH:MM */
 129     { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
 130     /* D HH:MM:SS */
 131     { FMT_DTIME,
 132       7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
 133           DT_SECOND} },
 134
 135     /* www */
 136     { FMT_WKDAY, 1, {DT_WEEKDAY} },
 137
 138     /* mmm
 139
 140        We require a spelled-out English month so that
 141        single-character Roman numerals like "i" and "x" don't get
 142        detected as months.  The latter is particularly common in
 143        the password field of /etc/passwd-like files. */
 144     { FMT_MONTH, 1, {DT_ENGLISH_MONTH} },
 145   };
 146
 147 /* Number of recognized date syntax formats. */
 148 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
 149
 150 /* A format guesser. */
 151 struct fmt_guesser
 152   {
 153     /* Maximum observed input width. */
 154     unsigned int width;
 155
 156     /* Sum of the digits after the decimal point in each input
 157        (divide by count to obtain average decimal positions). */
 158     unsigned int decimals;
 159
 160     /* Number of non-empty, non-missing input values.
 161
 162        count is the sum of any_numeric, any_date, and the number
 163        of inputs that were not in any recognized format (hence,
 164        treated as A format). */
 165     unsigned int count;
 166
 167     /* Numeric input formats. */
 168     unsigned int any_numeric;   /* Sum of following counts. */
 169     unsigned int f;             /* Number of inputs in F format. */
 170     unsigned int comma;         /* Number of inputs in COMMA format. */
 171     unsigned int dot;           /* Number of inputs in DOT format. */
 172     unsigned int dollar;        /* Number of inputs in DOLLAR format. */
 173     unsigned int pct;           /* Number of inputs in PCT format. */
 174     unsigned int e;             /* Number of inputs in E format. */
 175
 176     /* Date or time input formats.
 177
 178        The sum of the values in the date array is at least
 179        any_date, often higher because many example dates match
 180        more than one date format. */
 181     unsigned int any_date;      /* Number of inputs in any date format. */
 182     unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
 183                                            format. */
 184   };
 185
 186 static bool add_numeric (struct fmt_guesser *, struct substring);
 187 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
 188 static void add_date_time (struct fmt_guesser *, struct substring);
 189 static bool match_date_syntax (const enum date_token a[], size_t a_len,
 190                                const enum date_token b[], size_t b_len);
 191 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
 192 static enum date_token parse_date_token (struct substring *,
 193                                          enum date_token tokens_seen,
 194                                          int *decimals);
 195 static enum date_token parse_date_number (struct substring *,
 196                                           enum date_token tokens_seen,
 197                                           int *decimals);
 198 static enum date_token recognize_identifier_token (struct substring *);
 199 static enum date_token recognize_id2 (int s0, int s1, bool more);
 200 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
 201
 202 /* Creates and returns a new format guesser. */
 203 struct fmt_guesser *
 204 fmt_guesser_create (void)
 205 {
 206   struct fmt_guesser *g = xmalloc (sizeof *g);
 207   fmt_guesser_clear (g);
 208   return g;
 209 }
 210
 211 /* Destroys format guesser G. */
 212 void
 213 fmt_guesser_destroy (struct fmt_guesser *g)
 214 {
 215   free (g);
 216 }
 217
 218 /* Clears the state of format guesser G, making it available for
 219    guessing the format of a new input stream.  */
 220 void
 221 fmt_guesser_clear (struct fmt_guesser *g)
 222 {
 223   memset (g, 0, sizeof *g);
 224 }
 225
 226 /* Appends S to the stream of data items whose format G is
 227    guessing. */
 228 void
 229 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
 230 {
 231   if (ss_length (s) > g->width)
 232     g->width = ss_length (s);
 233   ss_trim (&s, ss_cstr (CC_SPACES));
 234   if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
 235     {
 236       /* Can't guess anything from an empty string or a missing value. */
 237       return;
 238     }
 239
 240   g->count++;
 241   if (!add_numeric (g, s))
 242     add_date_time (g, s);
 243 }
 244
 245 /* Guesses the format of the input previously added to G using
 246    fmt_guesser_add, storing the guess into *F.  The guessed
 247    format may not actually a valid input or output format, in
 248    that its width and number of decimal places may be outside the
 249    valid range for the guessed format type.  The caller must
 250    therefore adjust the format to make it valid, e.g. by calling
 251    fmt_fix. */
 252 void
 253 fmt_guesser_guess (struct fmt_guesser *g, struct fmt_spec *f)
 254 {
 255   if (g->count > 0)
 256     {
 257       /* Set defaults.  The guesser functions typically override
 258          the width and type. */
 259       f->type = FMT_A;
 260       f->w = g->width;
 261       f->d = 0;
 262
 263       if (g->any_numeric > g->count / 2)
 264         guess_numeric (g, f);
 265       else if (g->any_date > g->count / 2)
 266         guess_date_time (g, f);
 267     }
 268   else
 269     {
 270       /* No data at all.  Use fallback default. */
 271       *f = fmt_default_for_width (0);
 272     }
 273 }
 274 \f
 275 /* Numeric formats. */
 276
 277 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
 278    E) format.  If successful, increments G's any_numeric counter
 279    and the counter for the specific format S that S matches and
 280    returns true.  On failure, returns false without modifying G.
 281
 282    This function is intended to match exactly the same set of
 283    strings that the actual numeric value parsers used by the
 284    data_in function would match. */
 285 static bool
 286 add_numeric (struct fmt_guesser *g, struct substring s)
 287 {
 288   bool has_dollar;              /* '$' appeared at start of S? */
 289   bool has_percent;             /* '%' appeared at end of S? */
 290   int digits;                   /* Number of digits in S (before exponent). */
 291   int dots;                     /* Number of '.' in S. */
 292   int commas;                   /* Number of ',' in S. */
 293   bool has_exp;                 /* [eEdD] appeared introducing exponent? */
 294   bool has_exp_sign;            /* '+' or '-' appeared in exponent? */
 295   int exp_digits;               /* Number of digits in exponent. */
 296
 297   int prev_delim;       /* Initially 0, then ',' or '.' as delimiters seen. */
 298   int delim_digits;             /* Number of digits since last delimiter. */
 299
 300   int decimal;                  /* Decimal point character: '.', ',',
 301                                    or 0 if unknown or no decimal point in S. */
 302   int precision;                /* Digits of precision after decimal point. */
 303
 304   int c;
 305
 306   /* Skip leading "$" and optional following white space. */
 307   has_dollar = ss_match_byte (&s, '$');
 308   if (has_dollar)
 309     ss_ltrim (&s, ss_cstr (CC_SPACES));
 310
 311   /* Skip optional sign. */
 312   ss_match_byte_in (&s, ss_cstr ("+-"));
 313
 314   /* Skip digits punctuated by commas and dots.  We don't know
 315      whether the decimal point is a comma or a dot, so for now we
 316      just count them.  */
 317   digits = dots = commas = 0;
 318   delim_digits = 0;
 319   prev_delim = 0;
 320   for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
 321     {
 322       if (c >= '0' && c <= '9')
 323         {
 324           digits++;
 325           if (dots || commas)
 326             delim_digits++;
 327         }
 328       else if (c == '.' )
 329         {
 330           dots++;
 331           prev_delim = c;
 332           delim_digits = 0;
 333         }
 334       else if (c == ',')
 335         {
 336           commas++;
 337           prev_delim = c;
 338           delim_digits = 0;
 339         }
 340       else
 341         break;
 342     }
 343   if (digits == 0 || (dots > 1 && commas > 1))
 344     {
 345       /* A valid number has at least one digit and can't have
 346          more than one decimal point. */
 347       return false;
 348     }
 349
 350   /* Skip the optional exponent. */
 351   has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF;
 352   has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF;
 353   if (has_exp_sign)
 354     ss_match_byte (&s, ' ');
 355   exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
 356   if ((has_exp || has_exp_sign) && !exp_digits)
 357     {
 358       /* Can't have the E or sign that leads in the exponent
 359          without actually having an exponent. */
 360       return false;
 361     }
 362
 363   /* Skip optional '%'. */
 364   has_percent = ss_match_byte (&s, '%');
 365   if (has_dollar && has_percent)
 366     {
 367       /* A valid number cannot have both '$' and '%'. */
 368       return false;
 369     }
 370
 371   /* Make sure there's no trailing garbage. */
 372   if (!ss_is_empty (s))
 373     return false;
 374
 375   /* Figure out the decimal point (and therefore grouping)
 376      character and the number of digits following the decimal
 377      point.  Sometimes the answer is ambiguous. */
 378   if (dots > 1 && prev_delim == '.')
 379     {
 380       /* Can't have multiple decimal points, so '.' must really
 381          be the grouping character, with a precision of 0. */
 382       decimal = ',';
 383       precision = 0;
 384     }
 385   else if (commas > 1 && prev_delim == ',')
 386     {
 387       /* Can't have multiple decimal points, so ',' must really
 388          be the grouping character, with a precision of 0. */
 389       decimal = '.';
 390       precision = 0;
 391     }
 392   else if (delim_digits == 3 && (!dots || !commas))
 393     {
 394       /* The input is something like "1.234" or "1,234" where we
 395          can't tell whether the ',' or '.' is a grouping or
 396          decimal character.  Assume that the decimal character
 397          from the settings is in use. */
 398       if (prev_delim == settings_get_decimal_char (FMT_F))
 399         {
 400           decimal = prev_delim;
 401           precision = delim_digits;
 402         }
 403       else
 404         {
 405           decimal = prev_delim == '.' ? ',' : '.';
 406           precision = 0;
 407         }
 408     }
 409   else
 410     {
 411       /* The final delimiter is a decimal point, and the digits
 412          following it are decimals. */
 413       decimal = prev_delim;
 414       precision = delim_digits;
 415     }
 416
 417   /* Decide the most likely format. */
 418   g->any_numeric++;
 419   g->decimals += precision;
 420   if (has_dollar)
 421     g->dollar++;
 422   else if (has_percent)
 423     g->pct++;
 424   else if (commas && decimal == '.')
 425     g->comma++;
 426   else if (dots && decimal == ',')
 427     g->dot++;
 428   else if (has_exp || has_exp_sign)
 429     g->e++;
 430   else
 431     g->f++;
 432
 433   return true;
 434 }
 435
 436 /* Guess which numeric format is most likely represented by G,
 437    and store it in F's type and d members.  (f->w is already
 438    initialized.) */
 439 static void
 440 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
 441 {
 442   int decimal_char = settings_get_decimal_char (FMT_COMMA);
 443
 444   f->d = g->decimals / g->count;
 445   if (g->pct)
 446     f->type = FMT_PCT;
 447   else if (g->dollar)
 448     f->type = FMT_DOLLAR;
 449   else if (g->comma > g->dot)
 450     f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
 451   else if (g->dot > g->comma)
 452     f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
 453   else if (g->e > g->any_numeric / 2)
 454     f->type = FMT_E;
 455   else
 456     f->type = FMT_F;
 457 }
 458 \f
 459 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
 460    MOYR, WKYR, or DATETIME), time (TIME or DTIME), or date
 461    component (WKDAY or MONTH) format.  If successful, increments
 462    G's any_date counter and the counter or counters for the
 463    specific format(s) that S matches.  On failure, does not
 464    modify G.
 465
 466    Does not attempt to recognize JDATE format: it looks just like
 467    F format and will thus be caught by the numeric parser.
 468
 469    This function is intended to match a set of strings close to
 470    those that actual date and time parsers used by the data_in
 471    function would match, but somewhat pickier.  In particular,
 472    minutes and seconds are only recognized when they have exactly
 473    two digits: "1:02:03" is a valid time, but "1:2:3" is
 474    rejected.  */
 475 static void
 476 add_date_time (struct fmt_guesser *g, struct substring s)
 477 {
 478   enum date_token token;
 479   enum date_token tokens[MAX_TOKENS];
 480   enum date_token tokens_seen;
 481   size_t token_cnt;
 482   int decimals;
 483   bool is_date;
 484   int i;
 485
 486   /* Break S into tokens. */
 487   token_cnt = 0;
 488   tokens_seen = 0;
 489   decimals = 0;
 490   while (!ss_is_empty (s))
 491     {
 492       if (token_cnt >= MAX_TOKENS)
 493         return;
 494
 495       token = parse_date_token (&s, tokens_seen, &decimals);
 496       if (token == 0)
 497         return;
 498       tokens[token_cnt++] = token;
 499       tokens_seen |= token;
 500     }
 501   if (token_cnt == 0)
 502     return;
 503
 504   /* Find matching date formats, if any, and increment the
 505      counter for each one of them. */
 506   is_date = false;
 507   for (i = 0; i < DATE_SYNTAX_CNT; i++)
 508     {
 509       struct date_syntax *s = &syntax[i];
 510       if (match_date_syntax (tokens, token_cnt, s->tokens, s->token_cnt))
 511         {
 512           is_date = true;
 513           g->date[i]++;
 514         }
 515     }
 516   if (is_date)
 517     {
 518       g->any_date++;
 519       g->decimals += decimals;
 520     }
 521 }
 522
 523 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
 524    in B[], false otherwise. */
 525 static bool
 526 match_date_syntax (const enum date_token a[], size_t a_len,
 527                    const enum date_token b[], size_t b_len)
 528 {
 529   size_t i;
 530
 531   if (a_len != b_len)
 532     return false;
 533
 534   for (i = 0; i < a_len; i++)
 535     if (!(a[i] & b[i]))
 536       return false;
 537
 538   return true;
 539 }
 540
 541 /* Guess which date or time format is most likely represented by
 542    G, and store it in F's type and d members.  (f->w is already
 543    initialized.) */
 544 static void
 545 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
 546 {
 547   unsigned int max = 0;
 548   int i, j;
 549
 550   /* Choose the date format matched by the most inputs.  Break
 551      ties by choosing the earliest in the array. */
 552   for (i = 0; i < DATE_SYNTAX_CNT; i = j)
 553     {
 554       unsigned int sum = g->date[i];
 555       for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
 556         {
 557           if (syntax[i].format != syntax[j].format)
 558             break;
 559           sum += g->date[j];
 560         }
 561       if (sum > max)
 562         {
 563           f->type = syntax[i].format;
 564           max = sum;
 565         }
 566     }
 567
 568   /* Formats that include a time have an optional seconds field.
 569      If we saw a seconds field in any of the inputs, make sure
 570      that the field width is large enough to include for them.
 571      (We use the minimum input width, but an output width would
 572      be equally appropriate, since all the time formats have the
 573      same minimum widths for input and output.)  */
 574   if (f->type == FMT_DATETIME || f->type == FMT_TIME
 575       || f->type == FMT_DTIME)
 576     {
 577       for (i = 0; i < DATE_SYNTAX_CNT; i++)
 578         if (g->date[i]
 579             && syntax[i].tokens[syntax[i].token_cnt - 1] == DT_SECOND)
 580           {
 581             f->d = g->decimals / g->count;
 582             f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
 583           }
 584     }
 585 }
 586
 587 /* Extracts the next date token from the string represented by S,
 588    which must not be an empty string, and advances *S past the
 589    end of the token.  Returns the parsed token, or 0 if no valid
 590    token was found.
 591
 592    TOKENS_SEEN should be a bitmap representing all the tokens
 593    already seen in this input; this is used to resolve some
 594    otherwise ambiguous parsing situation.  If a count of seconds
 595    is parsed, *DECIMALS is set to the number of digits after the
 596    decimal point.  */
 597 static enum date_token
 598 parse_date_token (struct substring *s, enum date_token tokens_seen,
 599                   int *decimals)
 600 {
 601   int c = ss_first (*s);
 602
 603   switch (c)
 604     {
 605     case '0': case '1': case '2': case '3': case '4':
 606     case '5': case '6': case '7': case '8': case '9':
 607       return parse_date_number (s, tokens_seen, decimals);
 608
 609     case '+':
 610     case '-':
 611       /* '+' or '-' at the start of a string, or following a
 612          space, could be the sign that optionally introduces a
 613          time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
 614          format, or "1/1/1978 +1:00" in DATETIME format. */
 615       if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
 616         {
 617           ss_advance (s, 1);
 618           ss_ltrim (s, ss_cstr (CC_DIGITS));
 619           return DT_DAY_COUNT | DT_HOUR;
 620         }
 621       else if (c == '+')
 622         return 0;
 623       /* Fall through. */
 624     case '/': case '.': case ',':
 625       ss_advance (s, 1);
 626       return DT_DELIM;
 627
 628     case ':':
 629       ss_advance (s, 1);
 630       return DT_COLON;
 631
 632     case ' ': case '\t': case '\v': case '\r': case '\n':
 633       {
 634         enum date_token token;
 635         ss_advance (s, 1);
 636         token = recognize_identifier_token (s);
 637         if (token)
 638           ss_match_byte_in (s, ss_cstr (CC_SPACES));
 639         else
 640           token = DT_DELIM | DT_SPACE;
 641         return token;
 642       }
 643
 644     default:
 645       return recognize_identifier_token (s);
 646
 647     case EOF:
 648       NOT_REACHED ();
 649     }
 650 }
 651
 652 /* Parses a digit sequence found in a date token.  Advances *S
 653    past the end of the token.  Returns the parsed token, or 0 if
 654    no valid token was found.
 655
 656    TOKENS_SEEN should be a bitmap representing all the tokens
 657    already seen in this input; this is used to resolve some
 658    otherwise ambiguous parsing situation.  If a count of seconds
 659    is parsed, *DECIMALS is set to the number of digits after the
 660    decimal point.*/
 661 static enum date_token
 662 parse_date_number (struct substring *s, enum date_token tokens_seen,
 663                    int *decimals)
 664 {
 665   long int value;
 666   size_t digit_cnt = ss_get_long (s, &value);
 667   enum date_token token = 0;
 668
 669   if (ss_match_byte (s, settings_get_decimal_char (FMT_F))
 670       && tokens_seen & DT_COLON
 671       && value <= 59)
 672     {
 673       /* Parse digits after the decimal point. */
 674       token = DT_SECOND;
 675       *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
 676     }
 677   else
 678     {
 679       if (value <= 4)
 680         token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
 681                  | DT_DAY_COUNT);
 682       else if (value <= 12)
 683         token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 684       else if (value <= 23)
 685         token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 686       else if (value <= 31)
 687         token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
 688       else if (value <= 52)
 689         token = DT_WEEK | DT_DAY_COUNT;
 690       else
 691         token = DT_DAY_COUNT;
 692
 693       if (digit_cnt == 2)
 694         {
 695           token |= DT_YEAR;
 696           if (value <= 59)
 697             token |= DT_MINUTE | DT_SECOND;
 698         }
 699       else if (digit_cnt == 4)
 700         token |= DT_YEAR;
 701     }
 702
 703   return token;
 704 }
 705
 706 /* Attempts to parse an identifier found in a date at the
 707    beginning of S.  Advances *S past the end of the token.
 708    Returns the parsed token, or 0 if no valid token was
 709    found.  */
 710 static enum date_token
 711 recognize_identifier_token (struct substring *s)
 712 {
 713   size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
 714   enum date_token token = 0;
 715   switch (length)
 716     {
 717     case 0:
 718       break;
 719
 720     case 1:
 721       switch (c_tolower (s->string[0]))
 722         {
 723         case 'i':
 724         case 'v':
 725         case 'x':
 726           token = DT_MONTH;
 727           break;
 728
 729         case 'q':
 730           token = DT_Q;
 731           break;
 732         }
 733       break;
 734
 735     case 2:
 736       {
 737         int s0 = c_tolower ((unsigned char) s->string[0]);
 738         int s1 = c_tolower ((unsigned char) s->string[1]);
 739         token = recognize_id2 (s0, s1, false);
 740         if (!token && s0 == 'w' && s1 == 'k')
 741           token = DT_WK;
 742       }
 743       break;
 744
 745     default:
 746       {
 747         int s0 = c_tolower ((unsigned char) s->string[0]);
 748         int s1 = c_tolower ((unsigned char) s->string[1]);
 749         int s2 = c_tolower ((unsigned char) s->string[2]);
 750         token = recognize_id2 (s0, s1, true);
 751         if (!token)
 752           token = recognize_id3 (s0, s1, s2, length > 3);
 753         if (!token && length == 4)
 754           {
 755             int s3 = c_tolower ((unsigned char) s->string[3]);
 756             if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
 757               token = DT_MONTH;
 758           }
 759       }
 760       break;
 761     }
 762   if (token)
 763     ss_advance (s, length);
 764   return token;
 765 }
 766
 767 static enum date_token
 768 recognize_id2 (int s0, int s1, bool more)
 769 {
 770   bool weekday;
 771   switch (s0)
 772     {
 773     case 's': weekday = s1 == 'a' || s1 == 'u'; break;
 774     case 'm': weekday = s1 == 'o'; break;
 775     case 't': weekday = s1 == 'u' || s1 == 'h'; break;
 776     case 'w': weekday = s1 == 'e'; break;
 777     case 'f': weekday = s1 == 'r'; break;
 778     default: weekday = false; break;
 779     }
 780   if (weekday)
 781     return DT_WEEKDAY;
 782
 783   if (!more)
 784     {
 785       bool month;
 786       switch (s0)
 787         {
 788         case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
 789         case 'v': month = s1 == 'i'; break;
 790         case 'x': month = s1 == 'i'; break;
 791         default: month = false; break;
 792         }
 793       if (month)
 794         return DT_MONTH;
 795     }
 796
 797   return 0;
 798 }
 799
 800 static enum date_token
 801 recognize_id3 (int s0, int s1, int s2, bool more)
 802 {
 803   bool month;
 804   switch (s0)
 805     {
 806     case 'j':
 807       month = ((s1 == 'a' && s2 == 'n')
 808                || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
 809       break;
 810     case 'f':
 811       month = s1 == 'e' && s2 == 'b';
 812       break;
 813     case 'm':
 814       month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
 815       break;
 816     case 'a':
 817       month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
 818       break;
 819     case 's':
 820       month = s1 == 'e' && s2 == 'p';
 821       break;
 822     case 'o':
 823       month = s1 == 'c' && s2 == 't';
 824       break;
 825     case 'n':
 826       month = s1 == 'o' && s2 == 'v';
 827       break;
 828     case 'd':
 829       month = s1 == 'e' && s2 == 'c';
 830       break;
 831     default:
 832       month = false;
 833     }
 834   if (month)
 835     return DT_MONTH | DT_ENGLISH_MONTH;
 836
 837   if (!more)
 838     {
 839       bool roman_month = false;
 840       switch (s0)
 841         {
 842         case 'i':
 843         case 'x':
 844           roman_month = s1 == 'i' && s2 == 'i';
 845           break;
 846         case 'v':
 847           roman_month = s1 == 'i' && s2 == 'i';
 848           break;
 849         }
 850       if (roman_month)
 851         return DT_MONTH;
 852     }
 853
 854   return 0;
 855 }
 856
 857
 858
 859