pintos-os.org Git - pspp/blob - src/language/data-io/get-data.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012,
   3                  2013 Free Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  17
  18 #include <config.h>
  19
  20 #include <stdlib.h>
  21
  22 #include <string.h>
  23
  24 #include "data/dataset.h"
  25 #include "data/dictionary.h"
  26 #include "data/format.h"
  27 #include "data/gnumeric-reader.h"
  28 #include "data/ods-reader.h"
  29 #include "data/spreadsheet-reader.h"
  30 #include "data/psql-reader.h"
  31 #include "data/settings.h"
  32 #include "language/command.h"
  33 #include "language/data-io/data-parser.h"
  34 #include "language/data-io/data-reader.h"
  35 #include "language/data-io/file-handle.h"
  36 #include "language/data-io/placement-parser.h"
  37 #include "language/lexer/format-parser.h"
  38 #include "language/lexer/lexer.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/message.h"
  42
  43 #include "gl/xalloc.h"
  44
  45 #include "gettext.h"
  46 #define _(msgid) gettext (msgid)
  47 #define N_(msgid) (msgid)
  48
  49 static bool parse_spreadsheet (struct lexer *lexer, char **filename,
  50                                struct spreadsheet_read_options *opts);
  51
  52 static void destroy_spreadsheet_read_info (struct spreadsheet_read_options *);
  53
  54 static int parse_get_txt (struct lexer *lexer, struct dataset *);
  55 static int parse_get_psql (struct lexer *lexer, struct dataset *);
  56
  57 int
  58 cmd_get_data (struct lexer *lexer, struct dataset *ds)
  59 {
  60   char *tok = NULL;
  61   struct spreadsheet_read_options opts;
  62
  63   opts.sheet_name = NULL;
  64   opts.sheet_index = -1;
  65   opts.cell_range = NULL;
  66   opts.read_names = false;
  67   opts.asw = -1;
  68
  69   lex_force_match (lexer, T_SLASH);
  70
  71   if (!lex_force_match_id (lexer, "TYPE"))
  72     goto error;
  73
  74   lex_force_match (lexer, T_EQUALS);
  75
  76   tok = strdup (lex_tokcstr (lexer));
  77   if (lex_match_id (lexer, "TXT"))
  78     {
  79       free (tok);
  80       return parse_get_txt (lexer, ds);
  81     }
  82   else if (lex_match_id (lexer, "PSQL"))
  83     {
  84       free (tok);
  85       return parse_get_psql (lexer, ds);
  86     }
  87   else if (lex_match_id (lexer, "GNM") ||
  88       lex_match_id (lexer, "ODS"))
  89     {
  90       char *filename = NULL;
  91       struct casereader *reader = NULL;
  92       struct dictionary *dict = NULL;
  93
  94       if (!parse_spreadsheet (lexer, &filename, &opts))
  95         goto error;
  96
  97       if ( 0 == strncasecmp (tok, "GNM", 3))
  98         {
  99           struct spreadsheet *spreadsheet = gnumeric_probe (filename, true);
 100           if (spreadsheet == NULL)
 101             goto error;
 102           reader = gnumeric_make_reader (spreadsheet, &opts);
 103           dict = spreadsheet->dict;
 104           gnumeric_destroy (spreadsheet);
 105         }
 106       else if (0 == strncasecmp (tok, "ODS", 3))
 107         {
 108           struct spreadsheet *spreadsheet = ods_probe (filename, true);
 109           if (spreadsheet == NULL)
 110             goto error;
 111           reader = ods_make_reader (spreadsheet, &opts);
 112           dict = spreadsheet->dict;
 113           ods_destroy (spreadsheet);
 114         }
 115
 116       free (filename);
 117
 118       if (reader)
 119         {
 120           dataset_set_dict (ds, dict);
 121           dataset_set_source (ds, reader);
 122           free (tok);
 123           destroy_spreadsheet_read_info (&opts);
 124           return CMD_SUCCESS;
 125         }
 126     }
 127   else
 128     msg (SE, _("Unsupported TYPE %s."), tok);
 129
 130
 131  error:
 132   destroy_spreadsheet_read_info (&opts);
 133   free (tok);
 134   return CMD_FAILURE;
 135 }
 136
 137 static int
 138 parse_get_psql (struct lexer *lexer, struct dataset *ds)
 139 {
 140   struct psql_read_info psql;
 141   psql.allow_clear = false;
 142   psql.conninfo = NULL;
 143   psql.str_width = -1;
 144   psql.bsize = -1;
 145   ds_init_empty (&psql.sql);
 146
 147   lex_force_match (lexer, T_SLASH);
 148
 149   if (!lex_force_match_id (lexer, "CONNECT"))
 150     goto error;
 151
 152   lex_force_match (lexer, T_EQUALS);
 153
 154   if (!lex_force_string (lexer))
 155     goto error;
 156
 157   psql.conninfo = ss_xstrdup (lex_tokss (lexer));
 158
 159   lex_get (lexer);
 160
 161   while (lex_match (lexer, T_SLASH) )
 162     {
 163       if ( lex_match_id (lexer, "ASSUMEDSTRWIDTH"))
 164         {
 165           lex_match (lexer, T_EQUALS);
 166           psql.str_width = lex_integer (lexer);
 167           lex_get (lexer);
 168         }
 169       else if ( lex_match_id (lexer, "BSIZE"))
 170         {
 171           lex_match (lexer, T_EQUALS);
 172           psql.bsize = lex_integer (lexer);
 173           lex_get (lexer);
 174         }
 175       else if ( lex_match_id (lexer, "UNENCRYPTED"))
 176         {
 177           psql.allow_clear = true;
 178         }
 179       else if (lex_match_id (lexer, "SQL"))
 180         {
 181           lex_match (lexer, T_EQUALS);
 182           if ( ! lex_force_string (lexer) )
 183             goto error;
 184
 185           ds_put_substring (&psql.sql, lex_tokss (lexer));
 186           lex_get (lexer);
 187         }
 188      }
 189   {
 190     struct dictionary *dict = NULL;
 191     struct casereader *reader = psql_open_reader (&psql, &dict);
 192
 193     if ( reader )
 194       {
 195         dataset_set_dict (ds, dict);
 196         dataset_set_source (ds, reader);
 197       }
 198   }
 199
 200   ds_destroy (&psql.sql);
 201   free (psql.conninfo);
 202
 203   return CMD_SUCCESS;
 204
 205  error:
 206
 207   ds_destroy (&psql.sql);
 208   free (psql.conninfo);
 209
 210   return CMD_FAILURE;
 211 }
 212
 213 static bool
 214 parse_spreadsheet (struct lexer *lexer, char **filename,
 215                    struct spreadsheet_read_options *opts)
 216 {
 217   opts->sheet_index = 1;
 218   opts->sheet_name = NULL;
 219   opts->cell_range = NULL;
 220   opts->read_names = true;
 221   opts->asw = -1;
 222
 223   lex_force_match (lexer, T_SLASH);
 224
 225   if (!lex_force_match_id (lexer, "FILE"))
 226     goto error;
 227
 228   lex_force_match (lexer, T_EQUALS);
 229
 230   if (!lex_force_string (lexer))
 231     goto error;
 232
 233   *filename  = utf8_to_filename (lex_tokcstr (lexer));
 234
 235   lex_get (lexer);
 236
 237   while (lex_match (lexer, T_SLASH) )
 238     {
 239       if ( lex_match_id (lexer, "ASSUMEDSTRWIDTH"))
 240         {
 241           lex_match (lexer, T_EQUALS);
 242           opts->asw = lex_integer (lexer);
 243           lex_get (lexer);
 244         }
 245       else if (lex_match_id (lexer, "SHEET"))
 246         {
 247           lex_match (lexer, T_EQUALS);
 248           if (lex_match_id (lexer, "NAME"))
 249             {
 250               if ( ! lex_force_string (lexer) )
 251                 goto error;
 252
 253               opts->sheet_name = ss_xstrdup (lex_tokss (lexer));
 254               opts->sheet_index = -1;
 255
 256               lex_get (lexer);
 257             }
 258           else if (lex_match_id (lexer, "INDEX"))
 259             {
 260               opts->sheet_index = lex_integer (lexer);
 261               if (opts->sheet_index <= 0)
 262                 {
 263                   msg (SE, _("The sheet index must be greater than or equal to 1"));
 264                   goto error;
 265                 }
 266               lex_get (lexer);
 267             }
 268           else
 269             {
 270               msg (SE, _("%s must be followed by either \"%s\" or \"%s\"."),
 271                    "/SHEET", "NAME", "INDEX");
 272               goto error;
 273             }
 274         }
 275       else if (lex_match_id (lexer, "CELLRANGE"))
 276         {
 277           lex_match (lexer, T_EQUALS);
 278
 279           if (lex_match_id (lexer, "FULL"))
 280             {
 281               opts->cell_range = NULL;
 282             }
 283           else if (lex_match_id (lexer, "RANGE"))
 284             {
 285               if ( ! lex_force_string (lexer) )
 286                 goto error;
 287
 288               opts->cell_range = ss_xstrdup (lex_tokss (lexer));
 289               lex_get (lexer);
 290             }
 291           else
 292             {
 293               msg (SE, _("%s must be followed by either \"%s\" or \"%s\"."),
 294                    "/CELLRANGE", "FULL", "RANGE");
 295               goto error;
 296             }
 297         }
 298       else if (lex_match_id (lexer, "READNAMES"))
 299         {
 300           lex_match (lexer, T_EQUALS);
 301
 302           if ( lex_match_id (lexer, "ON"))
 303             {
 304               opts->read_names = true;
 305             }
 306           else if (lex_match_id (lexer, "OFF"))
 307             {
 308               opts->read_names = false;
 309             }
 310           else
 311             {
 312               msg (SE, _("%s must be followed by either \"%s\" or \"%s\"."),
 313                    "/READNAMES", "ON", "OFF");
 314               goto error;
 315             }
 316         }
 317       else
 318         {
 319           lex_error (lexer, NULL);
 320           goto error;
 321         }
 322     }
 323
 324   return true;
 325
 326  error:
 327   return false;
 328 }
 329
 330
 331 static bool
 332 set_type (struct data_parser *parser, const char *subcommand,
 333           enum data_parser_type type, bool *has_type)
 334 {
 335   if (!*has_type)
 336     {
 337       data_parser_set_type (parser, type);
 338       *has_type = true;
 339     }
 340   else if (type != data_parser_get_type (parser))
 341     {
 342       msg (SE, _("%s is allowed only with %s arrangement, but %s arrangement "
 343                  "was stated or implied earlier in this command."),
 344            subcommand,
 345            type == DP_FIXED ? "FIXED" : "DELIMITED",
 346            type == DP_FIXED ? "DELIMITED" : "FIXED");
 347       return false;
 348     }
 349   return true;
 350 }
 351
 352 static int
 353 parse_get_txt (struct lexer *lexer, struct dataset *ds)
 354 {
 355   struct data_parser *parser = NULL;
 356   struct dictionary *dict = dict_create (get_default_encoding ());
 357   struct file_handle *fh = NULL;
 358   struct dfm_reader *reader = NULL;
 359   char *encoding = NULL;
 360   char *name = NULL;
 361
 362   int record;
 363   enum data_parser_type type;
 364   bool has_type;
 365
 366   lex_force_match (lexer, T_SLASH);
 367
 368   if (!lex_force_match_id (lexer, "FILE"))
 369     goto error;
 370   lex_force_match (lexer, T_EQUALS);
 371   fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
 372   if (fh == NULL)
 373     goto error;
 374
 375   parser = data_parser_create (dict);
 376   has_type = false;
 377   data_parser_set_type (parser, DP_DELIMITED);
 378   data_parser_set_span (parser, false);
 379   data_parser_set_quotes (parser, ss_empty ());
 380   data_parser_set_empty_line_has_field (parser, true);
 381
 382   for (;;)
 383     {
 384       if (!lex_force_match (lexer, T_SLASH))
 385         goto error;
 386
 387       if (lex_match_id (lexer, "ENCODING"))
 388         {
 389           lex_match (lexer, T_EQUALS);
 390           if (!lex_force_string (lexer))
 391             goto error;
 392
 393           free (encoding);
 394           encoding = ss_xstrdup (lex_tokss (lexer));
 395
 396           lex_get (lexer);
 397         }
 398       else if (lex_match_id (lexer, "ARRANGEMENT"))
 399         {
 400           bool ok;
 401
 402           lex_match (lexer, T_EQUALS);
 403           if (lex_match_id (lexer, "FIXED"))
 404             ok = set_type (parser, "ARRANGEMENT=FIXED", DP_FIXED, &has_type);
 405           else if (lex_match_id (lexer, "DELIMITED"))
 406             ok = set_type (parser, "ARRANGEMENT=DELIMITED",
 407                            DP_DELIMITED, &has_type);
 408           else
 409             {
 410               lex_error_expecting (lexer, "FIXED", "DELIMITED", NULL_SENTINEL);
 411               goto error;
 412             }
 413           if (!ok)
 414             goto error;
 415         }
 416       else if (lex_match_id (lexer, "FIRSTCASE"))
 417         {
 418           lex_match (lexer, T_EQUALS);
 419           if (!lex_force_int (lexer))
 420             goto error;
 421           if (lex_integer (lexer) < 1)
 422             {
 423               msg (SE, _("Value of FIRSTCASE must be 1 or greater."));
 424               goto error;
 425             }
 426           data_parser_set_skip (parser, lex_integer (lexer) - 1);
 427           lex_get (lexer);
 428         }
 429       else if (lex_match_id_n (lexer, "DELCASE", 4))
 430         {
 431           if (!set_type (parser, "DELCASE", DP_DELIMITED, &has_type))
 432             goto error;
 433           lex_match (lexer, T_EQUALS);
 434           if (lex_match_id (lexer, "LINE"))
 435             data_parser_set_span (parser, false);
 436           else if (lex_match_id (lexer, "VARIABLES"))
 437             {
 438               data_parser_set_span (parser, true);
 439
 440               /* VARIABLES takes an integer argument, but for no
 441                  good reason.  We just ignore it. */
 442               if (!lex_force_int (lexer))
 443                 goto error;
 444               lex_get (lexer);
 445             }
 446           else
 447             {
 448               lex_error_expecting (lexer, "LINE", "VARIABLES", NULL_SENTINEL);
 449               goto error;
 450             }
 451         }
 452       else if (lex_match_id (lexer, "FIXCASE"))
 453         {
 454           if (!set_type (parser, "FIXCASE", DP_FIXED, &has_type))
 455             goto error;
 456           lex_match (lexer, T_EQUALS);
 457           if (!lex_force_int (lexer))
 458             goto error;
 459           if (lex_integer (lexer) < 1)
 460             {
 461               msg (SE, _("Value of FIXCASE must be at least 1."));
 462               goto error;
 463             }
 464           data_parser_set_records (parser, lex_integer (lexer));
 465           lex_get (lexer);
 466         }
 467       else if (lex_match_id (lexer, "IMPORTCASES"))
 468         {
 469           lex_match (lexer, T_EQUALS);
 470           if (lex_match (lexer, T_ALL))
 471             {
 472               data_parser_set_case_limit (parser, -1);
 473               data_parser_set_case_percent (parser, 100);
 474             }
 475           else if (lex_match_id (lexer, "FIRST"))
 476             {
 477               if (!lex_force_int (lexer))
 478                 goto error;
 479               if (lex_integer (lexer) < 1)
 480                 {
 481                   msg (SE, _("Value of FIRST must be at least 1."));
 482                   goto error;
 483                 }
 484               data_parser_set_case_limit (parser, lex_integer (lexer));
 485               lex_get (lexer);
 486             }
 487           else if (lex_match_id (lexer, "PERCENT"))
 488             {
 489               if (!lex_force_int (lexer))
 490                 goto error;
 491               if (lex_integer (lexer) < 1 || lex_integer (lexer) > 100)
 492                 {
 493                   msg (SE, _("Value of PERCENT must be between 1 and 100."));
 494                   goto error;
 495                 }
 496               data_parser_set_case_percent (parser, lex_integer (lexer));
 497               lex_get (lexer);
 498             }
 499         }
 500       else if (lex_match_id_n (lexer, "DELIMITERS", 4))
 501         {
 502           struct string hard_seps = DS_EMPTY_INITIALIZER;
 503           const char *soft_seps = "";
 504           struct substring s;
 505           int c;
 506
 507           if (!set_type (parser, "DELIMITERS", DP_DELIMITED, &has_type))
 508             goto error;
 509           lex_match (lexer, T_EQUALS);
 510
 511           if (!lex_force_string (lexer))
 512             goto error;
 513
 514           /* XXX should support multibyte UTF-8 characters */
 515           s = lex_tokss (lexer);
 516           if (ss_match_string (&s, ss_cstr ("\\t")))
 517             ds_put_cstr (&hard_seps, "\t");
 518           if (ss_match_string (&s, ss_cstr ("\\\\")))
 519             ds_put_cstr (&hard_seps, "\\");
 520           while ((c = ss_get_byte (&s)) != EOF)
 521             if (c == ' ')
 522               soft_seps = " ";
 523             else
 524               ds_put_byte (&hard_seps, c);
 525           data_parser_set_soft_delimiters (parser, ss_cstr (soft_seps));
 526           data_parser_set_hard_delimiters (parser, ds_ss (&hard_seps));
 527           ds_destroy (&hard_seps);
 528
 529           lex_get (lexer);
 530         }
 531       else if (lex_match_id (lexer, "QUALIFIERS"))
 532         {
 533           if (!set_type (parser, "QUALIFIERS", DP_DELIMITED, &has_type))
 534             goto error;
 535           lex_match (lexer, T_EQUALS);
 536
 537           if (!lex_force_string (lexer))
 538             goto error;
 539
 540           /* XXX should support multibyte UTF-8 characters */
 541           if (settings_get_syntax () == COMPATIBLE
 542               && ss_length (lex_tokss (lexer)) != 1)
 543             {
 544               msg (SE, _("In compatible syntax mode, the QUALIFIER string "
 545                          "must contain exactly one character."));
 546               goto error;
 547             }
 548
 549           data_parser_set_quotes (parser, lex_tokss (lexer));
 550           lex_get (lexer);
 551         }
 552       else if (settings_get_syntax () == ENHANCED
 553                && lex_match_id (lexer, "ESCAPE"))
 554         data_parser_set_quote_escape (parser, true);
 555       else if (lex_match_id (lexer, "VARIABLES"))
 556         break;
 557       else
 558         {
 559           lex_error_expecting (lexer, "VARIABLES", NULL_SENTINEL);
 560           goto error;
 561         }
 562     }
 563   lex_match (lexer, T_EQUALS);
 564
 565   record = 1;
 566   type = data_parser_get_type (parser);
 567   do
 568     {
 569       struct fmt_spec input, output;
 570       struct variable *v;
 571       int fc, lc;
 572
 573       while (type == DP_FIXED && lex_match (lexer, T_SLASH))
 574         {
 575           if (!lex_force_int (lexer))
 576             goto error;
 577           if (lex_integer (lexer) < record)
 578             {
 579               msg (SE, _("The record number specified, %ld, is at or "
 580                          "before the previous record, %d.  Data "
 581                          "fields must be listed in order of "
 582                          "increasing record number."),
 583                    lex_integer (lexer), record);
 584               goto error;
 585             }
 586           if (lex_integer (lexer) > data_parser_get_records (parser))
 587             {
 588               msg (SE, _("The record number specified, %ld, exceeds "
 589                          "the number of records per case specified "
 590                          "on FIXCASE, %d."),
 591                    lex_integer (lexer), data_parser_get_records (parser));
 592               goto error;
 593             }
 594           record = lex_integer (lexer);
 595           lex_get (lexer);
 596         }
 597
 598       if (!lex_force_id (lexer)
 599           || !dict_id_is_valid (dict, lex_tokcstr (lexer), true))
 600         goto error;
 601       name = xstrdup (lex_tokcstr (lexer));
 602       lex_get (lexer);
 603
 604       if (type == DP_DELIMITED)
 605         {
 606           if (!parse_format_specifier (lexer, &input)
 607               || !fmt_check_input (&input))
 608             goto error;
 609
 610           output = fmt_for_output_from_input (&input);
 611         }
 612       else
 613         {
 614           char fmt_type_name[FMT_TYPE_LEN_MAX + 1];
 615           enum fmt_type fmt_type;
 616           int w, d;
 617
 618           if (!parse_column_range (lexer, 0, &fc, &lc, NULL))
 619             goto error;
 620
 621           /* Accept a format (e.g. F8.2) or just a type name (e.g. DOLLAR).  */
 622           if (!parse_abstract_format_specifier (lexer, fmt_type_name, &w, &d))
 623             goto error;
 624           if (!fmt_from_name (fmt_type_name, &fmt_type))
 625             {
 626               msg (SE, _("Unknown format type `%s'."), fmt_type_name);
 627               goto error;
 628             }
 629
 630           /* Compose input format. */
 631           input.type = fmt_type;
 632           input.w = lc - fc + 1;
 633           input.d = 0;
 634           if (!fmt_check_input (&input))
 635             goto error;
 636
 637           /* Compose output format. */
 638           if (w != 0)
 639             {
 640               output.type = fmt_type;
 641               output.w = w;
 642               output.d = d;
 643               if (!fmt_check_output (&output))
 644                 goto error;
 645             }
 646           else
 647             output = fmt_for_output_from_input (&input);
 648         }
 649
 650       v = dict_create_var (dict, name, fmt_var_width (&input));
 651       if (v == NULL)
 652         {
 653           msg (SE, _("%s is a duplicate variable name."), name);
 654           goto error;
 655         }
 656       var_set_both_formats (v, &output);
 657
 658       if (type == DP_DELIMITED)
 659         data_parser_add_delimited_field (parser, &input,
 660                                          var_get_case_index (v),
 661                                          name);
 662       else
 663         data_parser_add_fixed_field (parser, &input, var_get_case_index (v),
 664                                      name, record, fc);
 665       free (name);
 666       name = NULL;
 667     }
 668   while (lex_token (lexer) != T_ENDCMD);
 669
 670   reader = dfm_open_reader (fh, lexer, encoding);
 671   if (reader == NULL)
 672     goto error;
 673
 674   data_parser_make_active_file (parser, ds, reader, dict);
 675   fh_unref (fh);
 676   free (encoding);
 677   return CMD_SUCCESS;
 678
 679  error:
 680   data_parser_destroy (parser);
 681   dict_destroy (dict);
 682   fh_unref (fh);
 683   free (name);
 684   free (encoding);
 685   return CMD_CASCADING_FAILURE;
 686 }
 687
 688
 689 static void
 690 destroy_spreadsheet_read_info (struct spreadsheet_read_options *opts)
 691 {
 692   free (opts->cell_range);
 693   free (opts->sheet_name);
 694 }