pintos-os.org Git - pspp/blob - src/data/ods-reader.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/message.h"
  20 #include "libpspp/misc.h"
  21
  22 #include "data/data-in.h"
  23
  24 #include "gl/minmax.h"
  25
  26 #include "gettext.h"
  27 #define _(msgid) gettext (msgid)
  28 #define N_(msgid) (msgid)
  29
  30 #include "ods-reader.h"
  31 #include "spreadsheet-reader.h"
  32
  33 #if !ODF_READ_SUPPORT
  34
  35 struct casereader *
  36 ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict)
  37 {
  38   msg (ME, _("Support for %s files was not compiled into this installation of PSPP"), "OpenDocument");
  39
  40   return NULL;
  41 }
  42
  43 #else
  44
  45 #include "libpspp/zip-reader.h"
  46
  47
  48 #include <assert.h>
  49 #include <stdbool.h>
  50 #include <errno.h>
  51 #include <libxml/xmlreader.h>
  52 #include <zlib.h>
  53
  54 #include "data/format.h"
  55 #include "data/case.h"
  56 #include "data/casereader-provider.h"
  57 #include "data/dictionary.h"
  58 #include "data/identifier.h"
  59 #include "data/value.h"
  60 #include "data/variable.h"
  61 #include "libpspp/i18n.h"
  62 #include "libpspp/str.h"
  63
  64 #include "gl/xalloc.h"
  65
  66 static void ods_file_casereader_destroy (struct casereader *, void *);
  67
  68 static struct ccase *ods_file_casereader_read (struct casereader *, void *);
  69
  70 static const struct casereader_class ods_file_casereader_class =
  71   {
  72     ods_file_casereader_read,
  73     ods_file_casereader_destroy,
  74     NULL,
  75     NULL,
  76   };
  77
  78 enum reader_state
  79   {
  80     STATE_INIT = 0,        /* Initial state */
  81     STATE_SPREADSHEET,     /* Found the start of the spreadsheet doc */
  82     STATE_TABLE,           /* Found the sheet that we actually want */
  83     STATE_ROW,             /* Found the start of the cell array */
  84     STATE_CELL,            /* Found a cell */
  85     STATE_CELL_CONTENT     /* Found a the text within a cell */
  86   };
  87
  88 struct ods_reader
  89 {
  90   xmlTextReaderPtr xtr;
  91
  92   enum reader_state state;
  93   bool sheet_found;
  94   int row;
  95   int col;
  96   int node_type;
  97   int sheet_index;
  98
  99   const xmlChar *target_sheet;
 100   int target_sheet_index;
 101
 102   int start_row;
 103   int start_col;
 104   int stop_row;
 105   int stop_col;
 106
 107   struct caseproto *proto;
 108   struct dictionary *dict;
 109   struct ccase *first_case;
 110   bool used_first_case;
 111   bool read_names;
 112
 113   struct string ods_errs;
 114   int span;
 115 };
 116
 117 static void process_node (struct ods_reader *r);
 118
 119 static void
 120 ods_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
 121 {
 122   struct ods_reader *r = r_;
 123   if ( r == NULL)
 124     return ;
 125
 126   if (r->xtr)
 127     xmlFreeTextReader (r->xtr);
 128
 129   if ( ! ds_is_empty (&r->ods_errs))
 130     msg (ME, "%s", ds_cstr (&r->ods_errs));
 131
 132   ds_destroy (&r->ods_errs);
 133
 134   if ( ! r->used_first_case )
 135     case_unref (r->first_case);
 136
 137   caseproto_unref (r->proto);
 138
 139   free (r);
 140 }
 141
 142 static void
 143 process_node (struct ods_reader *r)
 144 {
 145   xmlChar *name = xmlTextReaderName (r->xtr);
 146   if (name == NULL)
 147     name = xmlStrdup (_xml ("--"));
 148
 149   r->node_type = xmlTextReaderNodeType (r->xtr);
 150
 151   switch ( r->state)
 152     {
 153     case STATE_INIT:
 154       if (0 == xmlStrcasecmp (name, _xml("office:spreadsheet")) &&
 155           XML_READER_TYPE_ELEMENT  == r->node_type)
 156         {
 157           r->state = STATE_SPREADSHEET;
 158         }
 159       break;
 160     case STATE_SPREADSHEET:
 161       if (0 == xmlStrcasecmp (name, _xml("table:table")))
 162         {
 163           if (XML_READER_TYPE_ELEMENT == r->node_type)
 164             {
 165               r->col = -1;
 166               r->row = -1;
 167               ++r->sheet_index;
 168               if ( r->target_sheet != NULL)
 169                 {
 170                   xmlChar *value = xmlTextReaderGetAttribute (r->xtr, _xml ("table:name"));
 171                   if ( 0 == xmlStrcmp (value, r->target_sheet))
 172                     {
 173                       r->sheet_found = true;
 174                       r->state = STATE_TABLE;
 175                     }
 176                   free (value);
 177                 }
 178               else if (r->target_sheet_index == r->sheet_index)
 179                 {
 180                   r->sheet_found = true;
 181                   r->state = STATE_TABLE;
 182                 }
 183               else if ( r->target_sheet_index == -1)
 184                 r->state = STATE_TABLE;
 185             }
 186         }
 187       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type
 188                    && r->sheet_found)
 189         {
 190           r->state = STATE_INIT;
 191         }
 192         break;
 193     case STATE_TABLE:
 194       if (0 == xmlStrcasecmp (name, _xml("table:table-row")) )
 195         {
 196           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 197             {
 198               if (! xmlTextReaderIsEmptyElement (r->xtr))
 199                 {
 200                   r->state = STATE_ROW;
 201                 }
 202               r->row++;
 203               r->span = 1;
 204             }
 205         }
 206       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 207         {
 208           r->state = STATE_SPREADSHEET;
 209         }
 210       break;
 211     case STATE_ROW:
 212       if (0 == xmlStrcasecmp (name, _xml ("table:table-cell")))
 213         {
 214           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 215             {
 216               xmlChar *value =
 217                 xmlTextReaderGetAttribute (r->xtr,
 218                                            _xml ("table:number-columns-repeated"));
 219               r->col += r->span;
 220               r->span = value ? _xmlchar_to_int (value) : 1;
 221               free (value);
 222               if (! xmlTextReaderIsEmptyElement (r->xtr))
 223                 {
 224                   r->state = STATE_CELL;
 225                 }
 226             }
 227         }
 228       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 229         {
 230           r->state = STATE_TABLE;
 231           r->col = -1;
 232           /* Set the span back to the default */
 233           r->span = 1;
 234         }
 235       break;
 236     case STATE_CELL:
 237       if (0 == xmlStrcasecmp (name, _xml("text:p")))
 238         {
 239           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 240             {
 241               r->state = STATE_CELL_CONTENT;
 242             }
 243         }
 244       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 245         {
 246           r->state = STATE_ROW;
 247         }
 248       break;
 249     case STATE_CELL_CONTENT:
 250       if (XML_READER_TYPE_TEXT != r->node_type)
 251         r->state = STATE_CELL;
 252       break;
 253     default:
 254       break;
 255     };
 256
 257   xmlFree (name);
 258 }
 259
 260 /*
 261    A struct containing the parameters of a cell's value
 262    parsed from the xml
 263 */
 264 struct xml_value
 265 {
 266   xmlChar *type;
 267   xmlChar *value;
 268   xmlChar *text;
 269 };
 270
 271 struct var_spec
 272 {
 273   char *name;
 274   struct xml_value firstval;
 275 };
 276
 277
 278 /* Determine the width that a xmv should probably have */
 279 static int
 280 xmv_to_width (const struct xml_value *xmv, int fallback)
 281 {
 282   int width = SPREADSHEET_DEFAULT_WIDTH;
 283
 284   /* Non-strings always have zero width */
 285   if (xmv->type != NULL && 0 != xmlStrcmp (xmv->type, _xml("string")))
 286     return 0;
 287
 288   if ( fallback != -1)
 289     return fallback;
 290
 291   if ( xmv->value )
 292     width = ROUND_UP (xmlStrlen (xmv->value),
 293                       SPREADSHEET_DEFAULT_WIDTH);
 294   else if ( xmv->text)
 295     width = ROUND_UP (xmlStrlen (xmv->text),
 296                       SPREADSHEET_DEFAULT_WIDTH);
 297
 298   return width;
 299 }
 300
 301 /*
 302    Sets the VAR of case C, to the value corresponding to the xml data
 303  */
 304 static void
 305 convert_xml_to_value (struct ccase *c, const struct variable *var,
 306                       const struct xml_value *xmv)
 307 {
 308   union value *v = case_data_rw (c, var);
 309
 310   if (xmv->value == NULL && xmv->text == NULL)
 311     value_set_missing (v, var_get_width (var));
 312   else if ( var_is_alpha (var))
 313     /* Use the text field, because it seems that there is no
 314        value field for strings */
 315     value_copy_str_rpad (v, var_get_width (var), xmv->text, ' ');
 316   else
 317     {
 318       const char *text ;
 319       const struct fmt_spec *fmt = var_get_write_format (var);
 320       enum fmt_category fc  = fmt_get_category (fmt->type);
 321
 322       assert ( fc != FMT_CAT_STRING);
 323
 324       text =
 325         xmv->value ? CHAR_CAST (const char *, xmv->value) : CHAR_CAST (const char *, xmv->text);
 326
 327       data_in (ss_cstr (text), "UTF-8",
 328                fmt->type,
 329                v,
 330                var_get_width (var),
 331                "UTF-8");
 332     }
 333 }
 334
 335
 336 struct casereader *
 337 ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict)
 338 {
 339   int ret = 0;
 340   xmlChar *type = NULL;
 341   unsigned long int vstart = 0;
 342   casenumber n_cases = CASENUMBER_MAX;
 343   int i;
 344   struct var_spec *var_spec = NULL;
 345   int n_var_specs = 0;
 346
 347   struct ods_reader *r = xzalloc (sizeof *r);
 348   struct zip_member *content = NULL;
 349   struct zip_reader *zreader ;
 350   xmlChar *val_string = NULL;
 351
 352   r->read_names = gri->read_names;
 353   ds_init_empty (&r->ods_errs);
 354
 355   zreader = zip_reader_create (gri->file_name, &r->ods_errs);
 356
 357   if ( NULL == zreader)
 358     {
 359       msg (ME, _("Error opening `%s' for reading as a OpenDocument spreadsheet file: %s."),
 360            gri->file_name, ds_cstr (&r->ods_errs));
 361
 362       goto error;
 363     }
 364
 365   content = zip_member_open (zreader, "content.xml");
 366   if ( NULL == content)
 367     {
 368       msg (ME, _("Could not extract OpenDocument spreadsheet from file `%s': %s."),
 369            gri->file_name, ds_cstr (&r->ods_errs));
 370
 371       goto error;
 372     }
 373
 374   zip_member_ref (content);
 375
 376   r->xtr = xmlReaderForIO ((xmlInputReadCallback) zip_member_read,
 377                            (xmlInputCloseCallback) zip_member_finish,
 378                            content,   NULL, NULL, XML_PARSE_RECOVER);
 379
 380   if ( r->xtr == NULL)
 381     {
 382       goto error;
 383     }
 384
 385   if ( gri->cell_range )
 386     {
 387       if ( ! convert_cell_ref (gri->cell_range,
 388                                &r->start_col, &r->start_row,
 389                                &r->stop_col, &r->stop_row))
 390         {
 391           msg (SE, _("Invalid cell range `%s'"),
 392                gri->cell_range);
 393           goto error;
 394         }
 395     }
 396   else
 397     {
 398       r->start_col = 0;
 399       r->start_row = 0;
 400       r->stop_col = -1;
 401       r->stop_row = -1;
 402     }
 403
 404   r->state = STATE_INIT;
 405   r->target_sheet = BAD_CAST gri->sheet_name;
 406   r->target_sheet_index = gri->sheet_index;
 407   r->row = r->col = -1;
 408   r->sheet_index = 0;
 409
 410
 411   /* If CELLRANGE was given, then we know how many variables should be read */
 412   if ( r->stop_col != -1 )
 413     {
 414       assert (var_spec == NULL);
 415       n_var_specs =  r->stop_col - r->start_col + 1;
 416       var_spec = xrealloc (var_spec, sizeof (*var_spec) * n_var_specs);
 417       memset (var_spec, '\0', sizeof (*var_spec) * n_var_specs);
 418     }
 419
 420
 421   /* Advance to the start of the cells for the target sheet */
 422   while ( (r->row < r->start_row ))
 423     {
 424       if (1 != (ret = xmlTextReaderRead (r->xtr)))
 425            break;
 426
 427       process_node (r);
 428     }
 429
 430   if (ret < 1)
 431     {
 432       msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
 433            gri->file_name);
 434       goto error;
 435     }
 436
 437   if ( gri->read_names)
 438     {
 439       while (1 == (ret = xmlTextReaderRead (r->xtr)))
 440         {
 441           int idx;
 442           process_node (r);
 443           if ( r->row > r->start_row)
 444             break;
 445
 446           if (r->col == -1 && r->row == r->start_row)
 447             break;
 448
 449           if ( r->col < r->start_col)
 450             continue;
 451
 452           idx = r->col - r->start_col;
 453
 454           if (r->state == STATE_CELL_CONTENT
 455               &&
 456               XML_READER_TYPE_TEXT  == r->node_type)
 457             {
 458               xmlChar *value = xmlTextReaderValue (r->xtr);
 459               if ( idx >= n_var_specs)
 460                 {
 461
 462                   var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
 463
 464                   /* xrealloc (unlike realloc) doesn't initialise its memory to 0 */
 465                   memset (var_spec + n_var_specs,
 466                           0,
 467                           (n_var_specs - idx + 1) * sizeof (*var_spec));
 468                   n_var_specs = idx + 1;
 469                 }
 470               var_spec[idx].firstval.text = 0;
 471               var_spec[idx].firstval.value = 0;
 472               var_spec[idx].firstval.type = 0;
 473
 474               var_spec [idx].name = strdup (CHAR_CAST (const char *, value));
 475               free (value);
 476               value = NULL;
 477             }
 478         }
 479     }
 480
 481   /* Read in the first row of data */
 482   while (1 == xmlTextReaderRead (r->xtr))
 483     {
 484       int idx;
 485       process_node (r);
 486       if ( r->row >= r->start_row + 1 + gri->read_names)
 487         break;
 488
 489       if ( r->col < r->start_col)
 490         continue;
 491
 492       if ( r->col - r->start_col + 1 > n_var_specs)
 493         continue;
 494
 495       idx = r->col - r->start_col;
 496
 497       if ( r->state == STATE_CELL &&
 498            XML_READER_TYPE_ELEMENT  == r->node_type)
 499         {
 500           type = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value-type"));
 501           val_string = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value"));
 502         }
 503
 504       if ( r->state == STATE_CELL_CONTENT &&
 505            XML_READER_TYPE_TEXT  == r->node_type)
 506         {
 507           var_spec [idx].firstval.type = type;
 508           var_spec [idx].firstval.text = xmlTextReaderValue (r->xtr);
 509           var_spec [idx].firstval.value = val_string;
 510           val_string = NULL;
 511           type = NULL;
 512         }
 513     }
 514
 515   /* Create the dictionary and populate it */
 516   *dict = r->dict = dict_create (
 517     CHAR_CAST (const char *, xmlTextReaderConstEncoding (r->xtr)));
 518
 519   for (i = 0 ; i < n_var_specs ; ++i )
 520     {
 521       struct fmt_spec fmt;
 522       struct variable *var = NULL;
 523       char *name = dict_make_unique_var_name (r->dict, var_spec[i].name, &vstart);
 524       int width  = xmv_to_width (&var_spec[i].firstval, gri->asw);
 525       dict_create_var (r->dict, name, width);
 526       free (name);
 527
 528       var = dict_get_var (r->dict, i);
 529
 530       if ( 0 == xmlStrcmp (var_spec[i].firstval.type, _xml("date")))
 531         {
 532           fmt.type = FMT_DATE;
 533           fmt.d = 0;
 534           fmt.w = 20;
 535         }
 536       else
 537         fmt = fmt_default_for_width (width);
 538
 539       var_set_both_formats (var, &fmt);
 540     }
 541
 542   /* Create the first case, and cache it */
 543   r->used_first_case = false;
 544
 545   if ( n_var_specs ==  0 )
 546     {
 547       msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
 548            gri->file_name);
 549       goto error;
 550     }
 551
 552   r->proto = caseproto_ref (dict_get_proto (r->dict));
 553   r->first_case = case_create (r->proto);
 554   case_set_missing (r->first_case);
 555
 556   for ( i = 0 ; i < n_var_specs ; ++i )
 557     {
 558       const struct variable *var = dict_get_var (r->dict, i);
 559
 560       convert_xml_to_value (r->first_case, var,  &var_spec[i].firstval);
 561     }
 562
 563   zip_reader_destroy (zreader);
 564
 565   for ( i = 0 ; i < n_var_specs ; ++i )
 566     {
 567       free (var_spec[i].firstval.type);
 568       free (var_spec[i].firstval.value);
 569       free (var_spec[i].firstval.text);
 570       free (var_spec[i].name);
 571     }
 572
 573   free (var_spec);
 574
 575   return casereader_create_sequential
 576     (NULL,
 577      r->proto,
 578      n_cases,
 579      &ods_file_casereader_class, r);
 580
 581  error:
 582
 583   zip_reader_destroy (zreader);
 584
 585   for ( i = 0 ; i < n_var_specs ; ++i )
 586     {
 587       free (var_spec[i].firstval.type);
 588       free (var_spec[i].firstval.value);
 589       free (var_spec[i].firstval.text);
 590       free (var_spec[i].name);
 591     }
 592
 593   free (var_spec);
 594
 595   return NULL;
 596 }
 597
 598
 599 /* Reads and returns one case from READER's file.  Returns a null
 600    pointer on failure. */
 601 static struct ccase *
 602 ods_file_casereader_read (struct casereader *reader UNUSED, void *r_)
 603 {
 604   struct ccase *c = NULL;
 605   xmlChar *val_string = NULL;
 606   struct ods_reader *r = r_;
 607   int current_row = r->row;
 608
 609   if ( r->row == -1)
 610     return NULL;
 611
 612   if ( !r->used_first_case )
 613     {
 614       r->used_first_case = true;
 615       return r->first_case;
 616     }
 617
 618
 619   if ( r->state > STATE_INIT)
 620     {
 621       c = case_create (r->proto);
 622       case_set_missing (c);
 623     }
 624
 625   while (1 == xmlTextReaderRead (r->xtr))
 626     {
 627       process_node (r);
 628       if ( r->row > current_row)
 629         {
 630           break;
 631         }
 632       if ( r->col < r->start_col || (r->stop_col != -1 && r->col > r->stop_col))
 633         {
 634           continue;
 635         }
 636       if ( r->col - r->start_col >= caseproto_get_n_widths (r->proto))
 637         {
 638           continue;
 639         }
 640       if ( r->stop_row != -1 && r->row > r->stop_row)
 641         {
 642           continue;
 643         }
 644       if ( r->state == STATE_CELL &&
 645            r->node_type == XML_READER_TYPE_ELEMENT )
 646         {
 647           val_string = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value"));
 648         }
 649
 650       if ( r->state == STATE_CELL_CONTENT && r->node_type == XML_READER_TYPE_TEXT )
 651         {
 652           int col;
 653           struct xml_value *xmv = xzalloc (sizeof *xmv);
 654           xmv->text = xmlTextReaderValue (r->xtr);
 655           xmv->value = val_string;
 656           val_string = NULL;
 657
 658           for (col = 0; col < r->span ; ++col)
 659             {
 660               const int idx = r->col + col - r->start_col;
 661
 662               const struct variable *var = dict_get_var (r->dict, idx);
 663
 664               convert_xml_to_value (c, var, xmv);
 665             }
 666           free (xmv->text);
 667           free (xmv->value);
 668           free (xmv);
 669         }
 670
 671       if ( r->state < STATE_TABLE)
 672         break;
 673     }
 674
 675   if (NULL == c || (r->stop_row != -1 && r->row > r->stop_row + 1))
 676     {
 677       case_unref (c);
 678       return NULL;
 679     }
 680   else
 681     {
 682       return c;
 683     }
 684 }
 685 #endif