src/data/ods-reader.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/message.h"
  20 #include "libpspp/misc.h"
  21
  22 #include "data/data-in.h"
  23
  24 #include "gl/minmax.h"
  25
  26 #include "gettext.h"
  27 #define _(msgid) gettext (msgid)
  28 #define N_(msgid) (msgid)
  29
  30 #include "ods-reader.h"
  31 #include "spreadsheet-reader.h"
  32
  33 #if !ODF_READ_SUPPORT
  34
  35 struct casereader *
  36 ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict)
  37 {
  38   msg (ME, _("Support for %s files was not compiled into this installation of PSPP"), "OpenDocument");
  39
  40   return NULL;
  41 }
  42
  43 #else
  44
  45 #include "libpspp/zip-reader.h"
  46
  47
  48 #include <assert.h>
  49 #include <stdbool.h>
  50 #include <errno.h>
  51 #include <libxml/xmlreader.h>
  52 #include <zlib.h>
  53
  54 #include "data/format.h"
  55 #include "data/case.h"
  56 #include "data/casereader-provider.h"
  57 #include "data/dictionary.h"
  58 #include "data/identifier.h"
  59 #include "data/value.h"
  60 #include "data/variable.h"
  61 #include "libpspp/i18n.h"
  62 #include "libpspp/str.h"
  63
  64 #include "gl/xalloc.h"
  65
/* Callbacks installed in ods_file_casereader_class; both are defined
   later in this file. */
static void ods_file_casereader_destroy (struct casereader *, void *);

static struct ccase *ods_file_casereader_read (struct casereader *, void *);

/* Casereader class for OpenDocument spreadsheets.  Only the first two
   members are supplied; the remaining two are left null. */
static const struct casereader_class ods_file_casereader_class =
  {
    ods_file_casereader_read,
    ods_file_casereader_destroy,
    NULL,
    NULL,
  };
  77
/* States of the parser driven by process_node().  The order matters:
   other code in this file compares states with < and >. */
enum reader_state
  {
    STATE_INIT = 0,        /* Initial state */
    STATE_SPREADSHEET,     /* Found the start of the spreadsheet doc */
    STATE_TABLE,           /* Found the sheet that we actually want */
    STATE_ROW,             /* Found the start of the cell array */
    STATE_CELL,            /* Found a cell */
    STATE_CELL_CONTENT     /* Found the text within a cell */
  };
  87
/* State of one open OpenDocument spreadsheet.  This is the auxiliary
   data handed to the casereader callbacks. */
struct ods_reader
{
  xmlTextReaderPtr xtr;        /* libxml2 reader over "content.xml". */

  enum reader_state state;     /* Current parser state. */
  bool sheet_found;            /* Set once the requested sheet is matched. */
  int row;                     /* Current row; -1 before the first row. */
  int col;                     /* Current column; -1 outside of any cell. */
  int node_type;               /* Type of the node most recently processed. */
  int sheet_index;             /* Number of table:table elements seen. */

  const xmlChar *target_sheet; /* Name of the wanted sheet, or null. */
  int target_sheet_index;      /* Index of the wanted sheet; with -1 every
                                  sheet is entered. */

  /* Cell range to read.  A stop value of -1 means "no limit". */
  int start_row;
  int start_col;
  int stop_row;
  int stop_col;

  struct caseproto *proto;     /* Prototype of the cases produced. */
  struct dictionary *dict;     /* Dictionary built from the sheet. */
  struct ccase *first_case;    /* First data row, read when opening. */
  bool used_first_case;        /* True once first_case was handed out. */
  bool read_names;             /* Take variable names from the first row? */

  struct string ods_errs;      /* Error text; reported on destruction. */
  int span;                    /* table:number-columns-repeated of the
                                  current cell (1 if absent). */
};
 116
/* Advances R's state machine by the node its XML reader is positioned on. */
static void process_node (struct ods_reader *r);
 118
 119 static void
 120 ods_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
 121 {
 122   struct ods_reader *r = r_;
 123   if ( r == NULL)
 124     return ;
 125
 126   if (r->xtr)
 127     xmlFreeTextReader (r->xtr);
 128
 129   if ( ! ds_is_empty (&r->ods_errs))
 130     msg (ME, ds_cstr (&r->ods_errs));
 131
 132   ds_destroy (&r->ods_errs);
 133
 134   if ( ! r->used_first_case )
 135     case_unref (r->first_case);
 136
 137   caseproto_unref (r->proto);
 138
 139   free (r);
 140 }
 141
 142 static void
 143 process_node (struct ods_reader *r)
 144 {
 145   xmlChar *name = xmlTextReaderName (r->xtr);
 146   if (name == NULL)
 147     name = xmlStrdup (_xml ("--"));
 148
 149   r->node_type = xmlTextReaderNodeType (r->xtr);
 150
 151   switch ( r->state)
 152     {
 153     case STATE_INIT:
 154       if (0 == xmlStrcasecmp (name, _xml("office:spreadsheet")) &&
 155           XML_READER_TYPE_ELEMENT  == r->node_type)
 156         {
 157           r->state = STATE_SPREADSHEET;
 158         }
 159       break;
 160     case STATE_SPREADSHEET:
 161       if (0 == xmlStrcasecmp (name, _xml("table:table")))
 162         {
 163           if (XML_READER_TYPE_ELEMENT == r->node_type)
 164             {
 165               r->col = -1;
 166               r->row = -1;
 167               ++r->sheet_index;
 168               if ( r->target_sheet != NULL)
 169                 {
 170                   xmlChar *value = xmlTextReaderGetAttribute (r->xtr, _xml ("table:name"));
 171                   if ( 0 == xmlStrcmp (value, r->target_sheet))
 172                     {
 173                       r->sheet_found = true;
 174                       r->state = STATE_TABLE;
 175                     }
 176                   free (value);
 177                 }
 178               else if (r->target_sheet_index == r->sheet_index)
 179                 {
 180                   r->sheet_found = true;
 181                   r->state = STATE_TABLE;
 182                 }
 183               else if ( r->target_sheet_index == -1)
 184                 r->state = STATE_TABLE;
 185             }
 186         }
 187       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type
 188                    && r->sheet_found)
 189         {
 190           r->state = STATE_INIT;
 191         }
 192         break;
 193     case STATE_TABLE:
 194       if (0 == xmlStrcasecmp (name, _xml("table:table-row")) )
 195         {
 196           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 197             {
 198               if (! xmlTextReaderIsEmptyElement (r->xtr))
 199                 {
 200                   r->state = STATE_ROW;
 201                 }
 202               r->row++;
 203               r->span = 1;
 204             }
 205         }
 206       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 207         {
 208           r->state = STATE_SPREADSHEET;
 209         }
 210       break;
 211     case STATE_ROW:
 212       if (0 == xmlStrcasecmp (name, _xml ("table:table-cell")))
 213         {
 214           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 215             {
 216               xmlChar *value =
 217                 xmlTextReaderGetAttribute (r->xtr,
 218                                            _xml ("table:number-columns-repeated"));
 219               r->col += r->span;
 220               r->span = value ? _xmlchar_to_int (value) : 1;
 221               free (value);
 222               if (! xmlTextReaderIsEmptyElement (r->xtr))
 223                 {
 224                   r->state = STATE_CELL;
 225                 }
 226             }
 227         }
 228       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 229         {
 230           r->state = STATE_TABLE;
 231           r->col = -1;
 232           /* Set the span back to the default */
 233           r->span = 1;
 234         }
 235       break;
 236     case STATE_CELL:
 237       if (0 == xmlStrcasecmp (name, _xml("text:p")))
 238         {
 239           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 240             {
 241               r->state = STATE_CELL_CONTENT;
 242             }
 243         }
 244       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 245         {
 246           r->state = STATE_ROW;
 247         }
 248       break;
 249     case STATE_CELL_CONTENT:
 250       if (XML_READER_TYPE_TEXT != r->node_type)
 251         r->state = STATE_CELL;
 252       break;
 253     default:
 254       break;
 255     };
 256
 257   xmlFree (name);
 258 }
 259
/*
   A struct containing the parameters of a cell's value
   parsed from the xml.  Any member may be null.
*/
struct xml_value
{
  xmlChar *type;    /* The cell's office:value-type attribute. */
  xmlChar *value;   /* The cell's office:value attribute. */
  xmlChar *text;    /* The text found inside the cell. */
};
 270
/* What is known about one prospective variable while the sheet is
   being opened. */
struct var_spec
{
  char *name;                 /* Name taken from the header row, or null. */
  struct xml_value firstval;  /* Cell from the first row of data. */
};
 276
 277
 278 /* Determine the width that a xmv should probably have */
 279 static int
 280 xmv_to_width (const struct xml_value *xmv, int fallback)
 281 {
 282   int width = SPREADSHEET_DEFAULT_WIDTH;
 283
 284   /* Non-strings always have zero width */
 285   if (xmv->type != NULL && 0 != xmlStrcmp (xmv->type, _xml("string")))
 286     return 0;
 287
 288   if ( fallback != -1)
 289     return fallback;
 290
 291   if ( xmv->value )
 292     width = ROUND_UP (xmlStrlen (xmv->value),
 293                       SPREADSHEET_DEFAULT_WIDTH);
 294   else if ( xmv->text)
 295     width = ROUND_UP (xmlStrlen (xmv->text),
 296                       SPREADSHEET_DEFAULT_WIDTH);
 297
 298   return width;
 299 }
 300
 301 /*
 302    Sets the VAR of case C, to the value corresponding to the xml data
 303  */
 304 static void
 305 convert_xml_to_value (struct ccase *c, const struct variable *var,
 306                       const struct xml_value *xmv)
 307 {
 308   union value *v = case_data_rw (c, var);
 309
 310   if (xmv->value == NULL && xmv->text == NULL)
 311     value_set_missing (v, var_get_width (var));
 312   else if ( var_is_alpha (var))
 313     /* Use the text field, because it seems that there is no
 314        value field for strings */
 315     value_copy_str_rpad (v, var_get_width (var), xmv->text, ' ');
 316   else
 317     {
 318       const struct fmt_spec *fmt = var_get_write_format (var);
 319       enum fmt_category fc  = fmt_get_category (fmt->type);
 320
 321       assert ( fc != FMT_CAT_STRING);
 322
 323       const char *text = xmv->value ? CHAR_CAST (const char *, xmv->value):
 324         CHAR_CAST (const char *, xmv->text);
 325
 326       data_in (ss_cstr (text), "UTF-8",
 327                fmt->type,
 328                v,
 329                var_get_width (var),
 330                "UTF-8");
 331     }
 332 }
 333
 334
 335 struct casereader *
 336 ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict)
 337 {
 338   int ret = 0;
 339   xmlChar *type = NULL;
 340   unsigned long int vstart = 0;
 341   casenumber n_cases = CASENUMBER_MAX;
 342   int i;
 343   struct var_spec *var_spec = NULL;
 344   int n_var_specs = 0;
 345
 346   struct ods_reader *r = xzalloc (sizeof *r);
 347
 348   r->read_names = gri->read_names;
 349   ds_init_empty (&r->ods_errs);
 350
 351   struct zip_reader *zreader = zip_reader_create (gri->file_name, &r->ods_errs);
 352   struct zip_member *content = NULL;
 353
 354   if ( NULL == zreader)
 355     {
 356       msg (ME, _("Error opening `%s' for reading as a OpenDocument spreadsheet file: %s."),
 357            gri->file_name, ds_cstr (&r->ods_errs));
 358
 359       goto error;
 360     }
 361
 362   content = zip_member_open (zreader, "content.xml");
 363   if ( NULL == content)
 364     {
 365       msg (ME, _("Could not extract OpenDocument spreadsheet from file `%s': %s."),
 366            gri->file_name, ds_cstr (&r->ods_errs));
 367
 368       goto error;
 369     }
 370
 371   zip_member_ref (content);
 372
 373   r->xtr = xmlReaderForIO ((xmlInputReadCallback) zip_member_read,
 374                            (xmlInputCloseCallback) zip_member_finish,
 375                            content,   NULL, NULL, XML_PARSE_RECOVER);
 376
 377   if ( r->xtr == NULL)
 378     {
 379       goto error;
 380     }
 381
 382   if ( gri->cell_range )
 383     {
 384       if ( ! convert_cell_ref (gri->cell_range,
 385                                &r->start_col, &r->start_row,
 386                                &r->stop_col, &r->stop_row))
 387         {
 388           msg (SE, _("Invalid cell range `%s'"),
 389                gri->cell_range);
 390           goto error;
 391         }
 392     }
 393   else
 394     {
 395       r->start_col = 0;
 396       r->start_row = 0;
 397       r->stop_col = -1;
 398       r->stop_row = -1;
 399     }
 400
 401   r->state = STATE_INIT;
 402   r->target_sheet = BAD_CAST gri->sheet_name;
 403   r->target_sheet_index = gri->sheet_index;
 404   r->row = r->col = -1;
 405   r->sheet_index = 0;
 406
 407
 408   /* If CELLRANGE was given, then we know how many variables should be read */
 409   if ( r->stop_col != -1 )
 410     {
 411       n_var_specs =  r->stop_col - r->start_col + 1;
 412       var_spec = xrealloc (var_spec, sizeof (*var_spec) * n_var_specs);
 413     }
 414
 415
 416   /* Advance to the start of the cells for the target sheet */
 417   while ( (r->row < r->start_row ))
 418     {
 419       if (1 != (ret = xmlTextReaderRead (r->xtr)))
 420            break;
 421
 422       process_node (r);
 423     }
 424
 425   if (ret < 1)
 426     {
 427       msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
 428            gri->file_name);
 429       goto error;
 430     }
 431
 432   if ( gri->read_names)
 433     {
 434       while (1 == (ret = xmlTextReaderRead (r->xtr)))
 435         {
 436           int idx;
 437           process_node (r);
 438           if ( r->row > r->start_row)
 439             break;
 440
 441           if (r->col == -1 && r->row == r->start_row)
 442             break;
 443
 444           if ( r->col < r->start_col)
 445             continue;
 446
 447           idx = r->col - r->start_col;
 448
 449           if (r->state == STATE_CELL_CONTENT
 450               &&
 451               XML_READER_TYPE_TEXT  == r->node_type)
 452             {
 453               xmlChar *value = xmlTextReaderValue (r->xtr);
 454               if ( idx >= n_var_specs)
 455                 {
 456
 457                   var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
 458
 459                   /* xrealloc (unlike realloc) doesn't initialise its memory to 0 */
 460                   memset (var_spec + n_var_specs * sizeof (*var_spec),
 461                           0,
 462                           (n_var_specs - idx + 1) * sizeof (*var_spec));
 463                   n_var_specs = idx + 1;
 464                 }
 465               var_spec[idx].firstval.text = 0;
 466               var_spec[idx].firstval.value = 0;
 467               var_spec[idx].firstval.type = 0;
 468
 469               var_spec [idx].name = strdup (CHAR_CAST (const char *, value));
 470               free (value);
 471               value = NULL;
 472             }
 473         }
 474     }
 475
 476   xmlChar *val_string = NULL;
 477   /* Read in the first row of data */
 478   while (1 == xmlTextReaderRead (r->xtr))
 479     {
 480       int idx;
 481       process_node (r);
 482       if ( r->row >= r->start_row + 1 + gri->read_names)
 483         break;
 484
 485       if ( r->col < r->start_col)
 486         continue;
 487
 488       if ( r->col - r->start_col + 1 > n_var_specs)
 489         continue;
 490
 491       idx = r->col - r->start_col;
 492
 493       if ( r->state == STATE_CELL &&
 494            XML_READER_TYPE_ELEMENT  == r->node_type)
 495         {
 496           type = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value-type"));
 497           val_string = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value"));
 498         }
 499
 500       if ( r->state == STATE_CELL_CONTENT &&
 501            XML_READER_TYPE_TEXT  == r->node_type)
 502         {
 503           var_spec [idx].firstval.type = type;
 504           var_spec [idx].firstval.text = xmlTextReaderValue (r->xtr);
 505           var_spec [idx].firstval.value = val_string;
 506           val_string = NULL;
 507           type = NULL;
 508         }
 509     }
 510
 511   /* Create the dictionary and populate it */
 512   *dict = r->dict = dict_create (
 513     CHAR_CAST (const char *, xmlTextReaderConstEncoding (r->xtr)));
 514
 515   for (i = 0 ; i < n_var_specs ; ++i )
 516     {
 517       struct fmt_spec fmt;
 518       struct variable *var = NULL;
 519       char *name = dict_make_unique_var_name (r->dict, var_spec[i].name, &vstart);
 520       int width  = xmv_to_width (&var_spec[i].firstval, gri->asw);
 521       dict_create_var (r->dict, name, width);
 522       free (name);
 523
 524       var = dict_get_var (r->dict, i);
 525
 526       if ( 0 == xmlStrcmp (var_spec[i].firstval.type, _xml("date")))
 527         {
 528           fmt.type = FMT_DATE;
 529           fmt.d = 0;
 530           fmt.w = 20;
 531         }
 532       else
 533         fmt = fmt_default_for_width (width);
 534
 535       var_set_both_formats (var, &fmt);
 536     }
 537
 538   /* Create the first case, and cache it */
 539   r->used_first_case = false;
 540
 541   if ( n_var_specs ==  0 )
 542     {
 543       msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
 544            gri->file_name);
 545       goto error;
 546     }
 547
 548   r->proto = caseproto_ref (dict_get_proto (r->dict));
 549   r->first_case = case_create (r->proto);
 550   case_set_missing (r->first_case);
 551
 552   for ( i = 0 ; i < n_var_specs ; ++i )
 553     {
 554       const struct variable *var = dict_get_var (r->dict, i);
 555
 556       convert_xml_to_value (r->first_case, var,  &var_spec[i].firstval);
 557     }
 558
 559   zip_reader_destroy (zreader);
 560
 561   for ( i = 0 ; i < n_var_specs ; ++i )
 562     {
 563       free (var_spec[i].firstval.type);
 564       free (var_spec[i].firstval.value);
 565       free (var_spec[i].firstval.text);
 566       free (var_spec[i].name);
 567     }
 568
 569   free (var_spec);
 570
 571   return casereader_create_sequential
 572     (NULL,
 573      r->proto,
 574      n_cases,
 575      &ods_file_casereader_class, r);
 576
 577  error:
 578
 579   zip_reader_destroy (zreader);
 580
 581   for ( i = 0 ; i < n_var_specs ; ++i )
 582     {
 583       free (var_spec[i].firstval.type);
 584       free (var_spec[i].firstval.value);
 585       free (var_spec[i].firstval.text);
 586       free (var_spec[i].name);
 587     }
 588
 589   free (var_spec);
 590
 591   return NULL;
 592 }
 593
 594
 595 /* Reads and returns one case from READER's file.  Returns a null
 596    pointer on failure. */
 597 static struct ccase *
 598 ods_file_casereader_read (struct casereader *reader UNUSED, void *r_)
 599 {
 600   struct ccase *c = NULL;
 601   xmlChar *val_string = NULL;
 602   struct ods_reader *r = r_;
 603   int current_row = r->row;
 604
 605   if ( r->row == -1)
 606     return NULL;
 607
 608   if ( !r->used_first_case )
 609     {
 610       r->used_first_case = true;
 611       return r->first_case;
 612     }
 613
 614
 615   if ( r->state > STATE_INIT)
 616     {
 617       c = case_create (r->proto);
 618       case_set_missing (c);
 619     }
 620
 621   while (1 == xmlTextReaderRead (r->xtr))
 622     {
 623       process_node (r);
 624       if ( r->row > current_row)
 625         {
 626           break;
 627         }
 628       if ( r->col < r->start_col || (r->stop_col != -1 && r->col > r->stop_col))
 629         {
 630           continue;
 631         }
 632       if ( r->col - r->start_col >= caseproto_get_n_widths (r->proto))
 633         {
 634           continue;
 635         }
 636       if ( r->stop_row != -1 && r->row > r->stop_row)
 637         {
 638           continue;
 639         }
 640       if ( r->state == STATE_CELL &&
 641            r->node_type == XML_READER_TYPE_ELEMENT )
 642         {
 643           val_string = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value"));
 644         }
 645
 646       if ( r->state == STATE_CELL_CONTENT && r->node_type == XML_READER_TYPE_TEXT )
 647         {
 648           int col;
 649           struct xml_value *xmv = xzalloc (sizeof *xmv);
 650           xmv->text = xmlTextReaderValue (r->xtr);
 651           xmv->value = val_string;
 652           val_string = NULL;
 653
 654           for (col = 0; col < r->span ; ++col)
 655             {
 656               const int idx = r->col + col - r->start_col;
 657
 658               const struct variable *var = dict_get_var (r->dict, idx);
 659
 660               convert_xml_to_value (c, var, xmv);
 661             }
 662           free (xmv->text);
 663           free (xmv->value);
 664           free (xmv);
 665         }
 666
 667       if ( r->state < STATE_TABLE)
 668         break;
 669     }
 670
 671   if (NULL == c || (r->stop_row != -1 && r->row > r->stop_row + 1))
 672     {
 673       case_unref (c);
 674       return NULL;
 675     }
 676   else
 677     {
 678       return c;
 679     }
 680 }
 681 #endif