pintos-os.org Git - pspp/blob - src/data/ods-reader.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/message.h"
  20 #include "libpspp/misc.h"
  21
  22 #include "data/data-in.h"
  23
  24 #include "gl/minmax.h"
  25
  26 #include "gettext.h"
  27 #define _(msgid) gettext (msgid)
  28 #define N_(msgid) (msgid)
  29
  30 #include "ods-reader.h"
  31 #include "spreadsheet-reader.h"
  32
  33 #if !ODF_READ_SUPPORT
  34
  35 struct casereader *
  36 ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict)
  37 {
  38   msg (ME, _("Support for %s files was not compiled into this installation of PSPP"), "OpenDocument");
  39
  40   return NULL;
  41 }
  42
  43 #else
  44
  45 #include "libpspp/zip-reader.h"
  46
  47
  48 #include <assert.h>
  49 #include <stdbool.h>
  50 #include <errno.h>
  51 #include <libxml/xmlreader.h>
  52 #include <zlib.h>
  53
  54 #include "data/format.h"
  55 #include "data/case.h"
  56 #include "data/casereader-provider.h"
  57 #include "data/dictionary.h"
  58 #include "data/identifier.h"
  59 #include "data/value.h"
  60 #include "data/variable.h"
  61 #include "libpspp/i18n.h"
  62 #include "libpspp/str.h"
  63
  64 #include "gl/xalloc.h"
  65
  66 static void ods_file_casereader_destroy (struct casereader *, void *);
  67
  68 static struct ccase *ods_file_casereader_read (struct casereader *, void *);
  69
  70 static const struct casereader_class ods_file_casereader_class =
  71   {
  72     ods_file_casereader_read,
  73     ods_file_casereader_destroy,
  74     NULL,
  75     NULL,
  76   };
  77
  78 enum reader_state
  79   {
  80     STATE_INIT = 0,        /* Initial state */
  81     STATE_SPREADSHEET,     /* Found the start of the spreadsheet doc */
  82     STATE_TABLE,           /* Found the sheet that we actually want */
  83     STATE_ROW,             /* Found the start of the cell array */
  84     STATE_CELL,            /* Found a cell */
  85     STATE_CELL_CONTENT     /* Found a the text within a cell */
  86   };
  87
  88 struct ods_reader
  89 {
  90   xmlTextReaderPtr xtr;
  91
  92   enum reader_state state;
  93   bool sheet_found;
  94   int row;
  95   int col;
  96   int node_type;
  97   int sheet_index;
  98
  99   const xmlChar *target_sheet;
 100   int target_sheet_index;
 101
 102   int start_row;
 103   int start_col;
 104   int stop_row;
 105   int stop_col;
 106
 107   struct caseproto *proto;
 108   struct dictionary *dict;
 109   struct ccase *first_case;
 110   bool used_first_case;
 111   bool read_names;
 112
 113   struct string ods_errs;
 114   int span;
 115 };
 116
 117 static void process_node (struct ods_reader *r);
 118
 119 static void
 120 ods_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
 121 {
 122   struct ods_reader *r = r_;
 123   if ( r == NULL)
 124     return ;
 125
 126   if (r->xtr)
 127     xmlFreeTextReader (r->xtr);
 128
 129   if ( ! ds_is_empty (&r->ods_errs))
 130     msg (ME, "%s", ds_cstr (&r->ods_errs));
 131
 132   ds_destroy (&r->ods_errs);
 133
 134   if ( ! r->used_first_case )
 135     case_unref (r->first_case);
 136
 137   caseproto_unref (r->proto);
 138
 139   free (r);
 140 }
 141
 142 static void
 143 process_node (struct ods_reader *r)
 144 {
 145   xmlChar *name = xmlTextReaderName (r->xtr);
 146   if (name == NULL)
 147     name = xmlStrdup (_xml ("--"));
 148
 149   r->node_type = xmlTextReaderNodeType (r->xtr);
 150
 151   switch ( r->state)
 152     {
 153     case STATE_INIT:
 154       if (0 == xmlStrcasecmp (name, _xml("office:spreadsheet")) &&
 155           XML_READER_TYPE_ELEMENT  == r->node_type)
 156         {
 157           r->state = STATE_SPREADSHEET;
 158         }
 159       break;
 160     case STATE_SPREADSHEET:
 161       if (0 == xmlStrcasecmp (name, _xml("table:table")))
 162         {
 163           if (XML_READER_TYPE_ELEMENT == r->node_type)
 164             {
 165               r->col = -1;
 166               r->row = -1;
 167               ++r->sheet_index;
 168               if ( r->target_sheet != NULL)
 169                 {
 170                   xmlChar *value = xmlTextReaderGetAttribute (r->xtr, _xml ("table:name"));
 171                   if ( 0 == xmlStrcmp (value, r->target_sheet))
 172                     {
 173                       r->sheet_found = true;
 174                       r->state = STATE_TABLE;
 175                     }
 176                   free (value);
 177                 }
 178               else if (r->target_sheet_index == r->sheet_index)
 179                 {
 180                   r->sheet_found = true;
 181                   r->state = STATE_TABLE;
 182                 }
 183               else if ( r->target_sheet_index == -1)
 184                 r->state = STATE_TABLE;
 185             }
 186         }
 187       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type
 188                    && r->sheet_found)
 189         {
 190           r->state = STATE_INIT;
 191         }
 192         break;
 193     case STATE_TABLE:
 194       if (0 == xmlStrcasecmp (name, _xml("table:table-row")) )
 195         {
 196           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 197             {
 198               if (! xmlTextReaderIsEmptyElement (r->xtr))
 199                 {
 200                   r->state = STATE_ROW;
 201                 }
 202               r->row++;
 203               r->span = 1;
 204             }
 205         }
 206       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 207         {
 208           r->state = STATE_SPREADSHEET;
 209         }
 210       break;
 211     case STATE_ROW:
 212       if (0 == xmlStrcasecmp (name, _xml ("table:table-cell")))
 213         {
 214           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 215             {
 216               xmlChar *value =
 217                 xmlTextReaderGetAttribute (r->xtr,
 218                                            _xml ("table:number-columns-repeated"));
 219               r->col += r->span;
 220               r->span = value ? _xmlchar_to_int (value) : 1;
 221               free (value);
 222               if (! xmlTextReaderIsEmptyElement (r->xtr))
 223                 {
 224                   r->state = STATE_CELL;
 225                 }
 226             }
 227         }
 228       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 229         {
 230           r->state = STATE_TABLE;
 231           r->col = -1;
 232           /* Set the span back to the default */
 233           r->span = 1;
 234         }
 235       break;
 236     case STATE_CELL:
 237       if (0 == xmlStrcasecmp (name, _xml("text:p")))
 238         {
 239           if ( XML_READER_TYPE_ELEMENT  == r->node_type)
 240             {
 241               r->state = STATE_CELL_CONTENT;
 242             }
 243         }
 244       else if (XML_READER_TYPE_END_ELEMENT  == r->node_type)
 245         {
 246           r->state = STATE_ROW;
 247         }
 248       break;
 249     case STATE_CELL_CONTENT:
 250       if (XML_READER_TYPE_TEXT != r->node_type)
 251         r->state = STATE_CELL;
 252       break;
 253     default:
 254       break;
 255     };
 256
 257   xmlFree (name);
 258 }
 259
 260 /*
 261    A struct containing the parameters of a cell's value
 262    parsed from the xml
 263 */
 264 struct xml_value
 265 {
 266   xmlChar *type;
 267   xmlChar *value;
 268   xmlChar *text;
 269 };
 270
 271 struct var_spec
 272 {
 273   char *name;
 274   struct xml_value firstval;
 275 };
 276
 277
 278 /* Determine the width that a xmv should probably have */
 279 static int
 280 xmv_to_width (const struct xml_value *xmv, int fallback)
 281 {
 282   int width = SPREADSHEET_DEFAULT_WIDTH;
 283
 284   /* Non-strings always have zero width */
 285   if (xmv->type != NULL && 0 != xmlStrcmp (xmv->type, _xml("string")))
 286     return 0;
 287
 288   if ( fallback != -1)
 289     return fallback;
 290
 291   if ( xmv->value )
 292     width = ROUND_UP (xmlStrlen (xmv->value),
 293                       SPREADSHEET_DEFAULT_WIDTH);
 294   else if ( xmv->text)
 295     width = ROUND_UP (xmlStrlen (xmv->text),
 296                       SPREADSHEET_DEFAULT_WIDTH);
 297
 298   return width;
 299 }
 300
 301 /*
 302    Sets the VAR of case C, to the value corresponding to the xml data
 303  */
 304 static void
 305 convert_xml_to_value (struct ccase *c, const struct variable *var,
 306                       const struct xml_value *xmv)
 307 {
 308   union value *v = case_data_rw (c, var);
 309
 310   if (xmv->value == NULL && xmv->text == NULL)
 311     value_set_missing (v, var_get_width (var));
 312   else if ( var_is_alpha (var))
 313     /* Use the text field, because it seems that there is no
 314        value field for strings */
 315     value_copy_str_rpad (v, var_get_width (var), xmv->text, ' ');
 316   else
 317     {
 318       const char *text ;
 319       const struct fmt_spec *fmt = var_get_write_format (var);
 320       enum fmt_category fc  = fmt_get_category (fmt->type);
 321
 322       assert ( fc != FMT_CAT_STRING);
 323
 324       text =
 325         xmv->value ? CHAR_CAST (const char *, xmv->value) : CHAR_CAST (const char *, xmv->text);
 326
 327       data_in (ss_cstr (text), "UTF-8",
 328                fmt->type,
 329                v,
 330                var_get_width (var),
 331                "UTF-8");
 332     }
 333 }
 334
 335
 336 struct casereader *
 337 ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict)
 338 {
 339   int ret = 0;
 340   xmlChar *type = NULL;
 341   unsigned long int vstart = 0;
 342   casenumber n_cases = CASENUMBER_MAX;
 343   int i;
 344   struct var_spec *var_spec = NULL;
 345   int n_var_specs = 0;
 346
 347   struct ods_reader *r = xzalloc (sizeof *r);
 348   struct zip_member *content = NULL;
 349   struct zip_reader *zreader ;
 350   xmlChar *val_string = NULL;
 351
 352   r->read_names = gri->read_names;
 353   ds_init_empty (&r->ods_errs);
 354
 355   zreader = zip_reader_create (gri->file_name, &r->ods_errs);
 356
 357   if ( NULL == zreader)
 358     {
 359       msg (ME, _("Error opening `%s' for reading as a OpenDocument spreadsheet file: %s."),
 360            gri->file_name, ds_cstr (&r->ods_errs));
 361
 362       goto error;
 363     }
 364
 365   content = zip_member_open (zreader, "content.xml");
 366   if ( NULL == content)
 367     {
 368       msg (ME, _("Could not extract OpenDocument spreadsheet from file `%s': %s."),
 369            gri->file_name, ds_cstr (&r->ods_errs));
 370
 371       goto error;
 372     }
 373
 374   zip_member_ref (content);
 375
 376   r->xtr = xmlReaderForIO ((xmlInputReadCallback) zip_member_read,
 377                            (xmlInputCloseCallback) zip_member_finish,
 378                            content,   NULL, NULL, XML_PARSE_RECOVER);
 379
 380   if ( r->xtr == NULL)
 381     {
 382       goto error;
 383     }
 384
 385   if ( gri->cell_range )
 386     {
 387       if ( ! convert_cell_ref (gri->cell_range,
 388                                &r->start_col, &r->start_row,
 389                                &r->stop_col, &r->stop_row))
 390         {
 391           msg (SE, _("Invalid cell range `%s'"),
 392                gri->cell_range);
 393           goto error;
 394         }
 395     }
 396   else
 397     {
 398       r->start_col = 0;
 399       r->start_row = 0;
 400       r->stop_col = -1;
 401       r->stop_row = -1;
 402     }
 403
 404   r->state = STATE_INIT;
 405   r->target_sheet = BAD_CAST gri->sheet_name;
 406   r->target_sheet_index = gri->sheet_index;
 407   r->row = r->col = -1;
 408   r->sheet_index = 0;
 409
 410
 411   /* If CELLRANGE was given, then we know how many variables should be read */
 412   if ( r->stop_col != -1 )
 413     {
 414       n_var_specs =  r->stop_col - r->start_col + 1;
 415       var_spec = xrealloc (var_spec, sizeof (*var_spec) * n_var_specs);
 416     }
 417
 418
 419   /* Advance to the start of the cells for the target sheet */
 420   while ( (r->row < r->start_row ))
 421     {
 422       if (1 != (ret = xmlTextReaderRead (r->xtr)))
 423            break;
 424
 425       process_node (r);
 426     }
 427
 428   if (ret < 1)
 429     {
 430       msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
 431            gri->file_name);
 432       goto error;
 433     }
 434
 435   if ( gri->read_names)
 436     {
 437       while (1 == (ret = xmlTextReaderRead (r->xtr)))
 438         {
 439           int idx;
 440           process_node (r);
 441           if ( r->row > r->start_row)
 442             break;
 443
 444           if (r->col == -1 && r->row == r->start_row)
 445             break;
 446
 447           if ( r->col < r->start_col)
 448             continue;
 449
 450           idx = r->col - r->start_col;
 451
 452           if (r->state == STATE_CELL_CONTENT
 453               &&
 454               XML_READER_TYPE_TEXT  == r->node_type)
 455             {
 456               xmlChar *value = xmlTextReaderValue (r->xtr);
 457               if ( idx >= n_var_specs)
 458                 {
 459
 460                   var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
 461
 462                   /* xrealloc (unlike realloc) doesn't initialise its memory to 0 */
 463                   memset (var_spec + n_var_specs * sizeof (*var_spec),
 464                           0,
 465                           (n_var_specs - idx + 1) * sizeof (*var_spec));
 466                   n_var_specs = idx + 1;
 467                 }
 468               var_spec[idx].firstval.text = 0;
 469               var_spec[idx].firstval.value = 0;
 470               var_spec[idx].firstval.type = 0;
 471
 472               var_spec [idx].name = strdup (CHAR_CAST (const char *, value));
 473               free (value);
 474               value = NULL;
 475             }
 476         }
 477     }
 478
 479   /* Read in the first row of data */
 480   while (1 == xmlTextReaderRead (r->xtr))
 481     {
 482       int idx;
 483       process_node (r);
 484       if ( r->row >= r->start_row + 1 + gri->read_names)
 485         break;
 486
 487       if ( r->col < r->start_col)
 488         continue;
 489
 490       if ( r->col - r->start_col + 1 > n_var_specs)
 491         continue;
 492
 493       idx = r->col - r->start_col;
 494
 495       if ( r->state == STATE_CELL &&
 496            XML_READER_TYPE_ELEMENT  == r->node_type)
 497         {
 498           type = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value-type"));
 499           val_string = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value"));
 500         }
 501
 502       if ( r->state == STATE_CELL_CONTENT &&
 503            XML_READER_TYPE_TEXT  == r->node_type)
 504         {
 505           var_spec [idx].firstval.type = type;
 506           var_spec [idx].firstval.text = xmlTextReaderValue (r->xtr);
 507           var_spec [idx].firstval.value = val_string;
 508           val_string = NULL;
 509           type = NULL;
 510         }
 511     }
 512
 513   /* Create the dictionary and populate it */
 514   *dict = r->dict = dict_create (
 515     CHAR_CAST (const char *, xmlTextReaderConstEncoding (r->xtr)));
 516
 517   for (i = 0 ; i < n_var_specs ; ++i )
 518     {
 519       struct fmt_spec fmt;
 520       struct variable *var = NULL;
 521       char *name = dict_make_unique_var_name (r->dict, var_spec[i].name, &vstart);
 522       int width  = xmv_to_width (&var_spec[i].firstval, gri->asw);
 523       dict_create_var (r->dict, name, width);
 524       free (name);
 525
 526       var = dict_get_var (r->dict, i);
 527
 528       if ( 0 == xmlStrcmp (var_spec[i].firstval.type, _xml("date")))
 529         {
 530           fmt.type = FMT_DATE;
 531           fmt.d = 0;
 532           fmt.w = 20;
 533         }
 534       else
 535         fmt = fmt_default_for_width (width);
 536
 537       var_set_both_formats (var, &fmt);
 538     }
 539
 540   /* Create the first case, and cache it */
 541   r->used_first_case = false;
 542
 543   if ( n_var_specs ==  0 )
 544     {
 545       msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
 546            gri->file_name);
 547       goto error;
 548     }
 549
 550   r->proto = caseproto_ref (dict_get_proto (r->dict));
 551   r->first_case = case_create (r->proto);
 552   case_set_missing (r->first_case);
 553
 554   for ( i = 0 ; i < n_var_specs ; ++i )
 555     {
 556       const struct variable *var = dict_get_var (r->dict, i);
 557
 558       convert_xml_to_value (r->first_case, var,  &var_spec[i].firstval);
 559     }
 560
 561   zip_reader_destroy (zreader);
 562
 563   for ( i = 0 ; i < n_var_specs ; ++i )
 564     {
 565       free (var_spec[i].firstval.type);
 566       free (var_spec[i].firstval.value);
 567       free (var_spec[i].firstval.text);
 568       free (var_spec[i].name);
 569     }
 570
 571   free (var_spec);
 572
 573   return casereader_create_sequential
 574     (NULL,
 575      r->proto,
 576      n_cases,
 577      &ods_file_casereader_class, r);
 578
 579  error:
 580
 581   zip_reader_destroy (zreader);
 582
 583   for ( i = 0 ; i < n_var_specs ; ++i )
 584     {
 585       free (var_spec[i].firstval.type);
 586       free (var_spec[i].firstval.value);
 587       free (var_spec[i].firstval.text);
 588       free (var_spec[i].name);
 589     }
 590
 591   free (var_spec);
 592
 593   return NULL;
 594 }
 595
 596
 597 /* Reads and returns one case from READER's file.  Returns a null
 598    pointer on failure. */
 599 static struct ccase *
 600 ods_file_casereader_read (struct casereader *reader UNUSED, void *r_)
 601 {
 602   struct ccase *c = NULL;
 603   xmlChar *val_string = NULL;
 604   struct ods_reader *r = r_;
 605   int current_row = r->row;
 606
 607   if ( r->row == -1)
 608     return NULL;
 609
 610   if ( !r->used_first_case )
 611     {
 612       r->used_first_case = true;
 613       return r->first_case;
 614     }
 615
 616
 617   if ( r->state > STATE_INIT)
 618     {
 619       c = case_create (r->proto);
 620       case_set_missing (c);
 621     }
 622
 623   while (1 == xmlTextReaderRead (r->xtr))
 624     {
 625       process_node (r);
 626       if ( r->row > current_row)
 627         {
 628           break;
 629         }
 630       if ( r->col < r->start_col || (r->stop_col != -1 && r->col > r->stop_col))
 631         {
 632           continue;
 633         }
 634       if ( r->col - r->start_col >= caseproto_get_n_widths (r->proto))
 635         {
 636           continue;
 637         }
 638       if ( r->stop_row != -1 && r->row > r->stop_row)
 639         {
 640           continue;
 641         }
 642       if ( r->state == STATE_CELL &&
 643            r->node_type == XML_READER_TYPE_ELEMENT )
 644         {
 645           val_string = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value"));
 646         }
 647
 648       if ( r->state == STATE_CELL_CONTENT && r->node_type == XML_READER_TYPE_TEXT )
 649         {
 650           int col;
 651           struct xml_value *xmv = xzalloc (sizeof *xmv);
 652           xmv->text = xmlTextReaderValue (r->xtr);
 653           xmv->value = val_string;
 654           val_string = NULL;
 655
 656           for (col = 0; col < r->span ; ++col)
 657             {
 658               const int idx = r->col + col - r->start_col;
 659
 660               const struct variable *var = dict_get_var (r->dict, idx);
 661
 662               convert_xml_to_value (c, var, xmv);
 663             }
 664           free (xmv->text);
 665           free (xmv->value);
 666           free (xmv);
 667         }
 668
 669       if ( r->state < STATE_TABLE)
 670         break;
 671     }
 672
 673   if (NULL == c || (r->stop_row != -1 && r->row > r->stop_row + 1))
 674     {
 675       case_unref (c);
 676       return NULL;
 677     }
 678   else
 679     {
 680       return c;
 681     }
 682 }
 683 #endif