1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/message.h"
20 #include "libpspp/misc.h"
22 #include "gl/minmax.h"
23 #include "gl/c-strtod.h"
26 #define _(msgid) gettext (msgid)
27 #define N_(msgid) (msgid)
29 #include "spreadsheet-reader.h"
/* Fallback entry points, apparently for builds without Gnumeric support: the
   one visible body line reports through msg (ME, ...) that support for
   Gnumeric files was not compiled into this installation of PSPP.
   NOTE(review): the guarding preprocessor conditional, the return types and
   the remaining body lines are not visible in this excerpt -- confirm there.  */
34 gnumeric_probe (const char *filename, bool report_errors)
37 msg (ME, _("Support for %s files was not compiled into this installation of PSPP"), "Gnumeric");
/* Fallback counterpart of the sheet-name accessor (body not visible here).  */
43 gnumeric_get_sheet_name (struct spreadsheet *s, int n)
/* Fallback counterpart of the sheet-range accessor (body not visible here).  */
49 gnumeric_get_sheet_range (struct spreadsheet *s, int n)
/* Fallback counterpart of the casereader factory (body not visible here).  */
55 gnumeric_make_reader (struct spreadsheet *spreadsheet,
56 const struct spreadsheet_read_options *opts)
/* Fallback counterpart of the reference-dropping function (body not visible here).  */
62 gnumeric_unref (struct spreadsheet *r)
69 #include "data/gnumeric-reader.h"
74 #include <libxml/xmlreader.h>
77 #include "data/format.h"
78 #include "data/data-in.h"
79 #include "data/case.h"
80 #include "data/casereader-provider.h"
81 #include "data/dictionary.h"
82 #include "data/identifier.h"
83 #include "data/value.h"
84 #include "data/variable.h"
85 #include "libpspp/i18n.h"
86 #include "libpspp/str.h"
88 #include "gl/xalloc.h"
91 /* Shamelessly lifted from the Gnumeric sources:
92 https://git.gnome.org/browse/gnumeric/tree/src/value.h
98 VALUE_INTEGER = 30, /* Note: Gnumeric removed this type in 2006, but old versions may of
99 course still be around.  New ones are supposed to use float instead. */
103 VALUE_CELLRANGE = 70,
/* Forward declarations of the casereader callbacks defined further down.  */
109 static void gnm_file_casereader_destroy (struct casereader *, void *);
111 static struct ccase *gnm_file_casereader_read (struct casereader *, void *);
/* Callback table that gnumeric_make_reader hands to
   casereader_create_sequential.  */
114 static const struct casereader_class gnm_file_casereader_class =
116 gnm_file_casereader_read,
117 gnm_file_casereader_destroy,
/* States of the parsing state machine that process_node advances.  */
124 STATE_PRE_INIT = 0, /* Initial state: the sheet index has not been seen yet */
125 STATE_SHEET_COUNT, /* Found the sheet index (gnm:SheetNameIndex) */
126 STATE_INIT , /* Outside any sheet: the sheet index, or the previous sheet, has ended */
127 STATE_SHEET_START, /* Found the start of a sheet (gnm:Sheet) */
128 STATE_SHEET_NAME, /* Found the sheet name (gnm:Name) */
131 STATE_SHEET_FOUND, /* Found the sheet that we actually want */
132 STATE_CELLS_START, /* Found the start of the cell array (gnm:Cells) */
133 STATE_CELL /* Found a cell (gnm:Cell) */
138 /* The name of the sheet (utf8 encoding) */
152 /* The libxml text reader from which this parsing pass reads its nodes */
153 xmlTextReaderPtr xtr;
155 /* Current state of the parsing state machine; updated by process_node */
156 enum reader_state state;
/* Frees the libxml text reader held in SD.
   NOTE(review): any further cleanup is on lines not visible in this excerpt.  */
169 state_data_destroy (struct state_data *sd)
171 xmlFreeTextReader (sd->xtr);
/* A Gnumeric file opened for reading, together with the state of the parsing
   passes made over it and the data prepared for the casereader.  */
175 struct gnumeric_reader
/* Generic spreadsheet part.  The (struct gnumeric_reader *) casts applied to
   struct spreadsheet pointers in this file presumably rely on this being the
   first member -- confirm against the full declaration.  */
177 struct spreadsheet spreadsheet;
/* Parsing pass used for reading cases (gnumeric_make_reader and
   gnm_file_casereader_read).  */
179 struct state_data rsd;
/* Parsing pass used for gathering sheet details (gnumeric_get_sheet_range).  */
180 struct state_data msd;
/* Details of the sheets discovered so far; process_node grows this array.  */
187 struct sheet_detail *sheets;
/* Name and index of the sheet to read.  gnumeric_reopen initialises them to
   NULL and -1; gnumeric_make_reader copies them from the read options.  */
189 const xmlChar *target_sheet;
190 int target_sheet_index;
/* Case prototype and dictionary built by gnumeric_make_reader.  */
192 struct caseproto *proto;
193 struct dictionary *dict;
/* First case, built ahead of time by gnumeric_make_reader, and whether
   gnm_file_casereader_read has already handed it out.  */
194 struct ccase *first_case;
195 bool used_first_case;
/* Value of the ValueType attribute most recently read from a cell.  */
197 enum gnm_value_type vtype;
/* Drops one reference to S.  When the reference count reaches zero, the
   visible cleanup frees every sheet name, the MSD parsing state and the
   dictionary.
   NOTE(review): further cleanup may sit on lines not visible in this excerpt.  */
202 gnumeric_unref (struct spreadsheet *s)
204 struct gnumeric_reader *r = (struct gnumeric_reader *) s;
206 if (0 == --s->ref_cnt)
210 for (i = 0; i < s->n_sheets; ++i)
/* Sheet names are obtained from xmlTextReaderValue in process_node, hence
   xmlFree rather than free.  */
212 xmlFree (r->sheets[i].name);
217 state_data_destroy (&r->msd);
219 dict_destroy (r->dict);
/* Returns the name recorded for sheet number N of S.  N must be less than
   the number of sheets (asserted); the returned string is the reader's own
   copy, not a duplicate.  */
229 gnumeric_get_sheet_name (struct spreadsheet *s, int n)
231 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
232 assert (n < s->n_sheets);
234 return gr->sheets[n].name;
/* Forward declaration: gnumeric_get_sheet_range calls process_node before its
   definition.  */
238 static void process_node (struct gnumeric_reader *r, struct state_data *sd);
/* Returns the cell range of sheet number N of S, as built by
   create_cell_range from the sheet's recorded start/stop column and row.
   N must be less than the number of sheets (asserted).  */
243 gnumeric_get_sheet_range (struct spreadsheet *s, int n)
246 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
248 assert (n < s->n_sheets);
/* The two conditions below read as a loop that keeps feeding nodes from the
   MSD reader to process_node while the sheet's stop column is still unknown
   (-1) and the reader delivers nodes.
   NOTE(review): the loop keyword itself is not visible in this excerpt.  */
251 (gr->sheets[n].stop_col == -1)
253 (1 == (ret = xmlTextReaderRead (gr->msd.xtr)))
256 process_node (gr, &gr->msd);
259 return create_cell_range (
260 gr->sheets[n].start_col,
261 gr->sheets[n].start_row,
262 gr->sheets[n].stop_col,
263 gr->sheets[n].stop_row);
/* Casereader "destroy" callback.  R_ is the struct gnumeric_reader that was
   passed to casereader_create_sequential; READER itself is unused.  */
268 gnm_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
270 struct gnumeric_reader *r = r_;
275 state_data_destroy (&r->rsd);
/* The cached first case is released here only if it was never handed out by
   gnm_file_casereader_read.  */
277 if (r->first_case && ! r->used_first_case )
278 case_unref (r->first_case);
281 caseproto_unref (r->proto);
/* Give up the reference held on the underlying spreadsheet.  */
283 gnumeric_unref (&r->spreadsheet);
/* Advances the state machine in SD according to the XML node that the reader
   SD->xtr is currently positioned on, recording sheet names, sheet limits and
   cell coordinates in R as they are met.  Element names are compared without
   regard to case (xmlStrcasecmp).
   NOTE(review): the switch/case labels and braces are not visible in this
   excerpt, so the state attributed to some groups of tests below is inferred
   from the transitions -- confirm against the full file.  */
288 process_node (struct gnumeric_reader *r, struct state_data *sd)
290 xmlChar *name = xmlTextReaderName (sd->xtr);
/* Placeholder name, presumably used when the reader supplied none.  */
292 name = xmlStrdup (_xml ("--"));
294 sd->node_type = xmlTextReaderNodeType (sd->xtr);
/* Before the sheet index: the opening gnm:SheetNameIndex element starts the
   sheet count.  */
299 sd->current_sheet = -1;
300 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
301 XML_READER_TYPE_ELEMENT == sd->node_type)
303 sd->state = STATE_SHEET_COUNT;
307 case STATE_SHEET_COUNT:
308 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetName")) &&
309 XML_READER_TYPE_ELEMENT == sd->node_type)
/* Grow the sheet table when this sheet has no entry yet; a new entry starts
   with all four extents unknown (-1).  */
312 if (sd->current_sheet + 1 > r->spreadsheet.n_sheets)
314 struct sheet_detail *detail ;
315 r->sheets = xrealloc (r->sheets, (sd->current_sheet + 1) * sizeof *r->sheets);
316 detail = &r->sheets[sd->current_sheet];
317 detail->start_col = detail->stop_col = detail->start_row = detail->stop_row = -1;
319 r->spreadsheet.n_sheets = sd->current_sheet + 1;
/* End of the sheet index.  */
322 else if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
323 XML_READER_TYPE_END_ELEMENT == sd->node_type)
325 sd->state = STATE_INIT;
326 sd->current_sheet = -1;
/* Text inside the index: keep it as the name of the most recently added
   sheet, unless that sheet already has one.  */
328 else if (XML_READER_TYPE_TEXT == sd->node_type)
330 if ( r->sheets [r->spreadsheet.n_sheets - 1].name == NULL)
331 r->sheets [r->spreadsheet.n_sheets - 1].name = CHAR_CAST (char *, xmlTextReaderValue (sd->xtr));
/* An opening gnm:Sheet element begins a sheet.  */
336 if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
337 XML_READER_TYPE_ELEMENT == sd->node_type)
340 sd->state = STATE_SHEET_START;
343 case STATE_SHEET_START:
344 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
345 XML_READER_TYPE_ELEMENT == sd->node_type)
347 sd->state = STATE_SHEET_NAME;
350 case STATE_SHEET_NAME:
/* Leaving the name or the whole sheet without a match returns to STATE_INIT.  */
351 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
352 XML_READER_TYPE_END_ELEMENT == sd->node_type)
354 sd->state = STATE_INIT;
356 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
357 XML_READER_TYPE_END_ELEMENT == sd->node_type)
359 sd->state = STATE_INIT;
/* The name text decides whether this is the wanted sheet: an exact
   (case-sensitive) match with target_sheet when one was given; otherwise a
   match of target_sheet_index against current_sheet + 1; and a target index
   of -1 accepts the sheet regardless.  */
361 else if (XML_READER_TYPE_TEXT == sd->node_type)
363 if ( r->target_sheet != NULL)
365 xmlChar *value = xmlTextReaderValue (sd->xtr);
366 if ( 0 == xmlStrcmp (value, r->target_sheet))
367 sd->state = STATE_SHEET_FOUND;
370 else if (r->target_sheet_index == sd->current_sheet + 1)
372 sd->state = STATE_SHEET_FOUND;
374 else if (r->target_sheet_index == -1)
376 sd->state = STATE_SHEET_FOUND;
380 case STATE_SHEET_FOUND:
/* Start of the cell array: reset the minimum column seen; an empty
   gnm:Cells element has no content to enter.  */
381 if (0 == xmlStrcasecmp (name, _xml("gnm:Cells")) &&
382 XML_READER_TYPE_ELEMENT == sd->node_type)
384 sd->min_col = INT_MAX;
385 if (! xmlTextReaderIsEmptyElement (sd->xtr))
386 sd->state = STATE_CELLS_START;
388 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
389 XML_READER_TYPE_ELEMENT == sd->node_type)
391 sd->state = STATE_MAXROW;
393 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
394 XML_READER_TYPE_ELEMENT == sd->node_type)
396 sd->state = STATE_MAXCOL;
398 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
399 XML_READER_TYPE_END_ELEMENT == sd->node_type)
401 sd->state = STATE_INIT;
/* Inside gnm:MaxRow: the text is stored as the sheet's maxrow.  */
405 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
406 XML_READER_TYPE_END_ELEMENT == sd->node_type)
408 sd->state = STATE_SHEET_FOUND;
410 else if (sd->node_type == XML_READER_TYPE_TEXT)
412 xmlChar *value = xmlTextReaderValue (sd->xtr);
413 r->sheets[sd->current_sheet].maxrow = _xmlchar_to_int (value);
/* Inside gnm:MaxCol: the text is stored as the sheet's maxcol.  */
418 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
419 XML_READER_TYPE_END_ELEMENT == sd->node_type)
421 sd->state = STATE_SHEET_FOUND;
423 else if (sd->node_type == XML_READER_TYPE_TEXT)
425 xmlChar *value = xmlTextReaderValue (sd->xtr);
426 r->sheets[sd->current_sheet].maxcol = _xmlchar_to_int (value);
430 case STATE_CELLS_START:
/* A cell element: take its position from the Col and Row attributes.  */
431 if (0 == xmlStrcasecmp (name, _xml ("gnm:Cell")) &&
432 XML_READER_TYPE_ELEMENT == sd->node_type)
434 xmlChar *attr = NULL;
436 attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Col"));
437 sd->col = _xmlchar_to_int (attr);
/* Track the smallest column index seen in this sheet.  */
440 if (sd->col < sd->min_col)
441 sd->min_col = sd->col;
443 attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Row"));
444 sd->row = _xmlchar_to_int (attr);
/* The first cell met supplies the sheet's start row and start column.  */
447 if (r->sheets[sd->current_sheet].start_row == -1)
449 r->sheets[sd->current_sheet].start_row = sd->row;
452 if (r->sheets[sd->current_sheet].start_col == -1)
454 r->sheets[sd->current_sheet].start_col = sd->col;
456 if (! xmlTextReaderIsEmptyElement (sd->xtr))
457 sd->state = STATE_CELL;
/* End of the cell array: the position of the last cell read becomes the
   sheet's stop column and stop row.  */
459 else if ( (0 == xmlStrcasecmp (name, _xml("gnm:Cells"))) && (XML_READER_TYPE_END_ELEMENT == sd->node_type) )
461 r->sheets[sd->current_sheet].stop_col = sd->col;
462 r->sheets[sd->current_sheet].stop_row = sd->row;
463 sd->state = STATE_SHEET_NAME;
/* The end of a cell returns to the cell array.  */
467 if (0 == xmlStrcasecmp (name, _xml("gnm:Cell")) && XML_READER_TYPE_END_ELEMENT == sd->node_type)
469 sd->state = STATE_CELLS_START;
481 Sets the VAR of case C to the value corresponding to the XML string XV
/* C is the case to modify and VAR the variable within it; XV is the cell
   text, TYPE the cell's Gnumeric value type, and COL, ROW the cell position
   (used for the cell reference in the warning below).  */
484 convert_xml_string_to_value (struct ccase *c, const struct variable *var,
485 const xmlChar *xv, enum gnm_value_type type, int col, int row)
487 union value *v = case_data_rw (c, var);
/* First alternative: the value is set to missing.
   NOTE(review): its condition is on a line not visible in this excerpt.  */
490 value_set_missing (v, var_get_width (var));
/* String variables take the text as is, padded on the right with spaces to
   the variable's width.  */
491 else if ( var_is_alpha (var))
492 value_copy_str_rpad (v, var_get_width (var), xv, ' ');
/* Numeric cell types are parsed with c_strtod (gnulib's C-locale strtod).  */
493 else if (type == VALUE_FLOAT || type == VALUE_INTEGER)
495 const char *text = CHAR_CAST (const char *, xv);
499 v->f = c_strtod (text, &endptr);
/* Detects a failed conversion: errno set, or no characters consumed.
   NOTE(review): what happens next is not visible in this excerpt.  */
500 if ( errno != 0 || endptr == text)
/* Any other cell type: parse the text, treated as UTF-8, with data_in
   according to the variable's write format.  */
505 const char *text = CHAR_CAST (const char *, xv);
507 const struct fmt_spec *fmt = var_get_write_format (var);
509 char *m = data_in (ss_cstr (text), "UTF-8",
/* Warn, naming the cell and the format, with M as the explanation
   (presumably only when data_in returned a message -- condition not visible).  */
517 char buf [FMT_STRING_LEN_MAX + 1];
518 char *cell = create_cell_ref (col, row);
520 msg (MW, _("Cannot convert the value in the spreadsheet cell %s to format (%s): %s"),
521 cell, fmt_to_string (fmt, buf), m);
/* Text of the column's cell in the first data row; gnumeric_make_reader
   stores an xmlStrdup copy here.  */
532 xmlChar *first_value;
/* Error callback registered with libxml by gnumeric_reopen.  CTX is the
   struct gnumeric_reader being read; the problem is reported as a warning
   (MW) that names the file and the line number obtained from LOC.  SEV is
   unused.  */
538 gnumeric_error_handler (void *ctx, const char *mesg,
539 UNUSED xmlParserSeverities sev, xmlTextReaderLocatorPtr loc)
541 struct gnumeric_reader *r = ctx;
543 msg (MW, _("There was a problem whilst reading the %s file `%s' (near line %d): `%s'"),
545 r->spreadsheet.file_name,
546 xmlTextReaderLocatorLineNumber (loc),
/* Opens a Gnumeric file and advances a fresh XML reader to the start of the
   workbook.  At most one of R and FILENAME may be non-null (asserted): the
   file opened is either FILENAME, or the one recorded in the existing reader
   R.  SHOW_ERRORS controls whether libxml parser errors and warnings are
   reported.
   NOTE(review): the return statements and the choice of state_data (SD) are
   on lines not visible in this excerpt.  */
550 static struct gnumeric_reader *
551 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors)
554 struct state_data *sd;
556 xmlTextReaderPtr xtr;
559 assert (r == NULL || filename == NULL);
/* The file is read through zlib in both alternatives.  */
563 gz = gzopen (filename, "r");
567 gz = gzopen (r->spreadsheet.file_name, "r");
/* libxml pulls its input through gzread and closes it with gzclose.  */
574 xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
575 (xmlInputCloseCallback) gzclose, gz,
577 show_errors ? 0 : (XML_PARSE_NOERROR | XML_PARSE_NOWARNING) );
/* A new reader starts zero-filled, with the number of sheets unknown (-1).
   NOTE(review): the result of strdup is not checked on the visible lines,
   unlike the x* allocators used elsewhere in this file.  */
587 r = xzalloc (sizeof *r);
588 r->spreadsheet.n_sheets = -1;
589 r->spreadsheet.file_name = strdup (filename);
598 xmlTextReaderSetErrorHandler (xtr, gnumeric_error_handler, r);
/* No particular sheet is targeted yet.  */
600 r->target_sheet = NULL;
601 r->target_sheet_index = -1;
603 sd->row = sd->col = -1;
604 sd->state = STATE_PRE_INIT;
/* The caller receives its own reference to the spreadsheet.  */
606 r->spreadsheet.ref_cnt++;
609 /* Advance to the start of the workbook.
610 This gives us some confidence that we are actually dealing with a gnumeric
613 while ( (sd->state != STATE_INIT )
614 && 1 == (ret = xmlTextReaderRead (sd->xtr)))
616 process_node (r, sd);
622 /* Does not seem to be a gnumeric file */
623 gnumeric_unref (&r->spreadsheet);
627 r->spreadsheet.type = SPREADSHEET_GNUMERIC;
/* Warn when the document declares an encoding other than UTF-8.  */
631 const xmlChar *enc = xmlTextReaderConstEncoding (sd->xtr);
632 xmlCharEncoding xce = xmlParseCharEncoding (CHAR_CAST (const char *, enc));
634 if ( XML_CHAR_ENCODING_UTF8 != xce)
636 /* I have been told that ALL gnumeric files are UTF8 encoded. If that is correct, this
638 msg (MW, _("The gnumeric file `%s' is encoded as %s instead of the usual UTF-8 encoding. "
639 "Any non-ascii characters will be incorrectly imported."),
640 r->spreadsheet.file_name,
/* Opens FILENAME as a Gnumeric file through gnumeric_reopen and returns the
   struct spreadsheet embedded in the resulting reader.  REPORT_ERRORS is
   passed on as gnumeric_reopen's show_errors argument.
   NOTE(review): no check of R against NULL is visible between these lines;
   confirm what gnumeric_reopen returns for a file that is not Gnumeric.  */
650 gnumeric_probe (const char *filename, bool report_errors)
652 struct gnumeric_reader *r = gnumeric_reopen (NULL, filename, report_errors);
654 return &r->spreadsheet;
/* Creates a casereader over SPREADSHEET according to OPTS (cell range, sheet
   name or index, whether the first row holds variable names, string width).
   The function reopens the file, scans the first row(s) of the target sheet
   to derive one variable per column, builds the dictionary and the first
   case, and returns a sequential casereader that uses
   gnm_file_casereader_class.  The last visible lines release the variable
   specifications and destroy the reader instead.
   NOTE(review): many lines (loop keywords, labels, braces) are not visible in
   this excerpt; the comments below describe the visible statements only.  */
659 gnumeric_make_reader (struct spreadsheet *spreadsheet,
660 const struct spreadsheet_read_options *opts)
664 struct gnumeric_reader *r = NULL;
665 unsigned long int vstart = 0;
667 casenumber n_cases = CASENUMBER_MAX;
669 struct var_spec *var_spec = NULL;
672 r = (struct gnumeric_reader *) (spreadsheet);
/* Reopen the file already recorded in R (no file name is passed).  */
674 r = gnumeric_reopen (r, NULL, true);
/* An explicit cell range, when given, fixes the start/stop columns and rows.  */
676 if ( opts->cell_range )
678 if ( ! convert_cell_ref (opts->cell_range,
679 &r->start_col, &r->start_row,
680 &r->stop_col, &r->stop_row))
682 msg (SE, _("Invalid cell range `%s'"),
/* Select the sheet to read and reset the reading pass.  */
695 r->target_sheet = BAD_CAST opts->sheet_name;
696 r->target_sheet_index = opts->sheet_index;
697 r->rsd.row = r->rsd.col = -1;
698 r->rsd.current_sheet = -1;
699 r->first_case = NULL;
702 /* Advance to the start of the cells for the target sheet */
703 while ( (r->rsd.state != STATE_CELL || r->rsd.row < r->start_row )
704 && 1 == (ret = xmlTextReaderRead (r->rsd.xtr)))
707 process_node (r, &r->rsd);
/* NOTE(review): the release of VALUE is not visible in this excerpt.  */
708 value = xmlTextReaderValue (r->rsd.xtr);
/* The text of the sheet's MaxRow element gives the number of cases (plus one).  */
710 if ( r->rsd.state == STATE_MAXROW && r->rsd.node_type == XML_READER_TYPE_TEXT)
712 n_cases = 1 + _xmlchar_to_int (value) ;
717 /* If a range has been given, then use that to calculate the number
719 if ( opts->cell_range)
721 n_cases = MIN (n_cases, r->stop_row - r->start_row + 1);
724 if ( opts->read_names )
731 /* Read in the first row of cells,
732 including the headers if read_names was set */
734 (( r->rsd.state == STATE_CELLS_START && r->rsd.row <= r->start_row) || r->rsd.state == STATE_CELL )
735 && (ret = xmlTextReaderRead (r->rsd.xtr))
/* Note the ValueType attribute as seen before process_node handles the node.  */
740 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_TEXT)
743 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
745 type = _xmlchar_to_int (attr);
750 process_node (r, &r->rsd);
/* A cell beyond the start row: remember its value type in R.  */
752 if ( r->rsd.row > r->start_row )
755 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
757 r->vtype = _xmlchar_to_int (attr);
/* Columns outside the requested range are tested for here.  */
763 if ( r->rsd.col < r->start_col ||
764 (r->stop_col != -1 && r->rsd.col > r->stop_col))
/* Index of the variable for this column; the specification array grows on
   demand, and new entries start out with nothing set.  */
767 idx = r->rsd.col - r->start_col;
769 if ( idx >= n_var_specs )
772 var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
773 for (i = n_var_specs; i <= idx; ++i)
775 var_spec [i].name = NULL;
776 var_spec [i].width = -1;
777 var_spec [i].first_value = NULL;
778 var_spec [i].first_type = -1;
780 n_var_specs = idx + 1 ;
783 var_spec [idx].first_type = type;
/* Cell text: in a row before the start row it is a variable name (when
   read_names is set); the other visible assignments keep it as the column's
   first value and derive a string width from it if none is set yet.  */
785 if ( r->rsd.node_type == XML_READER_TYPE_TEXT )
787 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
788 const char *text = CHAR_CAST (const char *, value);
790 if ( r->rsd.row < r->start_row)
792 if ( opts->read_names )
794 var_spec [idx].name = xstrdup (text);
799 var_spec [idx].first_value = xmlStrdup (value);
/* Width: the text length rounded up to a multiple of the default width,
   unless OPTS->asw supplies an explicit width.  */
801 if (-1 == var_spec [idx].width )
802 var_spec [idx].width = (opts->asw == -1) ?
803 ROUND_UP (strlen(text), SPREADSHEET_DEFAULT_WIDTH) : opts->asw;
/* Cell element on the start row: a cell whose ValueType is absent or not
   VALUE_STRING makes the variable numeric (width 0).  */
808 else if ( r->rsd.node_type == XML_READER_TYPE_ELEMENT
809 && r->rsd.state == STATE_CELL)
811 if ( r->rsd.row == r->start_row )
814 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
816 if ( NULL == attr || VALUE_STRING != _xmlchar_to_int (attr))
817 var_spec [idx].width = 0;
825 const xmlChar *enc = xmlTextReaderConstEncoding (r->rsd.xtr);
828 /* Create the dictionary and populate it */
829 spreadsheet->dict = r->dict = dict_create (CHAR_CAST (const char *, enc));
832 for (i = 0 ; i < n_var_specs ; ++i )
836 if ( (var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
839 /* Probably no data exists for this variable, so allocate a
841 if ( var_spec[i].width == -1 )
842 var_spec[i].width = SPREADSHEET_DEFAULT_WIDTH;
844 name = dict_make_unique_var_name (r->dict, var_spec[i].name, &vstart);
845 dict_create_var (r->dict, name, var_spec[i].width);
849 /* Create the first case, and cache it */
850 r->used_first_case = false;
/* No variable was found at all: warn that the selection is empty.  */
852 if ( n_var_specs == 0 )
854 msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
855 spreadsheet->file_name);
/* The first case starts with every value missing and is then filled from the
   first values collected above.  */
859 r->proto = caseproto_ref (dict_get_proto (r->dict));
860 r->first_case = case_create (r->proto);
861 case_set_missing (r->first_case);
864 for ( i = 0 ; i < n_var_specs ; ++i )
866 const struct variable *var;
868 if ( (var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
871 var = dict_get_var (r->dict, x++);
873 convert_xml_string_to_value (r->first_case, var,
874 var_spec[i].first_value,
875 var_spec[i].first_type,
/* The specifications are no longer needed.
   NOTE(review): first_value is allocated with xmlStrdup but released with
   free() here; xmlFree would match the allocator.  */
880 for ( i = 0 ; i < n_var_specs ; ++i )
882 free (var_spec[i].first_value);
883 free (var_spec[i].name);
889 return casereader_create_sequential
893 &gnm_file_casereader_class, r);
/* Cleanup of the specifications and of the reader (presumably the error
   path -- its label is not visible in this excerpt).  */
897 for ( i = 0 ; i < n_var_specs ; ++i )
899 free (var_spec[i].first_value);
900 free (var_spec[i].name);
905 gnm_file_casereader_destroy (NULL, r);
911 /* Reads and returns one case from READER's file. Returns a null
912 pointer on failure. */
/* R_ is the struct gnumeric_reader passed to casereader_create_sequential;
   READER itself is unused.  */
913 static struct ccase *
914 gnm_file_casereader_read (struct casereader *reader UNUSED, void *r_)
919 struct gnumeric_reader *r = r_;
/* Row the reader is positioned on when the call starts; the loop below runs
   only while the reader stays on it.  */
920 int current_row = r->rsd.row;
/* The first call hands out the case that gnumeric_make_reader prepared.  */
922 if ( !r->used_first_case )
924 r->used_first_case = true;
925 return r->first_case;
/* Every later case starts with all values missing.  */
928 c = case_create (r->proto);
929 case_set_missing (c);
/* Without an explicit start column, use the smallest column seen so far.  */
931 if (r->start_col == -1)
932 r->start_col = r->rsd.min_col;
935 while ((r->rsd.state == STATE_CELL || r->rsd.state == STATE_CELLS_START )
936 && r->rsd.row == current_row && (ret = xmlTextReaderRead (r->rsd.xtr)))
938 process_node (r, &r->rsd);
/* A cell element: remember its ValueType for the text node that follows.  */
940 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_ELEMENT)
943 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
945 r->vtype = _xmlchar_to_int (attr);
/* Tests for cells outside the selected columns, beyond the variables of the
   case prototype, or past the stop row.
   NOTE(review): the action taken for each test is not visible in this
   excerpt.  */
950 if ( r->rsd.col < r->start_col || (r->stop_col != -1 &&
951 r->rsd.col > r->stop_col))
954 if ( r->rsd.col - r->start_col >= caseproto_get_n_widths (r->proto))
957 if ( r->stop_row != -1 && r->rsd.row > r->stop_row)
/* Cell text: convert it into the variable that corresponds to this column.  */
961 if ( r->rsd.node_type == XML_READER_TYPE_TEXT )
963 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
964 const int idx = r->rsd.col - r->start_col;
965 const struct variable *var = dict_get_var (r->dict, idx);
967 convert_xml_string_to_value (c, var, value, r->vtype,
968 r->rsd.col, r->rsd.row);
984 #endif /* GNM_READ_SUPPORT */