1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016,
3 2020 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "data/gnumeric-reader.h"
21 #include "spreadsheet-reader.h"
26 #include <libxml/xmlreader.h>
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/data-in.h"
32 #include "data/dictionary.h"
33 #include "data/format.h"
34 #include "data/identifier.h"
35 #include "data/value.h"
36 #include "data/variable.h"
37 #include "libpspp/i18n.h"
38 #include "libpspp/message.h"
39 #include "libpspp/misc.h"
40 #include "libpspp/hmap.h"
41 #include "libpspp/hash-functions.h"
43 #include "libpspp/str.h"
45 #include "gl/c-strtod.h"
46 #include "gl/minmax.h"
47 #include "gl/xalloc.h"
50 #define _(msgid) gettext (msgid)
51 #define N_(msgid) (msgid)
53 /* Setting this to false can help with debugging and development.
54 Don't forget to set it back to true, or users will complain that
55 all but the smallest spreadsheets display VERY slowly. */
56 static const bool use_cache = true;
58 /* Shamelessly lifted from the Gnumeric sources:
59 https://git.gnome.org/browse/gnumeric/tree/src/value.h
65 VALUE_INTEGER = 30, /* Note, this was removed from gnumeric in 2006 - old versions may of
66 course still be around. New ones are supposed to use float.*/
/* Forward declarations of the two casereader callbacks; the class table
   below refers to them before their definitions appear in the file. */
75 static void gnm_file_casereader_destroy (struct casereader *, void *);
77 static struct ccase *gnm_file_casereader_read (struct casereader *, void *);
/* Callback table passed to casereader_create_sequential() by
   gnumeric_make_reader(). */
80 static const struct casereader_class gnm_file_casereader_class =
82 gnm_file_casereader_read,
83 gnm_file_casereader_destroy,
90 STATE_PRE_INIT = 0, /* Initial state */
91 STATE_SHEET_COUNT, /* Found the sheet index */
92 STATE_INIT , /* Other Initial state */
93 STATE_SHEET_START, /* Found the start of a sheet */
94 STATE_SHEET_NAME, /* Found the sheet name */
97 STATE_SHEET_FOUND, /* Found the sheet that we actually want */
98 STATE_CELLS_START, /* Found the start of the cell array */
99 STATE_CELL /* Found a cell */
106 /* The libxml reader for this instance */
107 xmlTextReaderPtr xtr;
109 /* An internal state variable */
110 enum reader_state state;
/* Releases the libxml2 text reader owned by SD.  SD itself is not freed. */
123 state_data_destroy (struct state_data *sd)
125 xmlFreeTextReader (sd->xtr);
/* Per-file state of the gnumeric reader. */
129 struct gnumeric_reader
/* Generic spreadsheet part.  Code in this file casts a struct spreadsheet
   pointer straight to struct gnumeric_reader (e.g. in gnumeric_destroy), so
   this member presumably has to stay first -- TODO confirm. */
131 struct spreadsheet spreadsheet;
/* Two separate XML reader states: rsd is driven by gnumeric_make_reader and
   gnm_file_casereader_read, msd by the sheet query functions. */
133 struct state_data rsd;
134 struct state_data msd;
/* Sheet selection consulted by process_node: a name, or else an index. */
136 const xmlChar *target_sheet_name;
137 int target_sheet_index;
/* ValueType attribute of a cell, as stored by gnumeric_make_reader and
   gnm_file_casereader_read and later handed to
   convert_xml_string_to_value. */
139 enum gnm_value_type vtype;
141 /* The total number of sheets in the "workbook" */
147 /* A value to be kept in the hash table for cache purposes. */
150 struct hmap_node node;
152 /* The cell's row. */
155 /* The cell's column. */
158 /* The value of the cell. */
/* Spreadsheet destroy hook (installed as s->destroy in gnumeric_reopen).
   Frees every sheet name and the sheets array, drops the dictionary
   reference, destroys the msd reader state and tears down the cell cache. */
163 gnumeric_destroy (struct spreadsheet *s)
165 struct gnumeric_reader *r = (struct gnumeric_reader *) s;
/* Sheet names are obtained with xmlTextReaderValue() in process_node(),
   which is why they are released with xmlFree() rather than free(). */
169 for (i = 0; i < r->n_sheets; ++i)
171 xmlFree (r->spreadsheet.sheets[i].name);
175 dict_unref (s->dict);
177 free (r->spreadsheet.sheets);
178 state_data_destroy (&r->msd);
/* Walk every cached cell before destroying the map itself.  The loop keeps
   a NEXT pointer, presumably so that the current entry can be released
   inside the loop body -- TODO confirm. */
182 struct cache_datum *cell;
183 struct cache_datum *next;
184 HMAP_FOR_EACH_SAFE (cell, next, struct cache_datum, node, &r->cache)
190 hmap_destroy (&r->cache);
/* Returns the stored name of sheet N of S.  The pointer returned is the
   reader-owned string (the same one gnumeric_destroy later xmlFree()s), not
   a copy.  Only the upper bound of N is asserted; a negative N is not
   checked. */
197 gnumeric_get_sheet_name (struct spreadsheet *s, int n)
199 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
200 assert (n < gr->n_sheets);
202 return gr->spreadsheet.sheets[n].name;
/* Forward declaration: the sheet query functions that follow call
   process_node() before its definition appears. */
206 static void process_node (struct gnumeric_reader *r, struct state_data *sd);
/* Reads onward through the msd reader for as long as xmlTextReaderRead()
   returns 1, passing every node to process_node(), which is where n_sheets
   gets updated.
   NOTE(review): the value returned is presumably gr->n_sheets -- TODO
   confirm. */
210 gnumeric_get_sheet_n_sheets (struct spreadsheet *s)
212 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
215 while (1 == (ret = xmlTextReaderRead (gr->msd.xtr)))
217 process_node (gr, &gr->msd);
/* Returns the cell range of sheet N of S as built by create_cell_range()
   from the first/last column and row recorded for that sheet.
   NOTE(review): ownership of the returned range is decided by
   create_cell_range(); verify whether the caller must free it. */
225 gnumeric_get_sheet_range (struct spreadsheet *s, int n)
228 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
/* Pull nodes through process_node() until last_col of sheet N has been
   filled in (process_node initialises it to -1) or the msd reader stops
   returning 1. */
230 while ((gr->spreadsheet.sheets[n].last_col == -1)
232 (1 == (ret = xmlTextReaderRead (gr->msd.xtr))))
234 process_node (gr, &gr->msd);
/* NOTE(review): this bound check runs after sheets[n] has already been
   indexed by the loop condition above, so it cannot guard that access. */
237 assert (n < gr->n_sheets);
238 return create_cell_range (
239 gr->spreadsheet.sheets[n].first_col,
240 gr->spreadsheet.sheets[n].first_row,
241 gr->spreadsheet.sheets[n].last_col,
242 gr->spreadsheet.sheets[n].last_row);
/* Returns last_row + 1 for sheet N of S, i.e. a row count derived from the
   zero-based index of the last row recorded for the sheet. */
247 gnumeric_get_sheet_n_rows (struct spreadsheet *s, int n)
249 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
/* Keep feeding nodes to process_node() until the extent of sheet N is known
   (last_col no longer -1) or the msd reader stops returning 1. */
251 while ((gr->spreadsheet.sheets[n].last_col == -1)
253 (1 == xmlTextReaderRead (gr->msd.xtr)))
255 process_node (gr, &gr->msd);
/* NOTE(review): asserted only after sheets[n] was indexed above. */
258 assert (n < gr->n_sheets);
259 return gr->spreadsheet.sheets[n].last_row + 1;
/* Returns last_col + 1 for sheet N of S, i.e. a column count derived from
   the zero-based index recorded as the last column of the sheet. */
263 gnumeric_get_sheet_n_columns (struct spreadsheet *s, int n)
265 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
/* Keep feeding nodes to process_node() until the extent of sheet N is known
   (last_col no longer -1) or the msd reader stops returning 1. */
267 while ((gr->spreadsheet.sheets[n].last_col == -1)
269 (1 == xmlTextReaderRead (gr->msd.xtr)))
271 process_node (gr, &gr->msd);
/* NOTE(review): asserted only after sheets[n] was indexed above. */
274 assert (n < gr->n_sheets);
275 return gr->spreadsheet.sheets[n].last_col + 1;
/* Forward declaration: gnumeric_make_reader() calls gnumeric_reopen()
   before its definition appears further down in the file. */
278 static struct gnumeric_reader *
279 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors);
/* Looks up the text of the cell at (ROW, COLUMN) on sheet N of S.
   A cache hit is answered with a strdup()ed copy of the cached string.
   Otherwise the file named by s->file_name is opened again with a private
   reader and scanned; values of cells met on the way are inserted into the
   cache, and the text of the requested cell is fetched with
   xmlTextReaderValue().
   NOTE(review): the strdup() result is returned unchecked, while other code
   in this file uses the x*() allocators (xstrdup, xrealloc, XMALLOC). */
283 gnumeric_get_sheet_cell (struct spreadsheet *s, int n, int row, int column)
285 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
287 /* See if this cell is in the cache. If it is, then use it. */
/* The cache key is the hash of ROW chained with the hash of COLUMN; the same
   combination is recomputed below when entries are inserted. */
290 struct cache_datum *lookup = NULL;
291 unsigned int hash = hash_int (row, 0);
292 hash = hash_int (column, hash);
294 HMAP_FOR_EACH_WITH_HASH (lookup, struct cache_datum, node, hash,
297 if (lookup->row == row && lookup->col == column)
304 return strdup (lookup->value);
/* Not answered from the cache: scan the file using a private,
   stack-allocated state_data starting from STATE_PRE_INIT. */
308 struct state_data sd;
310 sd.state = STATE_PRE_INIT;
311 sd.current_sheet = -1;
/* NOTE(review): no check of the gzopen() result is visible before it is
   handed to the XML reader -- TODO confirm. */
315 sd.gz = gzopen (s->file_name, "r");
317 sd.xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
318 (xmlInputCloseCallback) gzclose,
/* NOTE(review): this clears target_sheet_name on the shared reader GR, not
   on the private SD. */
324 gr->target_sheet_name = NULL;
326 int current_row = -1;
327 int current_col = -1;
329 /* Spool to the target cell, caching values of cells as they are encountered. */
330 for (int ret = 1; ret; )
332 while ((ret = xmlTextReaderRead (sd.xtr)))
334 process_node (gr, &sd);
335 if (sd.state == STATE_CELL)
337 if (sd.current_sheet == n)
339 current_row = sd.row;
340 current_col = sd.col;
/* Spooling presumably ends once the position has reached the target row and
   the column just before the target column. */
345 if (current_row >= row && current_col >= column - 1)
348 while ((ret = xmlTextReaderRead (sd.xtr)))
350 process_node (gr, &sd);
351 if (sd.node_type == XML_READER_TYPE_TEXT)
357 /* See if this cell has already been cached ... */
358 unsigned int hash = hash_int (current_row, 0);
359 hash = hash_int (current_col, hash);
360 struct cache_datum *probe = NULL;
361 HMAP_FOR_EACH_WITH_HASH (probe, struct cache_datum, node, hash,
364 if (probe->row == current_row && probe->col == current_col)
367 /* If not, then cache it. */
/* The cache entry takes over the string returned by xmlTextReaderValue(). */
370 char *str = CHAR_CAST (char *, xmlTextReaderValue (sd.xtr));
371 struct cache_datum *cell_data = XMALLOC (struct cache_datum);
372 cell_data->row = current_row;
373 cell_data->col = current_col;
374 cell_data->value = str;
375 hmap_insert (&gr->cache, &cell_data->node, hash);
/* Continue reading, now looking for the text node of the requested cell
   itself on sheet N. */
380 while (xmlTextReaderRead (sd.xtr))
382 process_node (gr, &sd);
383 if (sd.state == STATE_CELL && sd.node_type == XML_READER_TYPE_TEXT)
385 if (sd.current_sheet == n)
387 if (row == sd.row && column == sd.col)
393 char *cell_content = CHAR_CAST (char *, xmlTextReaderValue (sd.xtr));
394 xmlFreeTextReader (sd.xtr);
/* Casereader destroy callback listed in gnm_file_casereader_class.  It is
   also called directly, with a null READER, from gnumeric_make_reader().
   R_ is the struct gnumeric_reader that was given to
   casereader_create_sequential(). */
400 gnm_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
402 struct gnumeric_reader *r = r_;
407 state_data_destroy (&r->rsd);
/* The cached first case is released here only if it was never handed out by
   gnm_file_casereader_read(), which sets used_first_case when it returns
   the case. */
409 if (r->spreadsheet.first_case && ! r->spreadsheet.used_first_case)
410 case_unref (r->spreadsheet.first_case);
412 if (r->spreadsheet.proto)
413 caseproto_unref (r->spreadsheet.proto);
/* Drop this reader's reference on the spreadsheet. */
415 spreadsheet_unref (&r->spreadsheet);
/* Advances the parse state machine of SD by one XML node.
   Reads the name and node type of the node the reader SD->xtr is currently
   positioned on and, depending on SD->state, moves to the next state and
   records sheet names, sheet extents and cell coordinates in R and SD.
   The caller is expected to have called xmlTextReaderRead() first. */
420 process_node (struct gnumeric_reader *r, struct state_data *sd)
/* NAME falls back to a copy of the placeholder string below, presumably
   when the reader reports no name for the node -- TODO confirm. */
422 xmlChar *name = xmlTextReaderName (sd->xtr);
424 name = xmlStrdup (_xml ("--"));
426 sd->node_type = xmlTextReaderNodeType (sd->xtr);
/* No sheet is current yet; the opening gnm:SheetNameIndex element starts
   the sheet counting phase. */
431 sd->current_sheet = -1;
432 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
433 XML_READER_TYPE_ELEMENT == sd->node_type)
435 sd->state = STATE_SHEET_COUNT;
/* Inside the sheet name index: each gnm:SheetName element accounts for one
   sheet. */
439 case STATE_SHEET_COUNT:
440 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetName")) &&
441 XML_READER_TYPE_ELEMENT == sd->node_type)
/* Grow the sheets array when current_sheet points past the sheets recorded
   so far; the new entry starts with all four bounds at -1. */
444 if (sd->current_sheet + 1 > r->n_sheets)
446 struct sheet_detail *detail ;
447 r->spreadsheet.sheets = xrealloc (r->spreadsheet.sheets, (sd->current_sheet + 1) * sizeof *r->spreadsheet.sheets);
448 detail = &r->spreadsheet.sheets[sd->current_sheet];
449 detail->first_col = detail->last_col = detail->first_row = detail->last_row = -1;
451 r->n_sheets = sd->current_sheet + 1;
/* The closing gnm:SheetNameIndex element ends the counting phase. */
454 else if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
455 XML_READER_TYPE_END_ELEMENT == sd->node_type)
457 sd->state = STATE_INIT;
458 sd->current_sheet = -1;
/* Text inside the index is a sheet name; it is stored for the most recently
   added sheet unless that sheet already has one. */
460 else if (XML_READER_TYPE_TEXT == sd->node_type)
462 if (r->spreadsheet.sheets [r->n_sheets - 1].name == NULL)
463 r->spreadsheet.sheets [r->n_sheets - 1].name =
464 CHAR_CAST (char *, xmlTextReaderValue (sd->xtr));
/* An opening gnm:Sheet element marks the start of a sheet. */
469 if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
470 XML_READER_TYPE_ELEMENT == sd->node_type)
473 sd->state = STATE_SHEET_START;
476 case STATE_SHEET_START:
477 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
478 XML_READER_TYPE_ELEMENT == sd->node_type)
480 sd->state = STATE_SHEET_NAME;
/* Reading the name of the sheet: decide whether this is the wanted sheet. */
483 case STATE_SHEET_NAME:
484 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
485 XML_READER_TYPE_END_ELEMENT == sd->node_type)
487 sd->state = STATE_INIT;
489 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
490 XML_READER_TYPE_END_ELEMENT == sd->node_type)
492 sd->state = STATE_INIT;
494 else if (XML_READER_TYPE_TEXT == sd->node_type)
/* A target name, when set, is compared (case-sensitively) with the node
   text.  Otherwise the sheet matches when target_sheet_index equals
   current_sheet + 1, and a target_sheet_index of -1 accepts the sheet. */
496 if (r->target_sheet_name != NULL)
498 xmlChar *value = xmlTextReaderValue (sd->xtr);
499 if (0 == xmlStrcmp (value, r->target_sheet_name))
500 sd->state = STATE_SHEET_FOUND;
503 else if (r->target_sheet_index == sd->current_sheet + 1)
505 sd->state = STATE_SHEET_FOUND;
507 else if (r->target_sheet_index == -1)
509 sd->state = STATE_SHEET_FOUND;
/* Within the wanted sheet: look for the cell array and the MaxRow and
   MaxCol elements; the end of the sheet returns to STATE_INIT. */
513 case STATE_SHEET_FOUND:
514 if (0 == xmlStrcasecmp (name, _xml("gnm:Cells")) &&
515 XML_READER_TYPE_ELEMENT == sd->node_type)
/* min_col restarts at INT_MAX for each cell array; an empty gnm:Cells
   element does not enter STATE_CELLS_START. */
517 sd->min_col = INT_MAX;
518 if (! xmlTextReaderIsEmptyElement (sd->xtr))
519 sd->state = STATE_CELLS_START;
521 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
522 XML_READER_TYPE_ELEMENT == sd->node_type)
524 sd->state = STATE_MAXROW;
526 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
527 XML_READER_TYPE_ELEMENT == sd->node_type)
529 sd->state = STATE_MAXCOL;
531 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
532 XML_READER_TYPE_END_ELEMENT == sd->node_type)
534 sd->state = STATE_INIT;
/* Closing gnm:MaxRow goes back to STATE_SHEET_FOUND. */
538 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
539 XML_READER_TYPE_END_ELEMENT == sd->node_type)
541 sd->state = STATE_SHEET_FOUND;
543 else if (sd->node_type == XML_READER_TYPE_TEXT)
545 xmlChar *value = xmlTextReaderValue (sd->xtr);
/* Closing gnm:MaxCol goes back to STATE_SHEET_FOUND. */
550 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
551 XML_READER_TYPE_END_ELEMENT == sd->node_type)
553 sd->state = STATE_SHEET_FOUND;
555 else if (sd->node_type == XML_READER_TYPE_TEXT)
557 xmlChar *value = xmlTextReaderValue (sd->xtr);
/* Inside the cell array: every gnm:Cell element updates the current
   column and row from its Col and Row attributes. */
561 case STATE_CELLS_START:
562 if (0 == xmlStrcasecmp (name, _xml ("gnm:Cell")) &&
563 XML_READER_TYPE_ELEMENT == sd->node_type)
565 xmlChar *attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Col"));
566 sd->col = _xmlchar_to_int (attr);
569 if (sd->col < sd->min_col)
570 sd->min_col = sd->col;
572 attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Row"));
573 sd->row = _xmlchar_to_int (attr);
/* The first cell seen on a sheet supplies first_row and first_col. */
576 if (r->spreadsheet.sheets[sd->current_sheet].first_row == -1)
578 r->spreadsheet.sheets[sd->current_sheet].first_row = sd->row;
581 if (r->spreadsheet.sheets[sd->current_sheet].first_col == -1)
583 r->spreadsheet.sheets[sd->current_sheet].first_col = sd->col;
585 if (! xmlTextReaderIsEmptyElement (sd->xtr))
586 sd->state = STATE_CELL;
/* End of the cell array.
   NOTE(review): last_col and last_row are taken from the position of the
   last cell read, which is not necessarily the largest column seen --
   confirm this is intended. */
588 else if ((0 == xmlStrcasecmp (name, _xml("gnm:Cells")))
589 && (XML_READER_TYPE_END_ELEMENT == sd->node_type))
591 r->spreadsheet.sheets[sd->current_sheet].last_col = sd->col;
592 r->spreadsheet.sheets[sd->current_sheet].last_row = sd->row;
593 sd->state = STATE_SHEET_NAME;
/* The closing gnm:Cell element returns to the cell array state. */
597 if (0 == xmlStrcasecmp (name, _xml("gnm:Cell"))
598 && XML_READER_TYPE_END_ELEMENT == sd->node_type)
600 sd->state = STATE_CELLS_START;
612 Sets the VAR of case C, to the value corresponding to the xml string XV
/* Parameters: C is the case to modify and VAR the variable within it; XV is
   the cell text; TYPE is the gnumeric value type of the cell; COL and ROW
   identify the cell in the conversion warning. */
615 convert_xml_string_to_value (struct ccase *c, const struct variable *var,
616 const xmlChar *xv, enum gnm_value_type type, int col, int row)
618 union value *v = case_data_rw (c, var);
621 value_set_missing (v, var_get_width (var));
/* String variables take the text directly, padded with spaces to the
   width of the variable. */
622 else if (var_is_alpha (var))
623 value_copy_str_rpad (v, var_get_width (var), xv, ' ');
/* Float and integer cells are parsed with c_strtod(); a nonzero errno or an
   end pointer that did not move is checked for afterwards. */
624 else if (type == VALUE_FLOAT || type == VALUE_INTEGER)
626 const char *text = CHAR_CAST (const char *, xv);
630 v->f = c_strtod (text, &endptr);
631 if (errno != 0 || endptr == text)
/* Every other value type goes through data_in() using the write format of
   VAR, with the text treated as UTF-8.  M is passed as the final argument
   of the warning below. */
636 const char *text = CHAR_CAST (const char *, xv);
638 const struct fmt_spec *fmt = var_get_write_format (var);
640 char *m = data_in (ss_cstr (text), "UTF-8",
648 char buf [FMT_STRING_LEN_MAX + 1];
649 char *cell = create_cell_ref (col, row);
651 msg (MW, _("Cannot convert the value in the spreadsheet cell %s to format (%s): %s"),
652 cell, fmt_to_string (fmt, buf), m);
663 xmlChar *first_value;
/* libxml2 error callback, registered in gnumeric_reopen() through
   xmlTextReaderSetErrorHandler() with the reader as context.
   CTX is the struct gnumeric_reader; the problem is reported as a warning
   (MW) that names the file and the line number obtained from LOC.  The
   severity SEV is not used. */
669 gnumeric_error_handler (void *ctx, const char *mesg,
670 xmlParserSeverities sev UNUSED,
671 xmlTextReaderLocatorPtr loc)
673 struct gnumeric_reader *r = ctx;
675 msg (MW, _("There was a problem whilst reading the %s file `%s' (near line %d): `%s'"),
677 r->spreadsheet.file_name,
678 xmlTextReaderLocatorLineNumber (loc),
/* Spreadsheet make_reader hook (installed as s->make_reader in
   gnumeric_reopen).
   Applies OPTS (cell range, sheet name or index, read_names, asw), scans
   the rsd reader up to the first data row to collect per-column names,
   widths and first values, creates the dictionary and the first case from
   them, and returns a sequential casereader backed by
   gnm_file_casereader_class. */
682 static struct casereader *
683 gnumeric_make_reader (struct spreadsheet *spreadsheet,
684 const struct spreadsheet_read_options *opts)
688 struct gnumeric_reader *r = NULL;
689 unsigned long int vstart = 0;
691 casenumber n_cases = CASENUMBER_MAX;
693 struct var_spec *var_spec = NULL;
696 r = (struct gnumeric_reader *) (spreadsheet);
/* R is replaced by whatever gnumeric_reopen() returns for the existing
   reader (null FILENAME selects the re-open mode). */
698 r = gnumeric_reopen (r, NULL, true);
/* A cell range in OPTS fixes the start and stop column and row.  Without
   one, the assignments further down use -1 for both columns and for
   stop_row, and 0 for start_row. */
700 if (opts->cell_range)
702 if (! convert_cell_ref (opts->cell_range,
703 &r->spreadsheet.start_col, &r->spreadsheet.start_row,
704 &r->spreadsheet.stop_col, &r->spreadsheet.stop_row))
706 msg (SE, _("Invalid cell range `%s'"),
713 r->spreadsheet.start_col = -1;
714 r->spreadsheet.start_row = 0;
715 r->spreadsheet.stop_col = -1;
716 r->spreadsheet.stop_row = -1;
/* Tell process_node() which sheet to look for and reset the rsd position
   and the cached first case. */
719 r->target_sheet_name = BAD_CAST opts->sheet_name;
720 r->target_sheet_index = opts->sheet_index;
721 r->rsd.row = r->rsd.col = -1;
722 r->rsd.current_sheet = -1;
723 r->spreadsheet.first_case = NULL;
724 r->spreadsheet.proto = NULL;
726 /* Advance to the start of the cells for the target sheet */
727 while ((r->rsd.state != STATE_CELL || r->rsd.row < r->spreadsheet.start_row)
728 && 1 == (ret = xmlTextReaderRead (r->rsd.xtr)))
731 process_node (r, &r->rsd);
732 value = xmlTextReaderValue (r->rsd.xtr);
/* The text of gnm:MaxRow plus one becomes the initial case count. */
734 if (r->rsd.state == STATE_MAXROW && r->rsd.node_type == XML_READER_TYPE_TEXT)
736 n_cases = 1 + _xmlchar_to_int (value) ;
741 /* If a range has been given, then use that to calculate the number
743 if (opts->cell_range)
745 n_cases = MIN (n_cases, r->spreadsheet.stop_row - r->spreadsheet.start_row + 1);
748 if (opts->read_names)
750 r->spreadsheet.start_row++;
755 /* Read in the first row of cells,
756 including the headers if read_names was set */
758 ((r->rsd.state == STATE_CELLS_START && r->rsd.row <= r->spreadsheet.start_row) || r->rsd.state == STATE_CELL)
759 && (ret = xmlTextReaderRead (r->rsd.xtr))
764 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_TEXT)
767 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
769 type = _xmlchar_to_int (attr);
774 process_node (r, &r->rsd);
/* Once the reader has moved past start_row, the ValueType of the cell it
   is on is kept in r->vtype. */
776 if (r->rsd.row > r->spreadsheet.start_row)
779 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
781 r->vtype = _xmlchar_to_int (attr);
/* Test for cells outside the column window; a stop_col of -1 means there is
   no upper bound. */
787 if (r->rsd.col < r->spreadsheet.start_col ||
788 (r->spreadsheet.stop_col != -1 && r->rsd.col > r->spreadsheet.stop_col))
/* IDX is the zero-based position of the column within the window. */
791 idx = r->rsd.col - r->spreadsheet.start_col;
/* Grow var_spec so that slot IDX exists; new slots start with no name,
   width -1, no first value and type -1. */
793 if (idx >= n_var_specs)
796 var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
797 for (i = n_var_specs; i <= idx; ++i)
799 var_spec [i].name = NULL;
800 var_spec [i].width = -1;
801 var_spec [i].first_value = NULL;
802 var_spec [i].first_type = -1;
804 n_var_specs = idx + 1 ;
807 var_spec [idx].first_type = type;
/* Text in a row before start_row supplies the variable name when
   read_names is set.  Other text is kept as the first data value, and its
   length (rounded up to SPREADSHEET_DEFAULT_WIDTH) decides the width unless
   opts->asw gives one. */
809 if (r->rsd.node_type == XML_READER_TYPE_TEXT)
811 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
812 const char *text = CHAR_CAST (const char *, value);
814 if (r->rsd.row < r->spreadsheet.start_row)
816 if (opts->read_names)
818 var_spec [idx].name = xstrdup (text);
823 var_spec [idx].first_value = xmlStrdup (value);
825 if (-1 == var_spec [idx].width)
826 var_spec [idx].width = (opts->asw == -1) ?
827 ROUND_UP (strlen(text), SPREADSHEET_DEFAULT_WIDTH) : opts->asw;
/* A cell element in row start_row whose ValueType is missing or is not
   VALUE_STRING gets width 0 (presumably meaning a numeric variable). */
832 else if (r->rsd.node_type == XML_READER_TYPE_ELEMENT
833 && r->rsd.state == STATE_CELL)
835 if (r->rsd.row == r->spreadsheet.start_row)
838 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
840 if (NULL == attr || VALUE_STRING != _xmlchar_to_int (attr))
841 var_spec [idx].width = 0;
/* The dictionary is created with the encoding reported by the XML reader. */
849 const xmlChar *enc = xmlTextReaderConstEncoding (r->rsd.xtr);
852 /* Create the dictionary and populate it */
853 spreadsheet->dict = dict_create (CHAR_CAST (const char *, enc));
856 for (i = 0 ; i < n_var_specs ; ++i)
860 if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
863 /* Probably no data exists for this variable, so allocate a
865 if (var_spec[i].width == -1)
866 var_spec[i].width = SPREADSHEET_DEFAULT_WIDTH;
868 name = dict_make_unique_var_name (r->spreadsheet.dict, var_spec[i].name, &vstart);
869 dict_create_var (r->spreadsheet.dict, name, var_spec[i].width);
873 /* Create the first case, and cache it */
874 r->spreadsheet.used_first_case = false;
876 if (n_var_specs == 0)
878 msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
879 spreadsheet->file_name);
883 r->spreadsheet.proto = caseproto_ref (dict_get_proto (r->spreadsheet.dict));
884 r->spreadsheet.first_case = case_create (r->spreadsheet.proto);
885 case_set_missing (r->spreadsheet.first_case);
/* Fill the first case from the collected first values.  X advances
   separately from I when dictionary variables are fetched, so it indexes
   the dictionary rather than var_spec. */
888 for (i = 0 ; i < n_var_specs ; ++i)
890 const struct variable *var;
892 if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
895 var = dict_get_var (r->spreadsheet.dict, x++);
897 convert_xml_string_to_value (r->spreadsheet.first_case, var,
898 var_spec[i].first_value,
899 var_spec[i].first_type,
/* Release the scratch strings held in var_spec before returning the reader.
   NOTE(review): first_value comes from xmlStrdup() but is released with
   free() -- confirm that the allocators match. */
904 for (i = 0 ; i < n_var_specs ; ++i)
906 free (var_spec[i].first_value);
907 free (var_spec[i].name);
913 return casereader_create_sequential
915 r->spreadsheet.proto,
917 &gnm_file_casereader_class, r);
/* A second cleanup sequence followed by a direct call to the destroy
   callback; presumably the error exit of the function -- TODO confirm. */
921 for (i = 0 ; i < n_var_specs ; ++i)
923 free (var_spec[i].first_value);
924 free (var_spec[i].name);
929 gnm_file_casereader_destroy (NULL, r);
935 /* Reads and returns one case from READER's file. Returns a null
936 pointer on failure. */
/* Casereader read callback listed in gnm_file_casereader_class.  R_ is the
   struct gnumeric_reader given to casereader_create_sequential().  Each call
   consumes the XML nodes of one spreadsheet row from the rsd reader. */
937 static struct ccase *
938 gnm_file_casereader_read (struct casereader *reader UNUSED, void *r_)
943 struct gnumeric_reader *r = r_;
944 int current_row = r->rsd.row;
/* The first case was already built by gnumeric_make_reader(); it is handed
   out once, and the flag records that it no longer belongs to the reader. */
946 if (!r->spreadsheet.used_first_case)
948 r->spreadsheet.used_first_case = true;
949 return r->spreadsheet.first_case;
/* Every later case starts out with all values missing. */
952 c = case_create (r->spreadsheet.proto);
953 case_set_missing (c);
/* When no start column was given (-1), use the smallest column seen so far
   by the rsd reader. */
955 if (r->spreadsheet.start_col == -1)
956 r->spreadsheet.start_col = r->rsd.min_col;
/* Consume nodes for as long as the reader stays on the row it was on at
   entry and remains inside the cell array. */
959 while ((r->rsd.state == STATE_CELL || r->rsd.state == STATE_CELLS_START)
960 && r->rsd.row == current_row && (ret = xmlTextReaderRead (r->rsd.xtr)))
962 process_node (r, &r->rsd);
/* Remember the ValueType of each cell element for the text node that
   follows it. */
964 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_ELEMENT)
967 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
969 r->vtype = _xmlchar_to_int (attr);
/* Tests for cells that lie outside the column window, beyond the number of
   variables in the case prototype, or past stop_row (-1 means no limit). */
974 if (r->rsd.col < r->spreadsheet.start_col || (r->spreadsheet.stop_col != -1 &&
975 r->rsd.col > r->spreadsheet.stop_col))
978 if (r->rsd.col - r->spreadsheet.start_col >= caseproto_get_n_widths (r->spreadsheet.proto))
981 if (r->spreadsheet.stop_row != -1 && r->rsd.row > r->spreadsheet.stop_row)
/* A text node carries the cell value; it is converted according to the
   ValueType remembered in r->vtype and stored in the variable whose
   dictionary index is the column offset from start_col. */
985 if (r->rsd.node_type == XML_READER_TYPE_TEXT)
987 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
988 const int idx = r->rsd.col - r->spreadsheet.start_col;
989 const struct variable *var = dict_get_var (r->spreadsheet.dict, idx);
991 convert_xml_string_to_value (c, var, value, r->vtype,
992 r->rsd.col, r->rsd.row);
/* Opens a gnumeric file for reading, either for the first time or again.
   The assert requires that R or FILENAME be null.  One path opens FILENAME,
   allocates a zeroed reader and installs the spreadsheet callbacks on it;
   the other opens the file already named in R and takes another reference
   on R.  When SHOW_ERRORS is false, libxml2 errors and warnings are
   suppressed through the parser options.
   The reader state is then advanced until STATE_INIT is reached; if that
   fails the spreadsheet reference is dropped. */
1007 static struct gnumeric_reader *
1008 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors)
1011 struct state_data *sd;
1013 xmlTextReaderPtr xtr;
1016 assert (r == NULL || filename == NULL);
1020 gz = gzopen (filename, "r");
1024 gz = gzopen (r->spreadsheet.file_name, "r");
/* New reader: zero-initialised, so fields not set below start at 0 or NULL.
   NOTE(review): the strdup() result is not checked here, and strcpy() into
   s->type relies on that array being large enough for the 3-letter tag plus
   terminator -- TODO confirm the size of type. */
1032 r = xzalloc (sizeof *r);
1034 r->spreadsheet.file_name = strdup (filename);
1035 struct spreadsheet *s = SPREADSHEET_CAST (r);
1036 strcpy (s->type, "GNM");
1037 s->destroy = gnumeric_destroy;
1038 s->make_reader = gnumeric_make_reader;
1039 s->get_sheet_name = gnumeric_get_sheet_name;
1040 s->get_sheet_range = gnumeric_get_sheet_range;
1041 s->get_sheet_n_sheets = gnumeric_get_sheet_n_sheets;
1042 s->get_sheet_n_rows = gnumeric_get_sheet_n_rows;
1043 s->get_sheet_n_columns = gnumeric_get_sheet_n_columns;
1044 s->get_sheet_cell = gnumeric_get_sheet_cell;
1047 hmap_init (&r->cache);
/* Existing reader: take an additional reference instead of allocating. */
1055 r = (struct gnumeric_reader *) spreadsheet_ref (SPREADSHEET_CAST (r));
/* The XML reader pulls its input through gzread() and closes the stream
   with gzclose(). */
1058 xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
1059 (xmlInputCloseCallback) gzclose, gz,
1061 show_errors ? 0 : (XML_PARSE_NOERROR | XML_PARSE_NOWARNING));
/* Parse problems are routed to gnumeric_error_handler with R as context. */
1071 xmlTextReaderSetErrorHandler (xtr, gnumeric_error_handler, r);
1073 sd->row = sd->col = -1;
1074 sd->state = STATE_PRE_INIT;
/* Start with no particular sheet selected. */
1078 r->target_sheet_name = NULL;
1079 r->target_sheet_index = -1;
1082 /* Advance to the start of the workbook.
1083 This gives us some confidence that we are actually dealing with a gnumeric
1086 while ((sd->state != STATE_INIT)
1087 && 1 == (ret = xmlTextReaderRead (sd->xtr)))
1089 process_node (r, sd);
1094 /* Does not seem to be a gnumeric file */
1095 spreadsheet_unref (&r->spreadsheet);
/* Warn when the document declares an encoding other than UTF-8. */
1101 const xmlChar *enc = xmlTextReaderConstEncoding (sd->xtr);
1102 xmlCharEncoding xce = xmlParseCharEncoding (CHAR_CAST (const char *, enc));
1104 if (XML_CHAR_ENCODING_UTF8 != xce)
1106 /* I have been told that ALL gnumeric files are UTF8 encoded. If that is correct, this
1107 can never happen. */
1108 msg (MW, _("The gnumeric file `%s' is encoded as %s instead of the usual UTF-8 encoding. "
1109 "Any non-ascii characters will be incorrectly imported."),
1110 r->spreadsheet.file_name,
/* Public entry point: tries to open FILENAME as a gnumeric file by calling
   gnumeric_reopen() with no existing reader, and returns the address of the
   embedded spreadsheet member.  REPORT_ERRORS is forwarded as the
   show_errors argument.
   NOTE(review): if gnumeric_reopen() can return a null pointer, the
   expression below takes the address of a member through it; verify how
   failure is meant to be reported to the caller. */
1119 struct spreadsheet *
1120 gnumeric_probe (const char *filename, bool report_errors)
1122 struct gnumeric_reader *r = gnumeric_reopen (NULL, filename, report_errors);
1124 return &r->spreadsheet;