1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/datasheet.h>
24 #include <data/casereader-provider.h>
25 #include <data/casereader.h>
26 #include <data/casewriter.h>
27 #include <data/lazy-casereader.h>
28 #include <data/settings.h>
29 #include <libpspp/array.h>
30 #include <libpspp/assertion.h>
31 #include <libpspp/misc.h>
32 #include <libpspp/range-map.h>
33 #include <libpspp/range-set.h>
34 #include <libpspp/sparse-xarray.h>
35 #include <libpspp/taint.h>
36 #include <libpspp/tower.h>
44 static struct axis *axis_create (void);
45 static struct axis *axis_clone (const struct axis *);
46 static void axis_destroy (struct axis *);
48 static void axis_hash (const struct axis *, struct md4_ctx *);
50 static bool axis_allocate (struct axis *, unsigned long int request,
51 unsigned long int *start,
52 unsigned long int *width);
53 static void axis_make_available (struct axis *,
54 unsigned long int start,
55 unsigned long int width);
56 static unsigned long int axis_extend (struct axis *, unsigned long int width);
58 static unsigned long int axis_map (const struct axis *, unsigned long log_pos);
60 static unsigned long axis_get_size (const struct axis *);
61 static void axis_insert (struct axis *,
62 unsigned long int log_start,
63 unsigned long int phy_start,
64 unsigned long int cnt);
65 static void axis_remove (struct axis *,
66 unsigned long int start, unsigned long int cnt);
67 static void axis_move (struct axis *,
68 unsigned long int old_start,
69 unsigned long int new_start,
70 unsigned long int cnt);
72 static struct source *source_create_empty (size_t n_bytes);
73 static struct source *source_create_casereader (struct casereader *);
74 static struct source *source_clone (const struct source *);
75 static void source_destroy (struct source *);
77 static casenumber source_get_backing_n_rows (const struct source *);
79 static int source_allocate_column (struct source *, int width);
80 static void source_release_column (struct source *, int ofs, int width);
81 static bool source_in_use (const struct source *);
83 static bool source_read (const struct column *, casenumber row, union value *);
84 static bool source_write (const struct column *, casenumber row,
86 static bool source_write_column (struct column *, const union value *);
87 static bool source_has_backing (const struct source *);
89 static int get_source_index (const struct datasheet *ds, const struct source *source);
91 /* A datasheet is internally composed from a set of data files,
92 called "sources". The sources that make up a datasheet must
93 have the same number of rows (cases), but their numbers of
94 columns (variables) may vary.
96 A datasheet's external view is produced by mapping (permuting
97 and selecting) its internal data. Thus, we can rearrange or
98 delete rows or columns simply by modifying the mapping. We
99 add rows by adding rows to each source and to the row mapping.
100 We add columns by adding a new source, then adding that source
101 to the column mapping.
103 Each source in a datasheet can be a casereader or a
104 sparse_xarray. Casereaders are read-only, so when sources
105 made from casereaders need to be modified, it is done
106 "virtually" through being overlaid by a sparse_xarray. */
109 struct range_set *avail; /* Free bytes are set to 1s. */
110 struct sparse_xarray *data; /* Data at top level, atop the backing. */
111 struct casereader *backing; /* Backing casereader (or null). */
112 casenumber backing_rows; /* Number of rows in backing (if backed). */
113 size_t n_used; /* Number of column in use (if backed). */
116 /* A logical column. */
119 struct source *source; /* Source of the underlying physical column. */
120 int value_ofs; /* If 'source' has a backing casereader,
121 column's value offset in its cases. */
122 int byte_ofs; /* Byte offset in source's sparse_xarray. */
123 int width; /* 0=numeric, otherwise string width. */
130 struct source **sources; /* Sources, in no particular order. */
131 size_t n_sources; /* Number of sources. */
134 struct caseproto *proto; /* Prototype for rows (initialized lazily). */
135 struct column *columns; /* Logical to physical column mapping. */
136 size_t n_columns; /* Number of logical columns. */
137 unsigned column_min_alloc; /* Min. # of columns to put in a new source. */
140 struct axis *rows; /* Logical to physical row mapping. */
143 struct taint *taint; /* Indicates corrupted data. */
146 /* Is this operation a read or a write? */
153 static void allocate_column (struct datasheet *, int width, struct column *);
154 static void release_source (struct datasheet *, struct source *);
155 static bool rw_case (struct datasheet *ds, enum rw_op op,
156 casenumber lrow, size_t start_column, size_t n_columns,
159 /* Returns the number of bytes needed to store a value with the
160 given WIDTH on disk. */
162 width_to_n_bytes (int width)
164 return width == 0 ? sizeof (double) : width;
167 /* Returns the address of the data in VALUE (for reading or
168 writing to/from disk). VALUE must have the given WIDTH. */
170 value_to_data (const union value *value_, int width)
172 union value *value = (union value *) value_;
173 assert (sizeof value->f == sizeof (double));
177 return value_str_rw (value, width);
180 /* Returns the number of bytes needed to store all the values in
183 caseproto_to_n_bytes (const struct caseproto *proto)
189 for (i = 0; i < caseproto_get_n_widths (proto); i++)
191 int width = caseproto_get_width (proto, i);
193 n_bytes += width_to_n_bytes (width);
198 /* Creates and returns a new datasheet.
200 If READER is nonnull, then the datasheet initially contains
201 the contents of READER. */
203 datasheet_create (struct casereader *reader)
205 struct datasheet *ds = xmalloc (sizeof *ds);
211 ds->column_min_alloc = 8;
212 ds->rows = axis_create ();
213 ds->taint = taint_create ();
221 taint_propagate (casereader_get_taint (reader), ds->taint);
223 ds->proto = caseproto_ref (casereader_get_proto (reader));
225 ds->sources = xmalloc (sizeof *ds->sources);
226 ds->sources[0] = source_create_casereader (reader);
229 ds->n_columns = caseproto_get_n_widths (ds->proto);
230 ds->columns = xnmalloc (ds->n_columns, sizeof *ds->columns);
232 for (i = 0; i < ds->n_columns; i++)
234 struct column *column = &ds->columns[i];
235 int width = caseproto_get_width (ds->proto, i);
236 column->source = ds->sources[0];
237 column->width = width;
240 column->value_ofs = i;
241 column->byte_ofs = byte_ofs;
242 byte_ofs += width_to_n_bytes (column->width);
246 n_rows = source_get_backing_n_rows (ds->sources[0]);
248 axis_insert (ds->rows, 0, axis_extend (ds->rows, n_rows), n_rows);
254 /* Destroys datasheet DS. */
256 datasheet_destroy (struct datasheet *ds)
263 for (i = 0; i < ds->n_sources; i++)
264 source_destroy (ds->sources[i]);
266 caseproto_unref (ds->proto);
268 axis_destroy (ds->rows);
269 taint_destroy (ds->taint);
273 /* Returns the prototype for the cases in DS. The caller must
274 not unref the returned prototype. */
275 const struct caseproto *
276 datasheet_get_proto (const struct datasheet *ds_)
278 struct datasheet *ds = (struct datasheet *) ds_;
279 if (ds->proto == NULL)
283 ds->proto = caseproto_create ();
284 for (i = 0; i < ds->n_columns; i++)
285 ds->proto = caseproto_add_width (ds->proto, ds->columns[i].width);
290 /* Returns the width of the given COLUMN within DS.
291 COLUMN must be less than the number of columns in DS. */
293 datasheet_get_column_width (const struct datasheet *ds, size_t column)
295 assert (column < datasheet_get_n_columns (ds));
296 return ds->columns[column].width;
299 /* Moves datasheet DS to a new location in memory, and returns
300 the new location. Afterward, the datasheet must not be
301 accessed at its former location.
303 This function is useful for ensuring that all references to a
304 datasheet have been dropped, especially in conjunction with
305 tools like Valgrind. */
307 datasheet_rename (struct datasheet *ds)
309 struct datasheet *new = xmemdup (ds, sizeof *ds);
314 /* Returns true if datasheet DS is tainted.
315 A datasheet is tainted by an I/O error or by taint
316 propagation to the datasheet. */
318 datasheet_error (const struct datasheet *ds)
320 return taint_is_tainted (ds->taint);
323 /* Marks datasheet DS tainted. */
325 datasheet_force_error (struct datasheet *ds)
327 taint_set_taint (ds->taint);
330 /* Returns datasheet DS's taint object. */
332 datasheet_get_taint (const struct datasheet *ds)
337 /* Returns the number of rows in DS. */
339 datasheet_get_n_rows (const struct datasheet *ds)
341 return axis_get_size (ds->rows);
344 /* Returns the number of columns in DS. */
346 datasheet_get_n_columns (const struct datasheet *ds)
348 return ds->n_columns;
351 /* Inserts a column of the given WIDTH into datasheet DS just
352 before column BEFORE. Initializes the contents of each row in
353 the inserted column to VALUE (which must have width WIDTH).
355 Returns true if successful, false on failure. In case of
356 failure, the datasheet is unchanged. */
358 datasheet_insert_column (struct datasheet *ds,
359 const union value *value, int width, size_t before)
363 ds->columns = xnrealloc (ds->columns,
364 ds->n_columns + 1, sizeof *ds->columns);
365 insert_element (ds->columns, ds->n_columns, sizeof *ds->columns, before);
366 col = &ds->columns[before];
369 allocate_column (ds, width, col);
371 if (width >= 0 && !source_write_column (col, value))
373 datasheet_delete_columns (ds, before, 1);
374 taint_set_taint (ds->taint);
381 /* Deletes the N columns in DS starting from column START. */
383 datasheet_delete_columns (struct datasheet *ds, size_t start, size_t n)
389 for (i = start; i < start + n; i++)
391 struct column *column = &ds->columns[i];
392 struct source *source = column->source;
393 source_release_column (source, column->byte_ofs, column->width);
394 release_source (ds, source);
397 remove_range (ds->columns, ds->n_columns, sizeof *ds->columns, start, n);
400 caseproto_unref (ds->proto);
405 /* Moves the N columns in DS starting at position OLD_START so
406 that they then start at position NEW_START. Equivalent to
407 deleting the column rows, then inserting them at what becomes
408 position NEW_START after the deletion. */
410 datasheet_move_columns (struct datasheet *ds,
411 size_t old_start, size_t new_start,
414 move_range (ds->columns, ds->n_columns, sizeof *ds->columns,
415 old_start, new_start, n);
417 caseproto_unref (ds->proto);
421 struct resize_datasheet_value_aux
423 union value src_value;
427 void (*resize_cb) (const union value *, union value *, void *aux);
430 union value dst_value;
436 resize_datasheet_value (const void *src, void *dst, void *aux_)
438 struct resize_datasheet_value_aux *aux = aux_;
440 memcpy (value_to_data (&aux->src_value, aux->src_width),
441 (uint8_t *) src + aux->src_ofs,
442 width_to_n_bytes (aux->src_width));
444 aux->resize_cb (&aux->src_value, &aux->dst_value, aux->resize_cb_aux);
446 memcpy ((uint8_t *) dst + aux->dst_ofs,
447 value_to_data (&aux->dst_value, aux->dst_width),
448 width_to_n_bytes (aux->dst_width));
454 datasheet_resize_column (struct datasheet *ds, size_t column, int new_width,
455 void (*resize_cb) (const union value *,
456 union value *, void *aux),
459 /* XXX needs a test. */
460 struct column old_col;
464 assert (column < datasheet_get_n_columns (ds));
466 col = &ds->columns[column];
468 old_width = old_col.width;
470 if (old_width == new_width)
472 /* FIXME: for consistency, we should call resize_cb() on
475 else if (new_width == -1)
477 datasheet_delete_columns (ds, column, 1);
478 datasheet_insert_column (ds, NULL, -1, column);
480 else if (old_width == -1)
483 value_init (&value, new_width);
484 value_set_missing (&value, new_width);
485 if (resize_cb != NULL)
486 resize_cb (NULL, &value, resize_cb_aux);
487 datasheet_delete_columns (ds, column, 1);
488 datasheet_insert_column (ds, &value, new_width, column);
489 value_destroy (&value, new_width);
491 else if (source_has_backing (col->source))
493 unsigned long int n_rows = axis_get_size (ds->rows);
494 union value src, dst;
497 source_release_column (col->source, col->byte_ofs, col->width);
498 allocate_column (ds, new_width, col);
500 value_init (&src, old_width);
501 value_init (&dst, new_width);
502 for (row = 0; row < n_rows; row++)
504 if (!source_read (&old_col, row, &src))
506 /* FIXME: back out col changes. */
509 resize_cb (&src, &dst, resize_cb_aux);
510 if (!source_write (col, row, &dst))
512 /* FIXME: back out col changes. */
517 release_source (ds, old_col.source);
521 struct resize_datasheet_value_aux aux;
523 source_release_column (col->source, col->byte_ofs, col->width);
524 allocate_column (ds, new_width, col);
526 value_init (&aux.src_value, old_col.width);
527 aux.src_ofs = old_col.byte_ofs;
528 aux.src_width = old_col.width;
529 aux.resize_cb = resize_cb;
530 aux.resize_cb_aux = resize_cb_aux;
531 value_init (&aux.dst_value, new_width);
532 aux.dst_ofs = col->byte_ofs;
533 aux.dst_width = new_width;
534 sparse_xarray_copy (old_col.source->data, col->source->data,
535 resize_datasheet_value, &aux);
536 value_destroy (&aux.src_value, old_width);
537 value_destroy (&aux.dst_value, new_width);
539 release_source (ds, old_col.source);
544 /* Retrieves and returns the contents of the given ROW in
545 datasheet DS. The caller owns the returned case and must
546 unref it when it is no longer needed. Returns a null pointer
549 datasheet_get_row (const struct datasheet *ds, casenumber row)
551 size_t n_columns = datasheet_get_n_columns (ds);
552 struct ccase *c = case_create (datasheet_get_proto (ds));
553 if (rw_case ((struct datasheet *) ds, OP_READ,
554 row, 0, n_columns, case_data_all_rw (c)))
563 /* Stores the contents of case C, which is destroyed, into the
564 given ROW in DS. Returns true on success, false on I/O error.
565 On failure, the given ROW might be partially modified or
568 datasheet_put_row (struct datasheet *ds, casenumber row, struct ccase *c)
570 size_t n_columns = datasheet_get_n_columns (ds);
571 bool ok = rw_case (ds, OP_WRITE, row, 0, n_columns,
572 (union value *) case_data_all (c));
577 /* Stores the values of COLUMN in DS in the given ROW in DS into
578 VALUE. The caller must have already initialized VALUE as a
579 value of the appropriate width (as returned by
580 datasheet_get_column_width (DS, COLUMN)). Returns true if
581 successful, false on I/O error. */
583 datasheet_get_value (const struct datasheet *ds, casenumber row,
584 size_t column, union value *value)
587 return rw_case ((struct datasheet *) ds, OP_READ, row, column, 1, value);
590 /* Stores VALUE into DS in the given ROW and COLUMN. VALUE must
591 have the correct width for COLUMN (as returned by
592 datasheet_get_column_width (DS, COLUMN)). Returns true if
593 successful, false on I/O error. On failure, ROW might be
594 partially modified or corrupted. */
596 datasheet_put_value (struct datasheet *ds UNUSED, casenumber row UNUSED,
597 size_t column UNUSED, const union value *value UNUSED)
599 return rw_case (ds, OP_WRITE, row, column, 1, (union value *) value);
602 /* Inserts the CNT cases at C into datasheet DS just before row
603 BEFORE. Returns true if successful, false on I/O error. On
604 failure, datasheet DS is not modified.
606 Regardless of success, this function unrefs all of the cases
609 datasheet_insert_rows (struct datasheet *ds,
610 casenumber before, struct ccase *c[],
613 casenumber added = 0;
616 unsigned long first_phy;
617 unsigned long phy_cnt;
620 /* Allocate physical rows from the pool of available
622 if (!axis_allocate (ds->rows, cnt, &first_phy, &phy_cnt))
624 /* No rows were available. Extend the row axis to make
625 some new ones available. */
627 first_phy = axis_extend (ds->rows, cnt);
630 /* Insert the new rows into the row mapping. */
631 axis_insert (ds->rows, before, first_phy, phy_cnt);
633 /* Initialize the new rows. */
634 for (i = 0; i < phy_cnt; i++)
635 if (!datasheet_put_row (ds, before + i, c[i]))
639 datasheet_delete_rows (ds, before - added, phy_cnt + added);
652 /* Deletes the CNT rows in DS starting from row FIRST. */
654 datasheet_delete_rows (struct datasheet *ds,
655 casenumber first, casenumber cnt)
659 /* Free up rows for reuse.
661 for (lrow = first; lrow < first + cnt; lrow++)
662 axis_make_available (ds->rows, axis_map (ds->rows, lrow), 1);
664 /* Remove rows from logical-to-physical mapping. */
665 axis_remove (ds->rows, first, cnt);
668 /* Moves the CNT rows in DS starting at position OLD_START so
669 that they then start at position NEW_START. Equivalent to
670 deleting the given rows, then inserting them at what becomes
671 position NEW_START after the deletion. */
673 datasheet_move_rows (struct datasheet *ds,
674 size_t old_start, size_t new_start,
677 axis_move (ds->rows, old_start, new_start, cnt);
680 static const struct casereader_random_class datasheet_reader_class;
682 /* Creates and returns a casereader whose input cases are the
683 rows in datasheet DS. From the caller's perspective, DS is
684 effectively destroyed by this operation, such that the caller
685 must not reference it again. */
687 datasheet_make_reader (struct datasheet *ds)
689 struct casereader *reader;
690 ds = datasheet_rename (ds);
691 reader = casereader_create_random (datasheet_get_proto (ds),
692 datasheet_get_n_rows (ds),
693 &datasheet_reader_class, ds);
694 taint_propagate (datasheet_get_taint (ds), casereader_get_taint (reader));
698 /* "read" function for the datasheet random casereader. */
699 static struct ccase *
700 datasheet_reader_read (struct casereader *reader UNUSED, void *ds_,
703 struct datasheet *ds = ds_;
704 if (case_idx < datasheet_get_n_rows (ds))
706 struct ccase *c = datasheet_get_row (ds, case_idx);
708 taint_set_taint (ds->taint);
715 /* "destroy" function for the datasheet random casereader. */
717 datasheet_reader_destroy (struct casereader *reader UNUSED, void *ds_)
719 struct datasheet *ds = ds_;
720 datasheet_destroy (ds);
723 /* "advance" function for the datasheet random casereader. */
725 datasheet_reader_advance (struct casereader *reader UNUSED, void *ds_,
728 struct datasheet *ds = ds_;
729 datasheet_delete_rows (ds, 0, case_cnt);
732 /* Random casereader class for a datasheet. */
733 static const struct casereader_random_class datasheet_reader_class =
735 datasheet_reader_read,
736 datasheet_reader_destroy,
737 datasheet_reader_advance,
741 allocate_column (struct datasheet *ds, int width, struct column *column)
743 caseproto_unref (ds->proto);
746 column->value_ofs = -1;
747 column->width = width;
753 n_bytes = width_to_n_bytes (width);
754 for (i = 0; i < ds->n_sources; i++)
756 column->source = ds->sources[i];
757 column->byte_ofs = source_allocate_column (column->source, n_bytes);
758 if (column->byte_ofs >= 0)
762 column->source = source_create_empty (MAX (n_bytes,
763 ds->column_min_alloc));
764 ds->sources = xnrealloc (ds->sources,
765 ds->n_sources + 1, sizeof *ds->sources);
766 ds->sources[ds->n_sources++] = column->source;
768 ds->column_min_alloc = MIN (65536, ds->column_min_alloc * 2);
770 column->byte_ofs = source_allocate_column (column->source, n_bytes);
771 assert (column->byte_ofs >= 0);
775 column->source = NULL;
776 column->byte_ofs = -1;
781 release_source (struct datasheet *ds, struct source *source)
783 if (source_has_backing (source) && !source_in_use (source))
785 /* Since only the first source to be added ever
786 has a backing, this source must have index
788 assert (source == ds->sources[0]);
789 ds->sources[0] = ds->sources[--ds->n_sources];
790 source_destroy (source);
794 /* Reads (if OP is OP_READ) or writes (if op is OP_WRITE) the
795 N_COLUMNS columns starting from column START_COLUMN in row
796 LROW to/from the N_COLUMNS values in DATA. */
798 rw_case (struct datasheet *ds, enum rw_op op,
799 casenumber lrow, size_t start_column, size_t n_columns,
805 assert (lrow < datasheet_get_n_rows (ds));
806 assert (n_columns <= datasheet_get_n_columns (ds));
807 assert (start_column + n_columns <= datasheet_get_n_columns (ds));
809 prow = axis_map (ds->rows, lrow);
810 for (i = 0; i < n_columns; i++)
812 struct column *c = &ds->columns[start_column + i];
818 ok = source_read (c, prow, &data[i]);
820 ok = source_write (c, prow, &data[i]);
824 taint_set_taint (ds->taint);
834 An axis has two functions. First, it maintains a mapping from
835 logical (client-visible) to physical (storage) ordinates. The
836 axis_map and axis_get_size functions inspect this mapping, and
837 the axis_insert, axis_remove, and axis_move functions modify
838 it. Second, it tracks the set of ordinates that are unused
839 and available for reuse. The axis_allocate,
840 axis_make_available, and axis_extend functions affect the set
841 of available ordinates. */
844 struct tower log_to_phy; /* Map from logical to physical ordinates;
845 contains "struct axis_group"s. */
846 struct range_set *available; /* Set of unused, available ordinates. */
847 unsigned long int phy_size; /* Current physical length of axis. */
850 /* A mapping from logical to physical ordinates. */
853 struct tower_node logical; /* Range of logical ordinates. */
854 unsigned long phy_start; /* First corresponding physical ordinate. */
857 static struct axis_group *axis_group_from_tower_node (struct tower_node *);
858 static struct tower_node *make_axis_group (unsigned long int phy_start);
859 static struct tower_node *split_axis (struct axis *, unsigned long int where);
860 static void merge_axis_nodes (struct axis *, struct tower_node *,
861 struct tower_node **other_node);
862 static void check_axis_merged (const struct axis *axis UNUSED);
864 /* Creates and returns a new, initially empty axis. */
868 struct axis *axis = xmalloc (sizeof *axis);
869 tower_init (&axis->log_to_phy);
870 axis->available = range_set_create ();
875 /* Returns a clone of existing axis OLD.
877 Currently this is used only by the datasheet model checker
878 driver, but it could be otherwise useful. */
880 axis_clone (const struct axis *old)
882 const struct tower_node *node;
885 new = xmalloc (sizeof *new);
886 tower_init (&new->log_to_phy);
887 new->available = range_set_clone (old->available, NULL);
888 new->phy_size = old->phy_size;
890 for (node = tower_first (&old->log_to_phy); node != NULL;
891 node = tower_next (&old->log_to_phy, node))
893 unsigned long int size = tower_node_get_size (node);
894 struct axis_group *group = tower_data (node, struct axis_group, logical);
895 tower_insert (&new->log_to_phy, size, make_axis_group (group->phy_start),
902 /* Adds the state of AXIS to the MD4 hash context CTX.
904 This is only used by the datasheet model checker driver. It
905 is unlikely to be otherwise useful. */
907 axis_hash (const struct axis *axis, struct md4_ctx *ctx)
909 const struct tower_node *tn;
910 const struct range_set_node *rsn;
912 for (tn = tower_first (&axis->log_to_phy); tn != NULL;
913 tn = tower_next (&axis->log_to_phy, tn))
915 struct axis_group *group = tower_data (tn, struct axis_group, logical);
916 unsigned long int phy_start = group->phy_start;
917 unsigned long int size = tower_node_get_size (tn);
919 md4_process_bytes (&phy_start, sizeof phy_start, ctx);
920 md4_process_bytes (&size, sizeof size, ctx);
923 for (rsn = range_set_first (axis->available); rsn != NULL;
924 rsn = range_set_next (axis->available, rsn))
926 unsigned long int start = range_set_node_get_start (rsn);
927 unsigned long int end = range_set_node_get_end (rsn);
929 md4_process_bytes (&start, sizeof start, ctx);
930 md4_process_bytes (&end, sizeof end, ctx);
933 md4_process_bytes (&axis->phy_size, sizeof axis->phy_size, ctx);
938 axis_destroy (struct axis *axis)
943 while (!tower_is_empty (&axis->log_to_phy))
945 struct tower_node *node = tower_first (&axis->log_to_phy);
946 struct axis_group *group = tower_data (node, struct axis_group,
948 tower_delete (&axis->log_to_phy, node);
952 range_set_destroy (axis->available);
956 /* Allocates up to REQUEST contiguous unused and available
957 ordinates from AXIS. If successful, stores the number
958 obtained into *WIDTH and the ordinate of the first into
959 *START, marks the ordinates as now unavailable return true.
960 On failure, which occurs only if AXIS has no unused and
961 available ordinates, returns false without modifying AXIS. */
963 axis_allocate (struct axis *axis, unsigned long int request,
964 unsigned long int *start, unsigned long int *width)
966 return range_set_allocate (axis->available, request, start, width);
969 /* Marks the WIDTH contiguous ordinates in AXIS, starting from
970 START, as unused and available. */
972 axis_make_available (struct axis *axis,
973 unsigned long int start, unsigned long int width)
975 range_set_insert (axis->available, start, width);
978 /* Extends the total physical length of AXIS by WIDTH and returns
979 the first ordinate in the new physical region. */
980 static unsigned long int
981 axis_extend (struct axis *axis, unsigned long int width)
983 unsigned long int start = axis->phy_size;
984 axis->phy_size += width;
988 /* Returns the physical ordinate in AXIS corresponding to logical
989 ordinate LOG_POS. LOG_POS must be less than the logical
991 static unsigned long int
992 axis_map (const struct axis *axis, unsigned long log_pos)
994 struct tower_node *node;
995 struct axis_group *group;
996 unsigned long int group_start;
998 node = tower_lookup (&axis->log_to_phy, log_pos, &group_start);
999 group = tower_data (node, struct axis_group, logical);
1000 return group->phy_start + (log_pos - group_start);
1003 /* Returns the logical length of AXIS. */
1004 static unsigned long
1005 axis_get_size (const struct axis *axis)
1007 return tower_height (&axis->log_to_phy);
1010 /* Inserts the CNT contiguous physical ordinates starting at
1011 PHY_START into AXIS's logical-to-physical mapping, starting at
1012 logical position LOG_START. */
1014 axis_insert (struct axis *axis,
1015 unsigned long int log_start, unsigned long int phy_start,
1016 unsigned long int cnt)
1018 struct tower_node *before = split_axis (axis, log_start);
1019 struct tower_node *new = make_axis_group (phy_start);
1020 tower_insert (&axis->log_to_phy, cnt, new, before);
1021 merge_axis_nodes (axis, new, NULL);
1022 check_axis_merged (axis);
1025 /* Removes CNT ordinates from AXIS's logical-to-physical mapping
1026 starting at logical position START. */
1028 axis_remove (struct axis *axis,
1029 unsigned long int start, unsigned long int cnt)
1033 struct tower_node *last = split_axis (axis, start + cnt);
1034 struct tower_node *cur, *next;
1035 for (cur = split_axis (axis, start); cur != last; cur = next)
1037 next = tower_delete (&axis->log_to_phy, cur);
1038 free (axis_group_from_tower_node (cur));
1040 merge_axis_nodes (axis, last, NULL);
1041 check_axis_merged (axis);
1045 /* Moves the CNT ordinates in AXIS's logical-to-mapping starting
1046 at logical position OLD_START so that they then start at
1047 position NEW_START. */
1049 axis_move (struct axis *axis,
1050 unsigned long int old_start, unsigned long int new_start,
1051 unsigned long int cnt)
1053 if (cnt > 0 && old_start != new_start)
1055 struct tower_node *old_first, *old_last, *new_first;
1056 struct tower_node *merge1, *merge2;
1057 struct tower tmp_array;
1059 /* Move ordinates OLD_START...(OLD_START + CNT) into new,
1060 separate TMP_ARRAY. */
1061 old_first = split_axis (axis, old_start);
1062 old_last = split_axis (axis, old_start + cnt);
1063 tower_init (&tmp_array);
1064 tower_splice (&tmp_array, NULL,
1065 &axis->log_to_phy, old_first, old_last);
1066 merge_axis_nodes (axis, old_last, NULL);
1067 check_axis_merged (axis);
1069 /* Move TMP_ARRAY to position NEW_START. */
1070 new_first = split_axis (axis, new_start);
1071 merge1 = tower_first (&tmp_array);
1072 merge2 = tower_last (&tmp_array);
1073 if (merge2 == merge1)
1075 tower_splice (&axis->log_to_phy, new_first, &tmp_array, old_first, NULL);
1076 merge_axis_nodes (axis, merge1, &merge2);
1077 merge_axis_nodes (axis, merge2, NULL);
1078 check_axis_merged (axis);
1082 /* Returns the axis_group in which NODE is embedded. */
1083 static struct axis_group *
1084 axis_group_from_tower_node (struct tower_node *node)
1086 return tower_data (node, struct axis_group, logical);
1089 /* Creates and returns a new axis_group at physical position
1091 static struct tower_node *
1092 make_axis_group (unsigned long phy_start)
1094 struct axis_group *group = xmalloc (sizeof *group);
1095 group->phy_start = phy_start;
1096 return &group->logical;
1099 /* Returns the tower_node in AXIS's logical-to-physical map whose
1100 bottom edge is at exact level WHERE. If there is no such
1101 tower_node in AXIS's logical-to-physical map, then split_axis
1102 creates one by breaking an existing tower_node into two
1103 separate ones, unless WHERE is equal to the tower height, in
1104 which case it simply returns a null pointer. */
1105 static struct tower_node *
1106 split_axis (struct axis *axis, unsigned long int where)
1108 unsigned long int group_start;
1109 struct tower_node *group_node;
1110 struct axis_group *group;
1112 assert (where <= tower_height (&axis->log_to_phy));
1113 if (where >= tower_height (&axis->log_to_phy))
1116 group_node = tower_lookup (&axis->log_to_phy, where, &group_start);
1117 group = axis_group_from_tower_node (group_node);
1118 if (where > group_start)
1120 unsigned long int size_1 = where - group_start;
1121 unsigned long int size_2 = tower_node_get_size (group_node) - size_1;
1122 struct tower_node *next = tower_next (&axis->log_to_phy, group_node);
1123 struct tower_node *new = make_axis_group (group->phy_start + size_1);
1124 tower_resize (&axis->log_to_phy, group_node, size_1);
1125 tower_insert (&axis->log_to_phy, size_2, new, next);
1129 return &group->logical;
1132 /* Within AXIS, attempts to merge NODE (or the last node in AXIS,
1133 if NODE is null) with its neighbor nodes. This is possible
1134 when logically adjacent nodes are also adjacent physically (in
1137 When a merge occurs, and OTHER_NODE is non-null and points to
1138 the node to be deleted, this function also updates
1139 *OTHER_NODE, if necessary, to ensure that it remains a valid
1142 merge_axis_nodes (struct axis *axis, struct tower_node *node,
1143 struct tower_node **other_node)
1145 struct tower *t = &axis->log_to_phy;
1146 struct axis_group *group;
1147 struct tower_node *next, *prev;
1149 /* Find node to potentially merge with neighbors. */
1151 node = tower_last (t);
1154 group = axis_group_from_tower_node (node);
1156 /* Try to merge NODE with successor. */
1157 next = tower_next (t, node);
1160 struct axis_group *next_group = axis_group_from_tower_node (next);
1161 unsigned long this_height = tower_node_get_size (node);
1163 if (group->phy_start + this_height == next_group->phy_start)
1165 unsigned long next_height = tower_node_get_size (next);
1166 tower_resize (t, node, this_height + next_height);
1167 if (other_node != NULL && *other_node == next)
1168 *other_node = tower_next (t, *other_node);
1169 tower_delete (t, next);
1174 /* Try to merge NODE with predecessor. */
1175 prev = tower_prev (t, node);
1178 struct axis_group *prev_group = axis_group_from_tower_node (prev);
1179 unsigned long prev_height = tower_node_get_size (prev);
1181 if (prev_group->phy_start + prev_height == group->phy_start)
1183 unsigned long this_height = tower_node_get_size (node);
1184 group->phy_start = prev_group->phy_start;
1185 tower_resize (t, node, this_height + prev_height);
1186 if (other_node != NULL && *other_node == prev)
1187 *other_node = tower_next (t, *other_node);
1188 tower_delete (t, prev);
1194 /* Verify that all potentially merge-able nodes in AXIS are
1197 check_axis_merged (const struct axis *axis UNUSED)
1199 #if ASSERT_LEVEL >= 10
1200 struct tower_node *prev, *node;
1202 for (prev = NULL, node = tower_first (&axis->log_to_phy); node != NULL;
1203 prev = node, node = tower_next (&axis->log_to_phy, node))
1206 struct axis_group *prev_group = axis_group_from_tower_node (prev);
1207 unsigned long prev_height = tower_node_get_size (prev);
1208 struct axis_group *node_group = axis_group_from_tower_node (node);
1209 assert (prev_group->phy_start + prev_height != node_group->phy_start);
1216 /* Creates and returns an empty, unbacked source with N_BYTES
1217 bytes per case, none of which are initially in use. */
1218 static struct source *
1219 source_create_empty (size_t n_bytes)
1221 struct source *source = xmalloc (sizeof *source);
1222 size_t row_size = n_bytes + 4 * sizeof (void *);
1223 size_t max_memory_rows = settings_get_workspace () / row_size;
1224 source->avail = range_set_create ();
1225 range_set_insert (source->avail, 0, n_bytes);
1226 source->data = sparse_xarray_create (n_bytes, MAX (max_memory_rows, 4));
1227 source->backing = NULL;
1228 source->backing_rows = 0;
1233 /* Creates and returns a new source backed by READER and with the
1234 same initial dimensions and content. */
1235 static struct source *
1236 source_create_casereader (struct casereader *reader)
1238 const struct caseproto *proto = casereader_get_proto (reader);
1239 size_t n_bytes = caseproto_to_n_bytes (proto);
1240 struct source *source = source_create_empty (n_bytes);
1244 range_set_delete (source->avail, 0, n_bytes);
1245 source->backing = reader;
1246 source->backing_rows = casereader_count_cases (reader);
1249 n_columns = caseproto_get_n_widths (proto);
1250 for (i = 0; i < n_columns; i++)
1251 if (caseproto_get_width (proto, i) >= 0)
1257 /* Returns a clone of source OLD with the same data and backing
1260 Currently this is used only by the datasheet model checker
1261 driver, but it could be otherwise useful. */
1262 static struct source *
1263 source_clone (const struct source *old)
1265 struct source *new = xmalloc (sizeof *new);
1266 new->avail = range_set_clone (old->avail, NULL);
1267 new->data = sparse_xarray_clone (old->data);
1268 new->backing = old->backing != NULL ? casereader_clone (old->backing) : NULL;
1269 new->backing_rows = old->backing_rows;
1270 new->n_used = old->n_used;
1271 if (new->data == NULL)
1273 source_destroy (new);
1280 source_allocate_column (struct source *source, int width)
1282 unsigned long int start;
1285 assert (width >= 0);
1286 n_bytes = width_to_n_bytes (width);
1287 if (source->backing == NULL
1288 && range_set_allocate_fully (source->avail, n_bytes, &start))
1295 source_release_column (struct source *source, int ofs, int width)
1297 assert (width >= 0);
1298 range_set_insert (source->avail, ofs, width_to_n_bytes (width));
1299 if (source->backing != NULL)
1303 /* Returns true if SOURCE has any columns in use,
1306 source_in_use (const struct source *source)
1308 return source->n_used > 0;
1311 /* Destroys SOURCE and its data and backing, if any. */
1313 source_destroy (struct source *source)
1317 range_set_destroy (source->avail);
1318 sparse_xarray_destroy (source->data);
1319 casereader_destroy (source->backing);
1324 /* Returns the number of rows in SOURCE's backing casereader
1325 (SOURCE must have a backing casereader). */
1327 source_get_backing_n_rows (const struct source *source)
1329 assert (source_has_backing (source));
1330 return source->backing_rows;
1333 /* Reads the given COLUMN from SOURCE in the given ROW, into
1334 VALUE. Returns true if successful, false on I/O error.
1336 The caller must have initialized VALUE with the proper
1339 source_read (const struct column *column, casenumber row, union value *value)
1341 struct source *source = column->source;
1343 assert (column->width >= 0);
1344 if (source->backing == NULL
1345 || sparse_xarray_contains_row (source->data, row))
1346 return sparse_xarray_read (source->data, row, column->byte_ofs,
1347 width_to_n_bytes (column->width),
1348 value_to_data (value, column->width));
1351 struct ccase *c = casereader_peek (source->backing, row);
1352 bool ok = c != NULL;
1355 value_copy (value, case_data_idx (c, column->value_ofs),
1364 copy_case_into_source (struct source *source, struct ccase *c, casenumber row)
1366 const struct caseproto *proto = casereader_get_proto (source->backing);
1367 size_t n_widths = caseproto_get_n_widths (proto);
1372 for (i = 0; i < n_widths; i++)
1374 int width = caseproto_get_width (proto, i);
1377 int n_bytes = width_to_n_bytes (width);
1378 if (!sparse_xarray_write (source->data, row, ofs, n_bytes,
1379 value_to_data (case_data_idx (c, i),
1388 /* Writes VALUE to SOURCE in the given ROW and COLUMN. Returns
1389 true if successful, false on I/O error. On error, the row's
1390 data may be completely or partially corrupted, both inside and
1391 outside the region to be written. */
1393 source_write (const struct column *column, casenumber row,
1394 const union value *value)
1396 struct source *source = column->source;
1397 struct casereader *backing = source->backing;
1399 assert (column->width >= 0);
1401 && !sparse_xarray_contains_row (source->data, row)
1402 && row < source->backing_rows)
1407 c = casereader_peek (backing, row);
1411 ok = copy_case_into_source (source, c, row);
1417 return sparse_xarray_write (source->data, row, column->byte_ofs,
1418 width_to_n_bytes (column->width),
1419 value_to_data (value, column->width));
1422 /* Within SOURCE, which must not have a backing casereader,
1423 writes the VALUE_CNT values in VALUES_CNT to the VALUE_CNT
1424 columns starting from START_COLUMN, in every row, even in rows
1425 not yet otherwise initialized. Returns true if successful,
1426 false if an I/O error occurs.
1428 We don't support backing != NULL because (1) it's harder and
1429 (2) this function is only called by
1430 datasheet_insert_column, which doesn't reuse columns from
1431 sources that are backed by casereaders. */
1433 source_write_column (struct column *column, const union value *value)
1435 int width = column->width;
1437 assert (column->source->backing == NULL);
1438 assert (width >= 0);
1440 return sparse_xarray_write_columns (column->source->data, column->byte_ofs,
1441 width_to_n_bytes (width),
1442 value_to_data (value, width));
1445 /* Returns true if SOURCE has a backing casereader, false
1448 source_has_backing (const struct source *source)
1450 return source->backing != NULL;
1453 /* Datasheet model checker test driver. */
1456 get_source_index (const struct datasheet *ds, const struct source *source)
1460 for (i = 0; i < ds->n_sources; i++)
1461 if (ds->sources[i] == source)
1466 /* Clones the structure and contents of ODS into a new datasheet,
1467 and returns the new datasheet. */
1469 clone_datasheet (const struct datasheet *ods)
1471 struct datasheet *ds;
1474 ds = xmalloc (sizeof *ds);
1476 ds->sources = xmalloc (ods->n_sources * sizeof *ds->sources);
1477 for (i = 0; i < ods->n_sources; i++)
1478 ds->sources[i] = source_clone (ods->sources[i]);
1479 ds->n_sources = ods->n_sources;
1481 ds->proto = ods->proto != NULL ? caseproto_ref (ods->proto) : NULL;
1482 ds->columns = xmemdup (ods->columns, ods->n_columns * sizeof *ods->columns);
1483 for (i = 0; i < ods->n_columns; i++)
1484 ds->columns[i].source
1485 = ds->sources[get_source_index (ods, ods->columns[i].source)];
1486 ds->n_columns = ods->n_columns;
1487 ds->column_min_alloc = ods->column_min_alloc;
1489 ds->rows = axis_clone (ods->rows);
1491 ds->taint = taint_create ();
1496 /* Hashes the structure of datasheet DS and returns the hash.
1497 We use MD4 because it is much faster than MD5 or SHA-1 but its
1498 collision resistance is just as good. */
1500 hash_datasheet (const struct datasheet *ds)
1502 unsigned int hash[DIV_RND_UP (20, sizeof (unsigned int))];
1506 md4_init_ctx (&ctx);
1507 for (i = 0; i < ds->n_columns; i++)
1509 const struct column *column = &ds->columns[i];
1510 int source_n_bytes = sparse_xarray_get_n_columns (column->source->data);
1511 md4_process_bytes (&source_n_bytes, sizeof source_n_bytes, &ctx);
1512 /*md4_process_bytes (&column->byte_ofs, sizeof column->byte_ofs, &ctx);*/
1513 md4_process_bytes (&column->value_ofs, sizeof column->value_ofs, &ctx);
1514 md4_process_bytes (&column->width, sizeof column->width, &ctx);
1516 axis_hash (ds->rows, &ctx);
1517 md4_process_bytes (&ds->column_min_alloc, sizeof ds->column_min_alloc, &ctx);
1518 md4_finish_ctx (&ctx, hash);