1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <libpspp/sparse-xarray.h>
25 #include <libpspp/assertion.h>
26 #include <libpspp/misc.h>
27 #include <libpspp/range-set.h>
28 #include <libpspp/sparse-array.h>
29 #include <libpspp/tmpfile.h>
35 /* A sparse array of arrays of bytes. */
38 size_t n_bytes; /* Number of bytes per row. */
39 uint8_t *default_row; /* Defaults for unwritten rows. */
40 unsigned long int max_memory_rows; /* Max rows before dumping to disk. */
41 struct sparse_array *memory; /* Backing, if stored in memory. */
42 struct tmpfile *disk; /* Backing, if stored on disk. */
43 struct range_set *disk_rows; /* Allocated rows, if on disk. */
47 range_is_valid (const struct sparse_xarray *sx, size_t ofs, size_t n)
49 return n <= sx->n_bytes && ofs <= sx->n_bytes && ofs + n <= sx->n_bytes;
52 /* Creates and returns a new sparse array of arrays of bytes.
53 Each row in the sparse array will consist of N_BYTES bytes.
54 If fewer than MAX_MEMORY_ROWS rows are added to the array,
55 then it will be kept in memory; if more than that are added,
56 then it will be stored on disk. */
57 struct sparse_xarray *
58 sparse_xarray_create (size_t n_bytes, size_t max_memory_rows)
60 struct sparse_xarray *sx = xmalloc (sizeof *sx);
61 sx->n_bytes = n_bytes;
62 sx->default_row = xzalloc (n_bytes);
63 sx->max_memory_rows = max_memory_rows;
64 sx->memory = sparse_array_create (sizeof (uint8_t *));
70 /* Creates and returns a new sparse array of rows that contains
71 the same data as OLD. Returns a null pointer if cloning
73 struct sparse_xarray *
74 sparse_xarray_clone (const struct sparse_xarray *old)
76 struct sparse_xarray *new = xmalloc (sizeof *new);
78 new->n_bytes = old->n_bytes;
80 new->default_row = xmemdup (old->default_row, old->n_bytes);
82 new->max_memory_rows = old->max_memory_rows;
84 if (old->memory != NULL)
86 unsigned long int idx;
89 new->memory = sparse_array_create (sizeof (uint8_t *));
90 for (old_row = sparse_array_first (old->memory, &idx); old_row != NULL;
91 old_row = sparse_array_next (old->memory, idx, &idx))
93 uint8_t **new_row = sparse_array_insert (new->memory, idx);
94 *new_row = xmemdup (*old_row, new->n_bytes);
100 if (old->disk != NULL)
102 const struct range_set_node *node;
103 void *tmp = xmalloc (old->n_bytes);
105 new->disk = tmpfile_create ();
106 new->disk_rows = range_set_clone (old->disk_rows, NULL);
107 for (node = range_set_first (old->disk_rows); node != NULL;
108 node = range_set_next (old->disk_rows, node))
110 unsigned long int start = range_set_node_get_start (node);
111 unsigned long int end = range_set_node_get_end (node);
112 unsigned long int idx;
114 for (idx = start; idx < end; idx++)
116 off_t offset = (off_t) idx * old->n_bytes;
117 if (!tmpfile_read (old->disk, offset, old->n_bytes, tmp)
118 || !tmpfile_write (new->disk, offset, old->n_bytes, tmp))
121 sparse_xarray_destroy (new);
131 new->disk_rows = NULL;
137 /* Destroys sparse array of rows SX. */
139 sparse_xarray_destroy (struct sparse_xarray *sx)
143 free (sx->default_row);
144 if (sx->memory != NULL)
146 unsigned long int idx;
148 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
149 row = sparse_array_next (sx->memory, idx, &idx))
151 sparse_array_destroy (sx->memory);
153 tmpfile_destroy (sx->disk);
154 range_set_destroy (sx->disk_rows);
159 /* Returns the number of bytes in each row in SX. */
161 sparse_xarray_get_n_columns (const struct sparse_xarray *sx)
166 /* Returns the number of rows in SX. */
168 sparse_xarray_get_n_rows (const struct sparse_xarray *sx)
172 unsigned long int idx;
173 return sparse_array_last (sx->memory, &idx) != NULL ? idx + 1 : 0;
177 const struct range_set_node *last = range_set_last (sx->disk_rows);
178 return last != NULL ? range_set_node_get_end (last) : 0;
182 /* Dumps the rows in SX, which must currently be stored in
183 memory, to disk. Returns true if successful, false on I/O
186 dump_sparse_xarray_to_disk (struct sparse_xarray *sx)
188 unsigned long int idx;
191 assert (sx->memory != NULL);
192 assert (sx->disk == NULL);
194 sx->disk = tmpfile_create ();
195 sx->disk_rows = range_set_create ();
197 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
198 row = sparse_array_next (sx->memory, idx, &idx))
200 if (!tmpfile_write (sx->disk, (off_t) idx * sx->n_bytes, sx->n_bytes,
203 tmpfile_destroy (sx->disk);
205 range_set_destroy (sx->disk_rows);
206 sx->disk_rows = NULL;
209 range_set_insert (sx->disk_rows, idx, 1);
211 sparse_array_destroy (sx->memory);
216 /* Returns true if any data has ever been written to ROW in SX,
219 sparse_xarray_contains_row (const struct sparse_xarray *sx,
220 unsigned long int row)
222 return (sx->memory != NULL
223 ? sparse_array_get (sx->memory, row) != NULL
224 : range_set_contains (sx->disk_rows, row));
227 /* Reads columns COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in
228 the given ROW in SX, into the VALUE_CNT values in VALUES.
229 Returns true if successful, false on I/O error. */
231 sparse_xarray_read (const struct sparse_xarray *sx, unsigned long int row,
232 size_t start, size_t n, void *data)
234 assert (range_is_valid (sx, start, n));
236 if (sx->memory != NULL)
238 uint8_t **p = sparse_array_get (sx->memory, row);
241 memcpy (data, *p + start, n);
247 if (range_set_contains (sx->disk_rows, row))
248 return tmpfile_read (sx->disk, (off_t) row * sx->n_bytes + start,
252 memcpy (data, sx->default_row + start, n);
256 /* Implements sparse_xarray_write for an on-disk sparse_xarray. */
258 write_disk_row (struct sparse_xarray *sx, unsigned long int row,
259 size_t start, size_t n, const void *data)
261 off_t ofs = (off_t) row * sx->n_bytes;
262 if (range_set_contains (sx->disk_rows, row))
263 return tmpfile_write (sx->disk, ofs + start, n, data);
266 range_set_insert (sx->disk_rows, row, 1);
267 return (tmpfile_write (sx->disk, ofs, start, sx->default_row)
268 && tmpfile_write (sx->disk, ofs + start, n, data)
269 && tmpfile_write (sx->disk, ofs + start + n,
270 sx->n_bytes - start - n,
271 sx->default_row + start + n));
275 /* Writes the VALUE_CNT values in VALUES into columns
276 COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in the given ROW
278 Returns true if successful, false on I/O error. */
280 sparse_xarray_write (struct sparse_xarray *sx, unsigned long int row,
281 size_t start, size_t n, const void *data)
283 assert (range_is_valid (sx, start, n));
284 if (sx->memory != NULL)
286 uint8_t **p = sparse_array_get (sx->memory, row);
289 if (sparse_array_count (sx->memory) < sx->max_memory_rows)
291 p = sparse_array_insert (sx->memory, row);
292 *p = xmemdup (sx->default_row, sx->n_bytes);
296 if (!dump_sparse_xarray_to_disk (sx))
298 return write_disk_row (sx, row, start, n, data);
301 memcpy (*p + start, data, n);
305 return write_disk_row (sx, row, start, n, data);
308 /* Writes the VALUE_CNT values in VALUES to columns
309 START_COLUMN...(START_COLUMN + VALUE_CNT), exclusive, in every
310 row in SX, even those rows that have not yet been written.
311 Returns true if successful, false on I/O error.
313 The runtime of this function is linear in the number of rows
314 in SX that have already been written. */
316 sparse_xarray_write_columns (struct sparse_xarray *sx, size_t start,
317 size_t n, const void *data)
319 assert (range_is_valid (sx, start, n));
322 memcpy (sx->default_row + start, data, n);
324 /* Set individual rows. */
325 if (sx->memory != NULL)
327 unsigned long int idx;
330 for (p = sparse_array_first (sx->memory, &idx); p != NULL;
331 p = sparse_array_next (sx->memory, idx, &idx))
332 memcpy (*p + start, data, n);
336 const struct range_set_node *node;
338 for (node = range_set_first (sx->disk_rows); node != NULL;
339 node = range_set_next (sx->disk_rows, node))
341 unsigned long int start_row = range_set_node_get_start (node);
342 unsigned long int end_row = range_set_node_get_end (node);
343 unsigned long int row;
345 for (row = start_row; row < end_row; row++)
347 off_t offset = (off_t) row * sx->n_bytes;
348 if (!tmpfile_write (sx->disk, offset + start, n, data))
353 if (tmpfile_error (sx->disk))
359 static unsigned long int
360 scan_first (const struct sparse_xarray *sx)
364 unsigned long int idx;
365 return sparse_array_first (sx->memory, &idx) ? idx : ULONG_MAX;
368 return range_set_scan (sx->disk_rows, 0);
371 static unsigned long int
372 scan_next (const struct sparse_xarray *sx, unsigned long int start)
376 unsigned long int idx;
377 return sparse_array_next (sx->memory, start, &idx) ? idx : ULONG_MAX;
380 return range_set_scan (sx->disk_rows, start + 1);
383 /* Only works for rows for which sparse_xarray_contains_row()
384 would return true. */
386 get_row (const struct sparse_xarray *sx, unsigned long int idx,
391 uint8_t **p = sparse_array_get (sx->memory, idx);
394 else if (tmpfile_read (sx->disk, (off_t) idx * sx->n_bytes,
395 sx->n_bytes, buffer))
401 /* Iterates over all the rows in SX and DX, passing each pair of
402 rows with equal indexes to CB. CB's modifications, if any, to
403 destination rows are written back to DX.
405 All rows that are actually in use in SX or in DX or both are
406 passed to CB. If a row is in use in SX but not in DX, or vice
407 versa, then the "default" row (as set by
408 sparse_xarray_write_columns) is passed as the contents of the
411 CB is also called once with the default row from SX and the
412 default row from DX. Modifying the data passed as the default
413 row from DX will change DX's default row.
415 Returns true if successful, false if I/O on SX or DX fails or
416 if CB returns false. On failure, the contents of DX are
419 sparse_xarray_copy (const struct sparse_xarray *sx, struct sparse_xarray *dx,
420 bool (*cb) (const void *src, void *dst, void *aux),
425 if (!cb (sx->default_row, dx->default_row, aux))
432 unsigned long int idx;
435 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
436 row = sparse_array_next (sx->memory, idx, &idx))
438 success = cb (*row, *row, aux);
445 const struct range_set_node *node;
446 void *tmp = xmalloc (sx->n_bytes);
448 for (node = range_set_first (sx->disk_rows); node != NULL;
449 node = range_set_next (sx->disk_rows, node))
451 unsigned long int start = range_set_node_get_start (node);
452 unsigned long int end = range_set_node_get_end (node);
453 unsigned long int row;
455 for (row = start; row < end; row++)
457 off_t offset = (off_t) row * sx->n_bytes;
458 success = (tmpfile_read (sx->disk, offset, sx->n_bytes, tmp)
459 && cb (tmp, tmp, aux)
460 && tmpfile_write (dx->disk, offset,
472 unsigned long int src_idx = scan_first (sx);
473 unsigned long int dst_idx = scan_first (dx);
475 uint8_t *tmp_src_row = xmalloc (sx->n_bytes);
476 uint8_t *tmp_dst_row = xmalloc (dx->n_bytes);
480 unsigned long int idx;
481 const uint8_t *src_row;
484 /* Determine the index of the row to process. If
485 src_idx == dst_idx, then the row has been written in
486 both SX and DX. Otherwise, it has been written in
487 only the sparse_xarray corresponding to the smaller
488 index, and has the default contents in the other. */
489 idx = MIN (src_idx, dst_idx);
490 if (idx == ULONG_MAX)
493 /* Obtain a copy of the source row as src_row. */
495 src_row = get_row (sx, idx, tmp_src_row);
497 src_row = sx->default_row;
499 /* Obtain the destination row as dst_row. */
501 dst_row = get_row (dx, idx, tmp_dst_row);
503 && sparse_array_count (dx->memory) < dx->max_memory_rows)
505 uint8_t **p = sparse_array_insert (dx->memory, idx);
506 dst_row = *p = xmemdup (dx->default_row, dx->n_bytes);
510 memcpy (tmp_dst_row, dx->default_row, dx->n_bytes);
511 dst_row = tmp_dst_row;
514 /* Run the callback. */
515 success = cb (src_row, dst_row, aux);
519 /* Write back the destination row, if necessary. */
520 if (dst_row == tmp_dst_row)
522 success = sparse_xarray_write (dx, idx, 0, dx->n_bytes, dst_row);
528 /* Nothing to do: we modified the destination row in-place. */
531 /* Advance to the next row. */
533 src_idx = scan_next (sx, src_idx);
535 dst_idx = scan_next (dx, dst_idx);
545 /* Returns a hash value for SX suitable for use with the model
546 checker. The value in BASIS is folded into the hash.
548 The returned hash value is *not* suitable for storage and
549 retrieval of sparse_xarrays that have identical contents,
550 because it will return different hash values for
551 sparse_xarrays that have the same contents (and it's slow).
553 We use MD4 because it is much faster than MD5 or SHA-1 but its
554 collision resistance is just as good. */
556 sparse_xarray_model_checker_hash (const struct sparse_xarray *sx,
559 unsigned int hash[DIV_RND_UP (20, sizeof (unsigned int))];
563 md4_process_bytes (&basis, sizeof basis, &ctx);
564 md4_process_bytes (&sx->n_bytes, sizeof sx->n_bytes, &ctx);
565 md4_process_bytes (sx->default_row, sx->n_bytes, &ctx);
569 unsigned long int idx;
572 md4_process_bytes ("m", 1, &ctx);
573 md4_process_bytes (&sx->max_memory_rows, sizeof sx->max_memory_rows,
575 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
576 row = sparse_array_next (sx->memory, idx, &idx))
578 md4_process_bytes (&idx, sizeof idx, &ctx);
579 md4_process_bytes (*row, sx->n_bytes, &ctx);
584 const struct range_set_node *node;
585 void *tmp = xmalloc (sx->n_bytes);
587 md4_process_bytes ("d", 1, &ctx);
588 for (node = range_set_first (sx->disk_rows); node != NULL;
589 node = range_set_next (sx->disk_rows, node))
591 unsigned long int start = range_set_node_get_start (node);
592 unsigned long int end = range_set_node_get_end (node);
593 unsigned long int idx;
595 for (idx = start; idx < end; idx++)
597 off_t offset = (off_t) idx * sx->n_bytes;
598 if (!tmpfile_read (sx->disk, offset, sx->n_bytes, tmp))
600 md4_process_bytes (&idx, sizeof idx, &ctx);
601 md4_process_bytes (tmp, sx->n_bytes, &ctx);
606 md4_finish_ctx (&ctx, hash);