1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/sparse-xarray.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/ext-array.h"
28 #include "libpspp/misc.h"
29 #include "libpspp/range-set.h"
30 #include "libpspp/sparse-array.h"
33 #include "gl/minmax.h"
34 #include "gl/xalloc.h"
36 /* A sparse array of arrays of bytes. */
39 size_t n_bytes; /* Number of bytes per row. */
40 uint8_t *default_row; /* Defaults for unwritten rows. */
41 unsigned long int max_memory_rows; /* Max rows before dumping to disk. */
42 struct sparse_array *memory; /* Backing, if stored in memory. */
43 struct ext_array *disk; /* Backing, if stored on disk. */
44 struct range_set *disk_rows; /* Allocated rows, if on disk. */
48 range_is_valid (const struct sparse_xarray *sx, size_t ofs, size_t n)
50 return n <= sx->n_bytes && ofs <= sx->n_bytes && ofs + n <= sx->n_bytes;
53 /* Creates and returns a new sparse array of arrays of bytes.
54 Each row in the sparse array will consist of N_BYTES bytes.
55 If fewer than MAX_MEMORY_ROWS rows are added to the array,
56 then it will be kept in memory; if more than that are added,
57 then it will be stored on disk. */
58 struct sparse_xarray *
59 sparse_xarray_create (size_t n_bytes, size_t max_memory_rows)
61 struct sparse_xarray *sx = xmalloc (sizeof *sx);
62 sx->n_bytes = n_bytes;
63 sx->default_row = xzalloc (n_bytes);
64 sx->max_memory_rows = max_memory_rows;
65 sx->memory = sparse_array_create (sizeof (uint8_t *));
71 /* Creates and returns a new sparse array of rows that contains
72 the same data as OLD. Returns a null pointer if cloning
74 struct sparse_xarray *
75 sparse_xarray_clone (const struct sparse_xarray *old)
77 struct sparse_xarray *new = xmalloc (sizeof *new);
79 new->n_bytes = old->n_bytes;
81 new->default_row = xmemdup (old->default_row, old->n_bytes);
83 new->max_memory_rows = old->max_memory_rows;
85 if (old->memory != NULL)
87 unsigned long int idx;
90 new->memory = sparse_array_create (sizeof (uint8_t *));
91 for (old_row = sparse_array_first (old->memory, &idx); old_row != NULL;
92 old_row = sparse_array_next (old->memory, idx, &idx))
94 uint8_t **new_row = sparse_array_insert (new->memory, idx);
95 *new_row = xmemdup (*old_row, new->n_bytes);
101 if (old->disk != NULL)
103 const struct range_set_node *node;
104 void *tmp = xmalloc (old->n_bytes);
106 new->disk = ext_array_create ();
107 new->disk_rows = range_set_clone (old->disk_rows, NULL);
108 for (node = range_set_first (old->disk_rows); node != NULL;
109 node = range_set_next (old->disk_rows, node))
111 unsigned long int start = range_set_node_get_start (node);
112 unsigned long int end = range_set_node_get_end (node);
113 unsigned long int idx;
115 for (idx = start; idx < end; idx++)
117 off_t offset = (off_t) idx * old->n_bytes;
118 if (!ext_array_read (old->disk, offset, old->n_bytes, tmp)
119 || !ext_array_write (new->disk, offset, old->n_bytes, tmp))
122 sparse_xarray_destroy (new);
132 new->disk_rows = NULL;
138 /* Destroys sparse array of rows SX. */
140 sparse_xarray_destroy (struct sparse_xarray *sx)
144 free (sx->default_row);
145 if (sx->memory != NULL)
147 unsigned long int idx;
149 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
150 row = sparse_array_next (sx->memory, idx, &idx))
152 sparse_array_destroy (sx->memory);
154 ext_array_destroy (sx->disk);
155 range_set_destroy (sx->disk_rows);
160 /* Returns the number of bytes in each row in SX. */
162 sparse_xarray_get_n_columns (const struct sparse_xarray *sx)
167 /* Returns the number of rows in SX. */
169 sparse_xarray_get_n_rows (const struct sparse_xarray *sx)
173 unsigned long int idx;
174 return sparse_array_last (sx->memory, &idx) != NULL ? idx + 1 : 0;
178 const struct range_set_node *last = range_set_last (sx->disk_rows);
179 return last != NULL ? range_set_node_get_end (last) : 0;
183 /* Dumps the rows in SX, which must currently be stored in
184 memory, to disk. Returns true if successful, false on I/O
187 dump_sparse_xarray_to_disk (struct sparse_xarray *sx)
189 unsigned long int idx;
192 assert (sx->memory != NULL);
193 assert (sx->disk == NULL);
195 sx->disk = ext_array_create ();
196 sx->disk_rows = range_set_create ();
198 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
199 row = sparse_array_next (sx->memory, idx, &idx))
201 if (!ext_array_write (sx->disk, (off_t) idx * sx->n_bytes, sx->n_bytes,
204 ext_array_destroy (sx->disk);
206 range_set_destroy (sx->disk_rows);
207 sx->disk_rows = NULL;
210 range_set_insert (sx->disk_rows, idx, 1);
212 sparse_array_destroy (sx->memory);
217 /* Returns true if any data has ever been written to ROW in SX,
220 sparse_xarray_contains_row (const struct sparse_xarray *sx,
221 unsigned long int row)
223 return (sx->memory != NULL
224 ? sparse_array_get (sx->memory, row) != NULL
225 : range_set_contains (sx->disk_rows, row));
228 /* Reads columns COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in
229 the given ROW in SX, into the VALUE_CNT values in VALUES.
230 Returns true if successful, false on I/O error. */
232 sparse_xarray_read (const struct sparse_xarray *sx, unsigned long int row,
233 size_t start, size_t n, void *data)
235 assert (range_is_valid (sx, start, n));
237 if (sx->memory != NULL)
239 uint8_t **p = sparse_array_get (sx->memory, row);
242 memcpy (data, *p + start, n);
248 if (range_set_contains (sx->disk_rows, row))
249 return ext_array_read (sx->disk, (off_t) row * sx->n_bytes + start,
253 memcpy (data, sx->default_row + start, n);
257 /* Implements sparse_xarray_write for an on-disk sparse_xarray. */
259 write_disk_row (struct sparse_xarray *sx, unsigned long int row,
260 size_t start, size_t n, const void *data)
262 off_t ofs = (off_t) row * sx->n_bytes;
263 if (range_set_contains (sx->disk_rows, row))
264 return ext_array_write (sx->disk, ofs + start, n, data);
267 range_set_insert (sx->disk_rows, row, 1);
268 return (ext_array_write (sx->disk, ofs, start, sx->default_row)
269 && ext_array_write (sx->disk, ofs + start, n, data)
270 && ext_array_write (sx->disk, ofs + start + n,
271 sx->n_bytes - start - n,
272 sx->default_row + start + n));
276 /* Writes the VALUE_CNT values in VALUES into columns
277 COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in the given ROW
279 Returns true if successful, false on I/O error. */
281 sparse_xarray_write (struct sparse_xarray *sx, unsigned long int row,
282 size_t start, size_t n, const void *data)
284 assert (range_is_valid (sx, start, n));
285 if (sx->memory != NULL)
287 uint8_t **p = sparse_array_get (sx->memory, row);
290 if (sparse_array_count (sx->memory) < sx->max_memory_rows)
292 p = sparse_array_insert (sx->memory, row);
293 *p = xmemdup (sx->default_row, sx->n_bytes);
297 if (!dump_sparse_xarray_to_disk (sx))
299 return write_disk_row (sx, row, start, n, data);
302 memcpy (*p + start, data, n);
306 return write_disk_row (sx, row, start, n, data);
309 /* Writes the VALUE_CNT values in VALUES to columns
310 START_COLUMN...(START_COLUMN + VALUE_CNT), exclusive, in every
311 row in SX, even those rows that have not yet been written.
312 Returns true if successful, false on I/O error.
314 The runtime of this function is linear in the number of rows
315 in SX that have already been written. */
317 sparse_xarray_write_columns (struct sparse_xarray *sx, size_t start,
318 size_t n, const void *data)
320 assert (range_is_valid (sx, start, n));
323 memcpy (sx->default_row + start, data, n);
325 /* Set individual rows. */
326 if (sx->memory != NULL)
328 unsigned long int idx;
331 for (p = sparse_array_first (sx->memory, &idx); p != NULL;
332 p = sparse_array_next (sx->memory, idx, &idx))
333 memcpy (*p + start, data, n);
337 const struct range_set_node *node;
339 for (node = range_set_first (sx->disk_rows); node != NULL;
340 node = range_set_next (sx->disk_rows, node))
342 unsigned long int start_row = range_set_node_get_start (node);
343 unsigned long int end_row = range_set_node_get_end (node);
344 unsigned long int row;
346 for (row = start_row; row < end_row; row++)
348 off_t offset = (off_t) row * sx->n_bytes;
349 if (!ext_array_write (sx->disk, offset + start, n, data))
354 if (ext_array_error (sx->disk))
360 static unsigned long int
361 scan_first (const struct sparse_xarray *sx)
365 unsigned long int idx;
366 return sparse_array_first (sx->memory, &idx) ? idx : ULONG_MAX;
369 return range_set_scan (sx->disk_rows, 0);
372 static unsigned long int
373 scan_next (const struct sparse_xarray *sx, unsigned long int start)
377 unsigned long int idx;
378 return sparse_array_next (sx->memory, start, &idx) ? idx : ULONG_MAX;
381 return range_set_scan (sx->disk_rows, start + 1);
384 /* Only works for rows for which sparse_xarray_contains_row()
385 would return true. */
387 get_row (const struct sparse_xarray *sx, unsigned long int idx,
392 uint8_t **p = sparse_array_get (sx->memory, idx);
395 else if (ext_array_read (sx->disk, (off_t) idx * sx->n_bytes,
396 sx->n_bytes, buffer))
402 /* Iterates over all the rows in SX and DX, passing each pair of
403 rows with equal indexes to CB. CB's modifications, if any, to
404 destination rows are written back to DX.
406 All rows that are actually in use in SX or in DX or both are
407 passed to CB. If a row is in use in SX but not in DX, or vice
408 versa, then the "default" row (as set by
409 sparse_xarray_write_columns) is passed as the contents of the
412 CB is also called once with the default row from SX and the
413 default row from DX. Modifying the data passed as the default
414 row from DX will change DX's default row.
416 Returns true if successful, false if I/O on SX or DX fails or
417 if CB returns false. On failure, the contents of DX are
420 sparse_xarray_copy (const struct sparse_xarray *sx, struct sparse_xarray *dx,
421 bool (*cb) (const void *src, void *dst, void *aux),
426 if (!cb (sx->default_row, dx->default_row, aux))
433 unsigned long int idx;
436 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
437 row = sparse_array_next (sx->memory, idx, &idx))
439 success = cb (*row, *row, aux);
446 const struct range_set_node *node;
447 void *tmp = xmalloc (sx->n_bytes);
449 for (node = range_set_first (sx->disk_rows); node != NULL;
450 node = range_set_next (sx->disk_rows, node))
452 unsigned long int start = range_set_node_get_start (node);
453 unsigned long int end = range_set_node_get_end (node);
454 unsigned long int row;
456 for (row = start; row < end; row++)
458 off_t offset = (off_t) row * sx->n_bytes;
459 success = (ext_array_read (sx->disk, offset, sx->n_bytes,
461 && cb (tmp, tmp, aux)
462 && ext_array_write (dx->disk, offset,
474 unsigned long int src_idx = scan_first (sx);
475 unsigned long int dst_idx = scan_first (dx);
477 uint8_t *tmp_src_row = xmalloc (sx->n_bytes);
478 uint8_t *tmp_dst_row = xmalloc (dx->n_bytes);
482 unsigned long int idx;
483 const uint8_t *src_row;
486 /* Determine the index of the row to process. If
487 src_idx == dst_idx, then the row has been written in
488 both SX and DX. Otherwise, it has been written in
489 only the sparse_xarray corresponding to the smaller
490 index, and has the default contents in the other. */
491 idx = MIN (src_idx, dst_idx);
492 if (idx == ULONG_MAX)
495 /* Obtain a copy of the source row as src_row. */
497 src_row = get_row (sx, idx, tmp_src_row);
499 src_row = sx->default_row;
501 /* Obtain the destination row as dst_row. */
503 dst_row = get_row (dx, idx, tmp_dst_row);
505 && sparse_array_count (dx->memory) < dx->max_memory_rows)
507 uint8_t **p = sparse_array_insert (dx->memory, idx);
508 dst_row = *p = xmemdup (dx->default_row, dx->n_bytes);
512 memcpy (tmp_dst_row, dx->default_row, dx->n_bytes);
513 dst_row = tmp_dst_row;
516 /* Run the callback. */
517 success = cb (src_row, dst_row, aux);
521 /* Write back the destination row, if necessary. */
522 if (dst_row == tmp_dst_row)
524 success = sparse_xarray_write (dx, idx, 0, dx->n_bytes, dst_row);
530 /* Nothing to do: we modified the destination row in-place. */
533 /* Advance to the next row. */
535 src_idx = scan_next (sx, src_idx);
537 dst_idx = scan_next (dx, dst_idx);
547 /* Returns a hash value for SX suitable for use with the model
548 checker. The value in BASIS is folded into the hash.
550 The returned hash value is *not* suitable for storage and
551 retrieval of sparse_xarrays that have identical contents,
552 because it will return different hash values for
553 sparse_xarrays that have the same contents (and it's slow).
555 We use MD4 because it is much faster than MD5 or SHA-1 but its
556 collision resistance is just as good. */
558 sparse_xarray_model_checker_hash (const struct sparse_xarray *sx,
561 unsigned int hash[DIV_RND_UP (20, sizeof (unsigned int))];
565 md4_process_bytes (&basis, sizeof basis, &ctx);
566 md4_process_bytes (&sx->n_bytes, sizeof sx->n_bytes, &ctx);
567 md4_process_bytes (sx->default_row, sx->n_bytes, &ctx);
571 unsigned long int idx;
574 md4_process_bytes ("m", 1, &ctx);
575 md4_process_bytes (&sx->max_memory_rows, sizeof sx->max_memory_rows,
577 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
578 row = sparse_array_next (sx->memory, idx, &idx))
580 md4_process_bytes (&idx, sizeof idx, &ctx);
581 md4_process_bytes (*row, sx->n_bytes, &ctx);
586 const struct range_set_node *node;
587 void *tmp = xmalloc (sx->n_bytes);
589 md4_process_bytes ("d", 1, &ctx);
590 for (node = range_set_first (sx->disk_rows); node != NULL;
591 node = range_set_next (sx->disk_rows, node))
593 unsigned long int start = range_set_node_get_start (node);
594 unsigned long int end = range_set_node_get_end (node);
595 unsigned long int idx;
597 for (idx = start; idx < end; idx++)
599 off_t offset = (off_t) idx * sx->n_bytes;
600 if (!ext_array_read (sx->disk, offset, sx->n_bytes, tmp))
602 md4_process_bytes (&idx, sizeof idx, &ctx);
603 md4_process_bytes (tmp, sx->n_bytes, &ctx);
608 md4_finish_ctx (&ctx, hash);