1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-private.h"
21 #include "data/dictionary.h"
22 #include "data/value.h"
23 #include "data/variable.h"
24 #include "libpspp/assertion.h"
25 #include "libpspp/misc.h"
27 #include "gl/c-strcase.h"
28 #include "gl/minmax.h"
29 #include "gl/xalloc.h"
31 /* Number of bytes really stored in each segment of a very long string variable. */
33 #define REAL_VLS_CHUNK 255
35 /* Number of bytes per segment by which the amount of space for
36 very long string variables is allocated. */
37 #define EFFECTIVE_VLS_CHUNK 252
39 /* Returns true if WIDTH is a very long string width, false otherwise. */
42 is_very_long (int width)
47 /* Returns the smaller of A or B.
48 (Defined as a function to avoid evaluating A or B more than once.) */
51 min_int (int a, int b)
56 /* Returns the larger of A or B.
57 (Defined as a function to avoid evaluating A or B more than once.) */
60 max_int (int a, int b)
65 /* Returns the number of bytes of uncompressed case data used for
66 writing a variable of the given WIDTH to a system file. All
67 required space is included, including trailing padding and internal padding. */
70 sfm_width_to_bytes (int width)
78 else if (!is_very_long (width))
82 int chunks = width / EFFECTIVE_VLS_CHUNK;
83 int remainder = width % EFFECTIVE_VLS_CHUNK;
84 bytes = remainder + (chunks * ROUND_UP (REAL_VLS_CHUNK, 8));
86 return ROUND_UP (bytes, 8);
89 /* Returns the number of 8-byte units (octs) used to write data
90 for a variable of the given WIDTH. */
92 sfm_width_to_octs (int width)
94 return sfm_width_to_bytes (width) / 8;
97 /* Returns the number of "segments" used for writing case data
98 for a variable of the given WIDTH. A segment is a physical
99 variable in the system file that represents some piece of a
100 logical variable as seen by a PSPP user. Only very long
101 string variables have more than one segment. */
103 sfm_width_to_segments (int width)
107 return !is_very_long (width) ? 1 : DIV_RND_UP (width, EFFECTIVE_VLS_CHUNK);
110 /* Returns the width to allocate to the given SEGMENT within a
111 variable of the given WIDTH. A segment is a physical variable
112 in the system file that represents some piece of a logical
113 variable as seen by a PSPP user. */
115 sfm_segment_alloc_width (int width, int segment)
117 assert (segment < sfm_width_to_segments (width));
119 return (!is_very_long (width) ? width
120 : segment < sfm_width_to_segments (width) - 1 ? 255
121 : width - segment * EFFECTIVE_VLS_CHUNK);
124 /* Returns the number of bytes to allocate to the given SEGMENT
125 within a variable of the given width. This is the value of
126 sfm_segment_alloc_width rounded up to a multiple of 8, except
127 that a numeric value takes up 8 bytes despite having a width of 0. */
129 sfm_segment_alloc_bytes (int width, int segment)
131 assert (segment < sfm_width_to_segments (width));
132 return (width == 0 ? 8
133 : ROUND_UP (sfm_segment_alloc_width (width, segment), 8));
136 /* Returns the number of bytes in the given SEGMENT within a
137 variable of the given WIDTH that are actually used to store
138 data. For a numeric value (WIDTH of 0), this is 8 bytes; for
139 a string value less than 256 bytes wide, it is WIDTH bytes.
140 For very long string values, the calculation is more
141 complicated and ranges between 255 bytes for the first segment
142 to as little as 0 bytes for final segments. */
144 sfm_segment_used_bytes (int width, int segment)
146 assert (segment < sfm_width_to_segments (width));
147 return (width == 0 ? 8
148 : !is_very_long (width) ? width
149 : max_int (0, min_int (width - REAL_VLS_CHUNK * segment,
153 /* Returns the number of bytes at the end of the given SEGMENT
154 within a variable of the given WIDTH that are not used for
155 data; that is, the number of bytes that must be padded with
156 data that a reader ignores. */
158 sfm_segment_padding (int width, int segment)
160 return (sfm_segment_alloc_bytes (width, segment)
161 - sfm_segment_used_bytes (width, segment));
164 /* Returns the byte offset of the start of the given SEGMENT
165 within a variable of the given WIDTH. The first segment
166 starts at offset 0; only very long string variables have any other segments. */
169 sfm_segment_offset (int width, int segment)
171 assert (segment < sfm_width_to_segments (width));
172 return min_int (REAL_VLS_CHUNK * segment, width);
175 /* Returns the byte offset of the start of the given SEGMENT
176 within a variable of the given WIDTH, given the (incorrect)
177 assumption that there are EFFECTIVE_VLS_CHUNK bytes per
178 segment. (Use of this function is questionable at best.) */
180 sfm_segment_effective_offset (int width, int segment)
182 assert (segment < sfm_width_to_segments (width));
183 return EFFECTIVE_VLS_CHUNK * segment;
186 /* Creates and initializes an array of struct sfm_vars that
187 describe how a case drawn from dictionary DICT is laid out in
188 a system file. Returns the number of segments in a case. A
189 segment is a physical variable in the system file that
190 represents some piece of a logical variable as seen by a PSPP user.
193 The array is allocated with malloc and stored in *SFM_VARS,
194 and its number of elements is stored in *SFM_N_VARS. The
195 caller is responsible for freeing it when it is no longer needed. */
198 sfm_dictionary_to_sfm_vars (const struct dictionary *dict,
199 struct sfm_var **sfm_vars, size_t *sfm_n_vars)
201 size_t n_vars = dict_get_n_vars (dict);
205 /* Estimate the number of sfm_vars that will be needed.
206 We might not need all of these, because very long string
207 variables can have segments that are all padding, which do
208 not need sfm_vars of their own. */
210 for (i = 0; i < n_vars; i++)
212 const struct variable *v = dict_get_var (dict, i);
213 n_segments += sfm_width_to_segments (var_get_width (v));
216 /* Compose the sfm_vars. */
217 *sfm_vars = xnmalloc (n_segments, sizeof **sfm_vars);
219 for (i = 0; i < n_vars; i++)
221 const struct variable *dv = dict_get_var (dict, i);
222 int width = var_get_width (dv);
225 for (j = 0; j < sfm_width_to_segments (width); j++)
227 int used_bytes = sfm_segment_used_bytes (width, j);
228 int padding = sfm_segment_padding (width, j);
232 sv = &(*sfm_vars)[(*sfm_n_vars)++];
233 sv->var_width = width;
234 sv->segment_width = width == 0 ? 0 : used_bytes;
235 sv->case_index = var_get_case_index (dv);
236 sv->offset = sfm_segment_offset (width, j);
237 sv->padding = padding;
241 /* Segment is all padding. Just add it to the previous segment. */
243 sv = &(*sfm_vars)[*sfm_n_vars - 1];
244 sv->padding += padding;
246 assert ((sv->segment_width + sv->padding) % 8 == 0);
253 /* Given the name of an encoding, returns the codepage number to use in the
254 'character_code' member of the machine integer info record for writing a system file. */
257 sys_get_codepage_from_encoding (const char *name)
259 const struct sys_encoding *e;
261 for (e = sys_codepage_name_to_number; e->name != NULL; e++)
262 if (!c_strcasecmp (name, e->name))
268 /* Given a codepage number from the 'character_code' member of the machine
269 integer info record in a system file, returns a corresponding encoding name.
270 Most encodings have multiple aliases; the one returned is the one that would
271 be used in the character encoding record. */
273 sys_get_encoding_from_codepage (int codepage)
275 const struct sys_encoding *e;
277 for (e = sys_codepage_number_to_name; e->name != NULL; e++)
278 if (codepage == e->number)