1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "line-reader.h"
28 #include "libpspp/assertion.h"
29 #include "libpspp/encoding-guesser.h"
30 #include "libpspp/i18n.h"
31 #include "libpspp/str.h"
33 #include "gl/minmax.h"
34 #include "gl/xalloc.h"
/* Processing modes of a line_reader.  The first two apply once the stream
   encoding is known and differ only in the size of the encoding's code unit;
   the third applies while the encoding is still being detected. */
36 enum line_reader_state
38 S_UNIBYTE, /* Known stream encoding, 1-byte unit. */
39 S_MULTIBYTE, /* Known stream encoding, multibyte unit. */
40 S_AUTO /* Encoding autodetection in progress. */
46 enum line_reader_state state; /* Current processing mode. */
47 struct encoding_info encoding_info; /* Filled in by get_encoding_info() for `encoding'; supplies the code-unit size and the encoded CR and LF. */
49 char *encoding; /* Current encoding. */
50 char *auto_encoding; /* In S_AUTO mode, user-specified encoding. */
/* Forward declaration: fill_buffer() is called by line_reader_for_fd() before
   its definition appears further down in this file. */
60 static ssize_t fill_buffer (struct line_reader *);
62 /* Opens FILENAME, which is encoded in ENCODING, for reading line by line,
63 passing FLAGS to the open() function. Returns a new line_reader if
64 successful, otherwise returns NULL and sets errno to an appropriate value.
66 The accepted forms for ENCODING are listed at the top of
67 encoding-guesser.h. */
69 line_reader_for_file (const char *encoding, const char *filename, int flags)
71 struct line_reader *r;
/* open() is called below with only two arguments (no file mode), so FLAGS
   must not ask for the file to be created. */
74 assert (!(flags & O_CREAT));
76 fd = open (filename, flags);
/* The descriptor is handed to line_reader_for_fd(), which builds the reader
   and determines the encoding. */
80 r = line_reader_for_fd (encoding, fd);
/* NOTE(review): errno is captured here, presumably so that cleanup on the
   failure path cannot clobber it; the code that uses save_errno is not
   visible in this excerpt. */
83 int save_errno = errno;
91 /* Creates and returns a new line_reader that reads its input from FD. Returns
92 a new line_reader if successful, otherwise returns NULL and sets errno to an
appropriate value.
95 The accepted forms for ENCODING are listed at the top of
96 encoding-guesser.h. */
98 line_reader_for_fd (const char *encoding, int fd)
100 struct line_reader *r;
/* calloc() zero-fills the new reader, so every member starts out as 0 or
   NULL. */
102 r = calloc (1, sizeof *r);
107 r->buffer = malloc (LINE_READER_BUFFER_SIZE);
108 if (r->buffer == NULL)
/* Read the first chunk of input now so that the head of the stream is
   available for encoding detection below.  A negative result is a failure. */
113 if (fill_buffer (r) < 0)
/* Guess the encoding from the bytes buffered so far; xstrdup() gives the
   reader its own copy of the name. */
116 r->encoding = xstrdup (encoding_guess_head_encoding (
117 encoding, r->buffer, r->length));
/* Look up the details (code-unit size, encoded CR/LF) for that encoding. */
118 if (!get_encoding_info (&r->encoding_info, r->encoding))
/* If ENCODING asks for autodetection and the head of the stream was guessed
   to be ASCII, keep a copy of ENCODING: line_reader_read() passes it to
   encoding_guess_tail_encoding() once it meets data that is not ASCII text.
   NOTE(review): presumably the state becomes S_AUTO here; that assignment is
   not visible in this excerpt. */
124 if (encoding_guess_encoding_is_auto (encoding)
125 && !strcmp (r->encoding, "ASCII"))
128 r->auto_encoding = xstrdup (encoding);
/* For a known encoding the state follows from the code-unit size: a 1-byte
   unit selects S_UNIBYTE, anything else S_MULTIBYTE. */
131 r->state = r->encoding_info.unit == 1 ? S_UNIBYTE : S_MULTIBYTE;
/* NOTE(review): presumably the shared failure path, which releases the
   partially constructed reader; the label and the return statement are not
   visible in this excerpt. */
136 line_reader_free (r);
140 /* Closes R and its underlying file descriptor and frees all associated
141 resources. Returns the return value from close(). */
143 line_reader_close (struct line_reader *r)
/* line_reader_free() releases R itself, so R must not be used after this
   function returns. */
148 line_reader_free (r);
154 /* Frees R and associated resources, but does not close the underlying file
155 descriptor. (Thus, the client must close the file descriptor when it is no
longer needed.) */
158 line_reader_free (struct line_reader *r)
/* auto_encoding is a string allocated with xstrdup().  It is NULL whenever
   autodetection is not in progress (the reader is zero-filled at creation and
   line_reader_read() resets the member to NULL); free() accepts NULL. */
164 free (r->auto_encoding);
/* Moves the unconsumed bytes of R to the start of its buffer and reads more
   input from R's file descriptor into the space after them.  The read is
   retried when it is interrupted by a signal (EINTR).  Declared above as
   returning ssize_t; callers in this file treat a negative result as failure
   and a positive result as "more data available". */
170 fill_buffer (struct line_reader *r)
174 /* Move any unused bytes to the beginning of the input buffer. */
175 if (r->length > 0 && r->buffer != r->head)
/* memmove(), not memcpy(): head points into the same buffer, so source and
   destination may overlap. */
176 memmove (r->buffer, r->head, r->length);
179 /* Read more input. */
/* Request at most the space left after the R->length bytes already held. */
182 n = read (r->fd, r->buffer + r->length,
183 LINE_READER_BUFFER_SIZE - r->length);
/* Loop only when read() failed because a signal interrupted it. */
185 while (n < 0 && errno == EINTR);
/* Appends the first N bytes at the head of R's buffer to S, without any
   conversion.  NOTE(review): marking those bytes as consumed (advancing the
   head, shrinking the length) is presumably done on lines not visible in this
   excerpt. */
196 output_bytes (struct line_reader *r, struct string *s, size_t n)
198 ds_put_substring (s, ss_buffer (r->head, n));
/* Appends the N bytes of line content at the head of R's buffer to S, then
   removes a trailing carriage return, as encoded in R's encoding, from S. */
204 output_line (struct line_reader *r, struct string *s, size_t n)
/* Size in bytes of one code unit, and therefore of the encoded CR. */
206 int unit = r->encoding_info.unit;
208 output_bytes (r, s, n);
213 ds_chomp (s, ss_buffer (r->encoding_info.cr, unit));
216 /* Reads a line of text, but no more than MAX_LENGTH bytes, from R and appends
217 it to S, omitting the final new-line and the carriage return that
218 immediately precedes it, if one is present. The line is left in its
original encoding.
221 Returns true if anything was successfully read from the file. (If an empty
222 line was read, then nothing is appended to S.) Returns false if end of file
223 was reached or a read error occurred before any text could be read. */
225 line_reader_read (struct line_reader *r, struct string *s, size_t max_length)
/* S may already hold data; remember its length so that only what this call
   appends is counted. */
227 size_t original_length = ds_length (s);
228 int unit = r->encoding_info.unit;
/* Work out how much may be examined on this pass: the part of MAX_LENGTH not
   yet used up by this call, limited to the bytes currently buffered. */
232 size_t max_out = max_length - (ds_length (s) - original_length);
233 size_t max_in = r->length;
234 size_t max = MIN (max_in, max_out);
/* Look for a line feed by searching for its first byte only.
   NOTE(review): presumably the S_UNIBYTE case; the switch and its case labels
   are not visible in this excerpt. */
244 p = memchr (r->head, r->encoding_info.lf[0], max);
247 output_line (r, s, p - r->head);
/* Step through the data one whole code unit at a time, comparing each unit
   against the encoded line feed.  NOTE(review): presumably the S_MULTIBYTE
   case. */
254 for (n = 0; n + unit <= max; n += unit)
255 if (!memcmp (r->head + n, r->encoding_info.lf, unit))
257 output_line (r, s, n);
/* Scan byte by byte for either a byte that is not ASCII text or a new-line.
   NOTE(review): presumably the S_AUTO case. */
263 for (n = 0; n < max; n++)
264 if (!encoding_guess_is_ascii_text (r->head[n]))
/* Non-ASCII data found: emit the ASCII bytes that precede it, leave
   autodetection, and choose the encoding from the data that remains,
   using the encoding the user originally specified. */
268 output_bytes (r, s, n);
270 r->state = S_UNIBYTE;
272 encoding = xstrdup (encoding_guess_tail_encoding (
273 r->auto_encoding, r->head, r->length));
275 r->encoding = encoding;
/* The user-specified encoding is no longer needed once detection is done. */
277 free (r->auto_encoding);
278 r->auto_encoding = NULL;
283 else if (r->head[n] == '\n')
285 output_line (r, s, n);
/* Output N bytes without line-ending processing.  NOTE(review): presumably
   the path taken when no line feed was found in the bytes examined. */
294 output_bytes (r, s, n);
/* Continue while at least one whole code unit is buffered or, failing that,
   while more input can be read. */
296 while (r->length >= unit || fill_buffer (r) > 0);
/* True only if this call appended something to S. */
298 return ds_length (s) > original_length;
301 /* Returns the file descriptor underlying R. */
/* NOTE(review): the body is not visible in this excerpt; presumably it
   returns R->fd, the descriptor that read() and lseek() are applied to
   elsewhere in this file. */
303 line_reader_fileno (const struct line_reader *r)
308 /* Returns the offset in the file of the next byte to be read from R, or -1 on
309 error (e.g. if the file is not seekable). */
311 line_reader_tell (const struct line_reader *r)
/* Query the descriptor's current file offset without moving it. */
313 off_t pos = lseek (r->fd, 0, SEEK_CUR);
/* Subtract the R->length bytes still held in R's buffer.
   NOTE(review): R->length is assigned to a size_t in line_reader_read(); if
   the member itself is unsigned and as wide as off_t, the subtraction is done
   in unsigned arithmetic and MAX (0, ...) cannot clamp a result that would
   have been negative.  Confirm against the struct definition. */
315 pos = MAX (0, pos - r->length);
319 /* Returns true if end of file has been encountered reading R. */
321 line_reader_eof (const struct line_reader *r)
/* End of file is reported only once no buffered bytes remain. */
323 return r->eof && !r->length;
326 /* Returns a nonzero errno value if an error has been encountered reading
327 R, zero otherwise. */
329 line_reader_error (const struct line_reader *r)
/* While buffered bytes remain, zero is returned even if an error has been
   recorded; the error is reported once the buffer is empty. */
331 return !r->length ? r->error : 0;
334 /* Returns the encoding of R. If line_reader_is_auto(R) returns true, the
335 encoding might change as more lines are read. */
/* NOTE(review): the body is not visible in this excerpt; presumably it
   returns R->encoding.  line_reader_read() assigns a newly allocated string
   to that member when autodetection finishes, so a pointer obtained earlier
   may no longer be the current one -- confirm. */
337 line_reader_get_encoding (const struct line_reader *r)
342 /* Returns true if the encoding of the file being read by R is not yet
343 completely known. If this function returns true, then the encoding returned
344 by line_reader_get_encoding() might change as more lines are read (and after
345 the change, this function will return false). */
347 line_reader_is_auto (const struct line_reader *r)
/* line_reader_read() changes the state away from S_AUTO when it meets a byte
   that is not ASCII text. */
349 return r->state == S_AUTO;