X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=blobdiff_plain;f=src%2Flibpspp%2Fline-reader.c;fp=src%2Flibpspp%2Fline-reader.c;h=6f90b502cd856dd64db366ce1bb36561141648f0;hp=0000000000000000000000000000000000000000;hb=20af4ead4d4c440c5bc269274cd5a87fa9e7056d;hpb=bde3425960865b3740bca3e883864c7597dd59a3 diff --git a/src/libpspp/line-reader.c b/src/libpspp/line-reader.c new file mode 100644 index 0000000000..6f90b502cd --- /dev/null +++ b/src/libpspp/line-reader.c @@ -0,0 +1,350 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "line-reader.h" + +#include +#include +#include +#include +#include +#include + +#include "libpspp/assertion.h" +#include "libpspp/encoding-guesser.h" +#include "libpspp/i18n.h" +#include "libpspp/str.h" + +#include "gl/minmax.h" +#include "gl/xalloc.h" + +enum line_reader_state + { + S_UNIBYTE, /* Known stream encoding, 1-byte unit. */ + S_MULTIBYTE, /* Known stream encoding, multibyte unit. */ + S_AUTO /* Encoding autodetection in progress. */ + }; + +struct line_reader + { + int fd; + enum line_reader_state state; + struct encoding_info encoding_info; + + char *encoding; /* Current encoding. */ + char *auto_encoding; /* In S_AUTO mode, user-specified encoding. */ + + char *buffer; + char *head; + size_t length; + + int error; + bool eof; + }; + +static ssize_t fill_buffer (struct line_reader *); + +/* Opens FILENAME, which is encoded in ENCODING, for reading line by line, + passing FLAGS to the open() function. Returns a new line_reader if + successful, otherwise returns NULL and sets errno to an appropriate value. + + The accepted forms for ENCODING are listed at the top of + encoding-guesser.h. */ +struct line_reader * +line_reader_for_file (const char *encoding, const char *filename, int flags) +{ + struct line_reader *r; + int fd; + + assert (!(flags & O_CREAT)); + + fd = open (filename, flags); + if (fd < 0) + return NULL; + + r = line_reader_for_fd (encoding, fd); + if (r == NULL) + { + int save_errno = errno; + close (fd); + errno = save_errno; + } + + return r; +} + +/* Creates and returns a new line_reader that reads its input from FD. Returns + a new line_reader if successful, otherwise returns NULL and sets errno to an + appropriate value. + + The accepted forms for ENCODING are listed at the top of + encoding-guesser.h. */ +struct line_reader * +line_reader_for_fd (const char *encoding, int fd) +{ + struct line_reader *r; + + r = calloc (1, sizeof *r); + if (r == NULL) + return NULL; + + r->fd = fd; + r->buffer = malloc (LINE_READER_BUFFER_SIZE); + if (r->buffer == NULL) + goto error; + r->head = r->buffer; + r->length = 0; + + if (fill_buffer (r) < 0) + goto error; + + r->encoding = xstrdup (encoding_guess_head_encoding ( + encoding, r->buffer, r->length)); + if (!get_encoding_info (&r->encoding_info, r->encoding)) + { + errno = EINVAL; + goto error; + } + + if (encoding_guess_encoding_is_auto (encoding) + && !strcmp (r->encoding, "ASCII")) + { + r->state = S_AUTO; + r->auto_encoding = xstrdup (encoding); + } + else + r->state = r->encoding_info.unit == 1 ? S_UNIBYTE : S_MULTIBYTE; + + return r; + +error: + line_reader_free (r); + return NULL; +} + +/* Closes R and its underlying file descriptor and frees all associated + resources. Returns the return value from close(). */ +int +line_reader_close (struct line_reader *r) +{ + if (r != NULL) + { + int fd = r->fd; + line_reader_free (r); + return close (fd); + } + return 0; +} + +/* Frees R and associated resources, but does not close the underlying file + descriptor. (Thus, the client must close the file descriptor when it is no + longer needed.) */ +void +line_reader_free (struct line_reader *r) +{ + if (r != NULL) + { + free (r->buffer); + free (r->encoding); + free (r->auto_encoding); + free (r); + } +} + +static ssize_t +fill_buffer (struct line_reader *r) +{ + ssize_t n; + + /* Move any unused bytes to the beginning of the input buffer. */ + if (r->length > 0 && r->buffer != r->head) + memmove (r->buffer, r->head, r->length); + r->head = r->buffer; + + /* Read more input. */ + do + { + n = read (r->fd, r->buffer + r->length, + LINE_READER_BUFFER_SIZE - r->length); + } + while (n < 0 && errno == EINTR); + if (n > 0) + r->length += n; + else if (n < 0) + r->error = errno; + else + r->eof = true; + return n; +} + +static void +output_bytes (struct line_reader *r, struct string *s, size_t n) +{ + ds_put_substring (s, ss_buffer (r->head, n)); + r->head += n; + r->length -= n; +} + +static void +output_line (struct line_reader *r, struct string *s, size_t n) +{ + int unit = r->encoding_info.unit; + + output_bytes (r, s, n); + + r->head += unit; + r->length -= unit; + + ds_chomp (s, ss_buffer (r->encoding_info.cr, unit)); +} + +/* Reads a line of text, but no more than MAX_LENGTH bytes, from R and appends + it to S, omitting the final new-line and the carriage return that + immediately precedes it, if one is present. The line is left in its + original encoding. + + Returns true if anything was successfully read from the file. (If an empty + line was read, then nothing is appended to S.) Returns false if end of file + was reached or a read error occurred before any text could be read. */ +bool +line_reader_read (struct line_reader *r, struct string *s, size_t max_length) +{ + size_t original_length = ds_length (s); + int unit = r->encoding_info.unit; + + do + { + size_t max_out = max_length - (ds_length (s) - original_length); + size_t max_in = r->length; + size_t max = MIN (max_in, max_out); + size_t n; + char *p; + + if (max_out < unit) + break; + + switch (r->state) + { + case S_UNIBYTE: + p = memchr (r->head, r->encoding_info.lf[0], max); + if (p != NULL) + { + output_line (r, s, p - r->head); + return true; + } + n = max; + break; + + case S_MULTIBYTE: + for (n = 0; n + unit <= max; n += unit) + if (!memcmp (r->head + n, r->encoding_info.lf, unit)) + { + output_line (r, s, n); + return true; + } + break; + + case S_AUTO: + for (n = 0; n < max; n++) + if (!encoding_guess_is_ascii_text (r->head[n])) + { + char *encoding; + + output_bytes (r, s, n); + fill_buffer (r); + r->state = S_UNIBYTE; + + encoding = xstrdup (encoding_guess_tail_encoding ( + r->auto_encoding, r->head, r->length)); + free (r->encoding); + r->encoding = encoding; + + free (r->auto_encoding); + r->auto_encoding = NULL; + + n = 0; + break; + } + else if (r->head[n] == '\n') + { + output_line (r, s, n); + return true; + } + break; + + default: + NOT_REACHED (); + } + + output_bytes (r, s, n); + } + while (r->length >= unit || fill_buffer (r) > 0); + + return ds_length (s) > original_length; +} + +/* Returns the file descriptor underlying R. */ +int +line_reader_fileno (const struct line_reader *r) +{ + return r->fd; +} + +/* Returns the offset in the file of the next byte to be read from R, or -1 on + error (e.g. if the file is not seekable). */ +off_t +line_reader_tell (const struct line_reader *r) +{ + off_t pos = lseek (r->fd, 0, SEEK_CUR); + if (pos >= 0) + pos = MAX (0, pos - r->length); + return pos; +} + +/* Returns true if end of file has been encountered reading R. */ +bool +line_reader_eof (const struct line_reader *r) +{ + return r->eof && !r->length; +} + +/* Returns an nonzero errno value if an error has been encountered reading + R, zero otherwise. */ +int +line_reader_error (const struct line_reader *r) +{ + return !r->length ? r->error : 0; +} + +/* Returns the encoding of R. If line_reader_is_auto(R) returns true, the + encoding might change as more lines are read. */ +const char * +line_reader_get_encoding (const struct line_reader *r) +{ + return r->encoding; +} + +/* Returns true if the encoding of the file being read by R is not yet + completely known. If this function returns true, then the encoding returned + by line_reader_get_encoding() might change as more lines are read (and after + the change, this function will return false). */ +bool +line_reader_is_auto (const struct line_reader *r) +{ + return r->state == S_AUTO; +}