From 20af4ead4d4c440c5bc269274cd5a87fa9e7056d Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 19 Mar 2011 16:26:55 -0700 Subject: [PATCH] line-reader: New library for reading a file line-by-line. This library reads a file line-by-line in an arbitrary 8-bit or wider encoding, without requiring the file to be recoded. This will be used in an upcoming commit. --- src/libpspp/automake.mk | 2 + src/libpspp/line-reader.c | 350 +++++++++++++++++++++++++++++++ src/libpspp/line-reader.h | 54 +++++ tests/automake.mk | 5 + tests/libpspp/line-reader-test.c | 130 ++++++++++++ tests/libpspp/line-reader.at | 74 +++++++ 6 files changed, 615 insertions(+) create mode 100644 src/libpspp/line-reader.c create mode 100644 src/libpspp/line-reader.h create mode 100644 tests/libpspp/line-reader-test.c create mode 100644 tests/libpspp/line-reader.at diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk index 244f1d1520..2f81243cf9 100644 --- a/src/libpspp/automake.mk +++ b/src/libpspp/automake.mk @@ -44,6 +44,8 @@ src_libpspp_liblibpspp_la_SOURCES = \ src/libpspp/integer-format.h \ src/libpspp/intern.c \ src/libpspp/intern.h \ + src/libpspp/line-reader.c \ + src/libpspp/line-reader.h \ src/libpspp/ll.c \ src/libpspp/ll.h \ src/libpspp/llx.c \ diff --git a/src/libpspp/line-reader.c b/src/libpspp/line-reader.c new file mode 100644 index 0000000000..6f90b502cd --- /dev/null +++ b/src/libpspp/line-reader.c @@ -0,0 +1,350 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include + +#include "line-reader.h" + +#include +#include +#include +#include +#include +#include + +#include "libpspp/assertion.h" +#include "libpspp/encoding-guesser.h" +#include "libpspp/i18n.h" +#include "libpspp/str.h" + +#include "gl/minmax.h" +#include "gl/xalloc.h" + +enum line_reader_state + { + S_UNIBYTE, /* Known stream encoding, 1-byte unit. */ + S_MULTIBYTE, /* Known stream encoding, multibyte unit. */ + S_AUTO /* Encoding autodetection in progress. */ + }; + +struct line_reader + { + int fd; + enum line_reader_state state; + struct encoding_info encoding_info; + + char *encoding; /* Current encoding. */ + char *auto_encoding; /* In S_AUTO mode, user-specified encoding. */ + + char *buffer; + char *head; + size_t length; + + int error; + bool eof; + }; + +static ssize_t fill_buffer (struct line_reader *); + +/* Opens FILENAME, which is encoded in ENCODING, for reading line by line, + passing FLAGS to the open() function. Returns a new line_reader if + successful, otherwise returns NULL and sets errno to an appropriate value. + + The accepted forms for ENCODING are listed at the top of + encoding-guesser.h. */ +struct line_reader * +line_reader_for_file (const char *encoding, const char *filename, int flags) +{ + struct line_reader *r; + int fd; + + assert (!(flags & O_CREAT)); + + fd = open (filename, flags); + if (fd < 0) + return NULL; + + r = line_reader_for_fd (encoding, fd); + if (r == NULL) + { + int save_errno = errno; + close (fd); + errno = save_errno; + } + + return r; +} + +/* Creates and returns a new line_reader that reads its input from FD. Returns + a new line_reader if successful, otherwise returns NULL and sets errno to an + appropriate value. + + The accepted forms for ENCODING are listed at the top of + encoding-guesser.h. 
*/ +struct line_reader * +line_reader_for_fd (const char *encoding, int fd) +{ + struct line_reader *r; + + r = calloc (1, sizeof *r); + if (r == NULL) + return NULL; + + r->fd = fd; + r->buffer = malloc (LINE_READER_BUFFER_SIZE); + if (r->buffer == NULL) + goto error; + r->head = r->buffer; + r->length = 0; + + if (fill_buffer (r) < 0) + goto error; + + r->encoding = xstrdup (encoding_guess_head_encoding ( + encoding, r->buffer, r->length)); + if (!get_encoding_info (&r->encoding_info, r->encoding)) + { + errno = EINVAL; + goto error; + } + + if (encoding_guess_encoding_is_auto (encoding) + && !strcmp (r->encoding, "ASCII")) + { + r->state = S_AUTO; + r->auto_encoding = xstrdup (encoding); + } + else + r->state = r->encoding_info.unit == 1 ? S_UNIBYTE : S_MULTIBYTE; + + return r; + +error: + line_reader_free (r); + return NULL; +} + +/* Closes R and its underlying file descriptor and frees all associated + resources. Returns the return value from close(). */ +int +line_reader_close (struct line_reader *r) +{ + if (r != NULL) + { + int fd = r->fd; + line_reader_free (r); + return close (fd); + } + return 0; +} + +/* Frees R and associated resources, but does not close the underlying file + descriptor. (Thus, the client must close the file descriptor when it is no + longer needed.) */ +void +line_reader_free (struct line_reader *r) +{ + if (r != NULL) + { + free (r->buffer); + free (r->encoding); + free (r->auto_encoding); + free (r); + } +} + +static ssize_t +fill_buffer (struct line_reader *r) +{ + ssize_t n; + + /* Move any unused bytes to the beginning of the input buffer. */ + if (r->length > 0 && r->buffer != r->head) + memmove (r->buffer, r->head, r->length); + r->head = r->buffer; + + /* Read more input. 
*/ + do + { + n = read (r->fd, r->buffer + r->length, + LINE_READER_BUFFER_SIZE - r->length); + } + while (n < 0 && errno == EINTR); + if (n > 0) + r->length += n; + else if (n < 0) + r->error = errno; + else + r->eof = true; + return n; +} + +static void +output_bytes (struct line_reader *r, struct string *s, size_t n) +{ + ds_put_substring (s, ss_buffer (r->head, n)); + r->head += n; + r->length -= n; +} + +static void +output_line (struct line_reader *r, struct string *s, size_t n) +{ + int unit = r->encoding_info.unit; + + output_bytes (r, s, n); + + r->head += unit; + r->length -= unit; + + ds_chomp (s, ss_buffer (r->encoding_info.cr, unit)); +} + +/* Reads a line of text, but no more than MAX_LENGTH bytes, from R and appends + it to S, omitting the final new-line and the carriage return that + immediately precedes it, if one is present. The line is left in its + original encoding. + + Returns true if anything was successfully read from the file. (If an empty + line was read, then nothing is appended to S.) Returns false if end of file + was reached or a read error occurred before any text could be read. 
*/ +bool +line_reader_read (struct line_reader *r, struct string *s, size_t max_length) +{ + size_t original_length = ds_length (s); + int unit = r->encoding_info.unit; + + do + { + size_t max_out = max_length - (ds_length (s) - original_length); + size_t max_in = r->length; + size_t max = MIN (max_in, max_out); + size_t n; + char *p; + + if (max_out < unit) + break; + + switch (r->state) + { + case S_UNIBYTE: + p = memchr (r->head, r->encoding_info.lf[0], max); + if (p != NULL) + { + output_line (r, s, p - r->head); + return true; + } + n = max; + break; + + case S_MULTIBYTE: + for (n = 0; n + unit <= max; n += unit) + if (!memcmp (r->head + n, r->encoding_info.lf, unit)) + { + output_line (r, s, n); + return true; + } + break; + + case S_AUTO: + for (n = 0; n < max; n++) + if (!encoding_guess_is_ascii_text (r->head[n])) + { + char *encoding; + + output_bytes (r, s, n); + fill_buffer (r); + r->state = S_UNIBYTE; + + encoding = xstrdup (encoding_guess_tail_encoding ( + r->auto_encoding, r->head, r->length)); + free (r->encoding); + r->encoding = encoding; + + free (r->auto_encoding); + r->auto_encoding = NULL; + + n = 0; + break; + } + else if (r->head[n] == '\n') + { + output_line (r, s, n); + return true; + } + break; + + default: + NOT_REACHED (); + } + + output_bytes (r, s, n); + } + while (r->length >= unit || fill_buffer (r) > 0); + + return ds_length (s) > original_length; +} + +/* Returns the file descriptor underlying R. */ +int +line_reader_fileno (const struct line_reader *r) +{ + return r->fd; +} + +/* Returns the offset in the file of the next byte to be read from R, or -1 on + error (e.g. if the file is not seekable). */ +off_t +line_reader_tell (const struct line_reader *r) +{ + off_t pos = lseek (r->fd, 0, SEEK_CUR); + if (pos >= 0) + pos = MAX (0, pos - r->length); + return pos; +} + +/* Returns true if end of file has been encountered reading R. 
*/ +bool +line_reader_eof (const struct line_reader *r) +{ + return r->eof && !r->length; +} + +/* Returns a nonzero errno value if an error has been encountered reading + R, zero otherwise. */ +int +line_reader_error (const struct line_reader *r) +{ + return !r->length ? r->error : 0; +} + +/* Returns the encoding of R. If line_reader_is_auto(R) returns true, the + encoding might change as more lines are read. */ +const char * +line_reader_get_encoding (const struct line_reader *r) +{ + return r->encoding; +} + +/* Returns true if the encoding of the file being read by R is not yet + completely known. If this function returns true, then the encoding returned + by line_reader_get_encoding() might change as more lines are read (and after + the change, this function will return false). */ +bool +line_reader_is_auto (const struct line_reader *r) +{ + return r->state == S_AUTO; +} diff --git a/src/libpspp/line-reader.h b/src/libpspp/line-reader.h new file mode 100644 index 0000000000..e9b2f50107 --- /dev/null +++ b/src/libpspp/line-reader.h @@ -0,0 +1,54 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef LIBPSPP_LINE_READER_H +#define LIBPSPP_LINE_READER_H 1 + +#include +#include +#include + +/* line_reader. 
+ + Reads a text file in an arbitrary encoding one line at a time, with + optional automatic encoding detection. +*/ + +#define LINE_READER_BUFFER_SIZE 4096 + +struct string; + +struct line_reader *line_reader_for_fd (const char *encoding, int fd); +struct line_reader *line_reader_for_file (const char *encoding, + const char *filename, int flags); + +int line_reader_close (struct line_reader *); +void line_reader_free (struct line_reader *); + +bool line_reader_read (struct line_reader *, struct string *, + size_t max_length); + +int line_reader_fileno (const struct line_reader *); +off_t line_reader_tell (const struct line_reader *); + +bool line_reader_eof (const struct line_reader *); +int line_reader_error (const struct line_reader *); + +const char *line_reader_get_encoding (const struct line_reader *); + +bool line_reader_is_auto (const struct line_reader *); + +#endif /* libpspp/line-reader.h */ diff --git a/tests/automake.mk b/tests/automake.mk index 0af3d1e604..b8e4c2dd19 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -14,6 +14,7 @@ check_PROGRAMS += \ tests/libpspp/hmap-test \ tests/libpspp/hmapx-test \ tests/libpspp/i18n-test \ + tests/libpspp/line-reader-test \ tests/libpspp/ll-test \ tests/libpspp/llx-test \ tests/libpspp/range-map-test \ @@ -43,6 +44,9 @@ tests_data_sack_SOURCES = \ tests_data_sack_LDADD = src/libpspp-core.la tests_data_sack_CFLAGS = $(AM_CFLAGS) +tests_libpspp_line_reader_test_SOURCES = tests/libpspp/line-reader-test.c +tests_libpspp_line_reader_test_LDADD = src/libpspp/liblibpspp.la gl/libgl.la + tests_libpspp_ll_test_SOURCES = \ src/libpspp/ll.c \ tests/libpspp/ll-test.c @@ -320,6 +324,7 @@ TESTSUITE_AT = \ tests/libpspp/hmap.at \ tests/libpspp/hmapx.at \ tests/libpspp/i18n.at \ + tests/libpspp/line-reader.at \ tests/libpspp/ll.at \ tests/libpspp/llx.at \ tests/libpspp/range-map.at \ diff --git a/tests/libpspp/line-reader-test.c b/tests/libpspp/line-reader-test.c new file mode 100644 index 0000000000..fef9eb6390 --- 
/dev/null +++ b/tests/libpspp/line-reader-test.c @@ -0,0 +1,130 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include + +#include "libpspp/line-reader.h" + +#include +#include +#include +#include +#include +#include + +#include "libpspp/i18n.h" +#include "libpspp/str.h" + +#include "gl/error.h" +#include "gl/progname.h" +#include "gl/xalloc.h" + +static void +usage (void) +{ + printf ("usage: %s COMMAND [ARG]...\n" + "The available commands are:\n" + " help\n" + " print this usage message\n" + " buffer-size\n" + " print the buffer size, in bytes, on stdout\n" + " read FILE ENCODING\n" + " read FILE encoded in ENCODING and print it in UTF-8\n", + program_name); + exit (0); +} + +static void +cmd_read (int argc, char *argv[]) +{ + struct line_reader *r; + const char *filename; + struct string line; + char *encoding; + + if (argc != 4) + error (1, 0, "bad syntax for `%s' command; use `%s help' for help", + argv[1], program_name); + + filename = argv[2]; + + r = (!strcmp(filename, "-") + ? 
line_reader_for_fd (argv[3], STDIN_FILENO) + : line_reader_for_file (argv[3], filename, O_RDONLY)); + if (r == NULL) + error (1, errno, "line_reader_open failed"); + + encoding = xstrdup (line_reader_get_encoding (r)); + printf ("encoded in %s", encoding); + if (line_reader_is_auto (r)) + printf (" (auto)"); + printf ("\n"); + + ds_init_empty (&line); + while (line_reader_read (r, &line, SIZE_MAX)) + { + const char *new_encoding; + char *utf8_line; + + new_encoding = line_reader_get_encoding (r); + if (strcmp (encoding, new_encoding)) + { + free (encoding); + encoding = xstrdup (new_encoding); + + printf ("encoded in %s", encoding); + if (line_reader_is_auto (r)) + printf (" (auto)"); + printf ("\n"); + } + + utf8_line = recode_string ("UTF-8", encoding, + ds_data (&line), ds_length (&line)); + printf ("\"%s\"\n", utf8_line); + free (utf8_line); + + ds_clear (&line); + } + + if (!strcmp(filename, "-")) + line_reader_free (r); + else + { + if (line_reader_close (r) != 0) + error (1, errno, "line_reader_close failed"); + } +} + +int +main (int argc, char *argv[]) +{ + set_program_name (argv[0]); + i18n_init (); + + if (argc < 2) + error (1, 0, "missing command name; use `%s help' for help", program_name); + else if (!strcmp(argv[1], "help") || !strcmp(argv[1], "--help")) + usage (); + else if (!strcmp(argv[1], "buffer-size")) + printf ("%d\n", LINE_READER_BUFFER_SIZE); + else if (!strcmp(argv[1], "read")) + cmd_read (argc, argv); + else + error (1, 0, "unknown command `%s'; use `%s help' for help", + argv[1], program_name); + + return 0; +} diff --git a/tests/libpspp/line-reader.at b/tests/libpspp/line-reader.at new file mode 100644 index 0000000000..29cff4d792 --- /dev/null +++ b/tests/libpspp/line-reader.at @@ -0,0 +1,74 @@ +AT_BANNER([line_reader]) + +AT_SETUP([read ASCII]) +AT_KEYWORDS([line_reader]) +AT_CHECK([i18n-test supports_encodings ASCII]) +AT_CHECK([echo string | line-reader-test read - ASCII], [0], [dnl +encoded in ASCII +"string" +]) +AT_CLEANUP + 
+AT_SETUP([read UTF-8]) +AT_KEYWORDS([line_reader]) +AT_CHECK([printf '\346\227\245\346\234\254\350\252\236\n' | line-reader-test read - UTF-8], [0], [dnl +encoded in UTF-8 +"日本語" +]) +AT_CLEANUP + +AT_SETUP([read EUC-JP]) +AT_KEYWORDS([line_reader]) +AT_CHECK([i18n-test supports_encodings EUC-JP]) +AT_CHECK([printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | line-reader-test read - EUC-JP], [0], [dnl +encoded in EUC-JP +"ぁ あ ぃ い ぅ う ぇ え ぉ お" +]) +AT_CLEANUP + +AT_SETUP([read ASCII as Auto]) +AT_KEYWORDS([line_reader]) +AT_CHECK([echo string | line-reader-test read - Auto], [0], [dnl +encoded in ASCII (auto) +"string" +]) +AT_CLEANUP + +AT_SETUP([read UTF-8 as Auto]) +AT_KEYWORDS([line_reader]) +AT_CHECK([printf 'entr\303\251e\n' | line-reader-test read - Auto], [0], [dnl +encoded in ASCII (auto) +encoded in UTF-8 +"entrée" +]) +AT_CLEANUP + +AT_SETUP([read ISO-8859-1 as Auto,ISO-8859-1]) +AT_KEYWORDS([line_reader]) +AT_CHECK([i18n-test supports_encodings ISO-8859-1]) +buffer_size=`line-reader-test buffer-size` +($PERL -e "print 'x' x ($buffer_size - 2)" + printf '\none line\ntwo lines\nentr\351e\nfour lines\n') > input +(printf 'encoded in ASCII (auto)\n\"' + $PERL -e "print 'x' x ($buffer_size - 2)" + printf '\"\n"one line"\n"two lines"\nencoded in ISO-8859-1\n"entr\303\251e"\n"four lines"\n') > expout +AT_CHECK([line-reader-test read input Auto,ISO-8859-1], [0], [expout]) +AT_CLEANUP + +AT_SETUP([read UTF-16BE as Auto,UTF-16BE]) +AT_KEYWORDS([line_reader]) +AT_CHECK([i18n-test supports_encodings UTF-16BE]) +AT_CHECK([printf '\0e\0n\0t\0r\0\351\0e\0\n' | line-reader-test read - Auto,UTF-16BE], + [0], [encoded in UTF-16BE +"entrée" +]) +AT_CLEANUP + +AT_SETUP([read EUC-JP as Auto,EUC-JP]) +AT_KEYWORDS([line_reader]) +AT_CHECK([i18n-test supports_encodings EUC-JP]) +AT_CHECK([printf 'entr\217\253\261e\n' | line-reader-test read - Auto,EUC-JP], + [0], [encoded in EUC-JP +"entrée" +]) +AT_CLEANUP -- 2.30.2