From 0697a4277a80981c438518e844ae8174ea9b8453 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Wed, 24 Apr 2013 22:14:01 -0700 Subject: [PATCH] u8-istream: Fix handling of UTF-8 byte order marks. The UTF-8 special case state didn't skip over an initial byte order mark. This fixes it. Reported by Dr. Holger Handstein. Signed-off-by: Ben Pfaff --- src/libpspp/u8-istream.c | 11 +++++++++-- tests/libpspp/u8-istream.at | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/libpspp/u8-istream.c b/src/libpspp/u8-istream.c index b172b164f5..22135dbb33 100644 --- a/src/libpspp/u8-istream.c +++ b/src/libpspp/u8-istream.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2010, 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -122,7 +122,14 @@ u8_istream_for_fd (const char *fromcode, int fd) encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length); if (is_encoding_utf8 (encoding)) - is->state = S_UTF8; + { + unsigned int bom_len; + + is->state = S_UTF8; + bom_len = encoding_guess_bom_length (encoding, is->buffer, is->length); + is->head += bom_len; + is->length -= bom_len; + } else { if (encoding_guess_encoding_is_auto (fromcode) diff --git a/tests/libpspp/u8-istream.at b/tests/libpspp/u8-istream.at index 842e77336d..9757f9605b 100644 --- a/tests/libpspp/u8-istream.at +++ b/tests/libpspp/u8-istream.at @@ -109,6 +109,11 @@ Auto mode entrée UTF-8 mode ]) +AT_CHECK([printf '\357\273\277entr\303\251e\n' | u8-istream-test read - Auto], [0], [dnl +UTF-8 mode +entrée +UTF-8 mode +]) AT_CLEANUP AT_SETUP([read ISO-8859-1 as Auto,ISO-8859-1]) -- 2.30.2