From 33820c6420e31b9b7e878eeda38708cc447e4ca2 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 19 Aug 2023 18:39:32 -0700 Subject: [PATCH] find bad utf8 --- rust/src/main.rs | 61 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/rust/src/main.rs b/rust/src/main.rs index 6024b67f2d..56b007e05b 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -16,11 +16,11 @@ use anyhow::Result; use clap::Parser; -use pspp::raw::{Reader, Record}; +use pspp::raw::{Reader, Record, UnencodedStr, Value}; use std::fs::File; use std::io::BufReader; use std::path::{Path, PathBuf}; -use std::str; +use std::str::{self, from_utf8}; /// A utility to dissect SPSS system files. #[derive(Parser, Debug)] @@ -39,29 +39,70 @@ fn main() -> Result<()> { let Args { max_cases, files } = Args::parse(); for file in files { - dissect(&file, max_cases)?; + if let Err(error) = dissect(&file, max_cases) { + println!("{}: {error}", file.display()); + } } Ok(()) } -fn dissect(file_name: &Path, max_cases: u64) -> Result<()> { +fn dissect(file_name: &Path, _max_cases: u64) -> Result<()> { let reader = File::open(file_name)?; let reader = BufReader::new(reader); let mut reader = Reader::new(reader)?; let records: Vec = reader.collect_headers()?; + let mut character_code = None; for record in records { - println!("{record:?}"); + //println!("{record:?}"); + if let Record::IntegerInfo(ref info) = record { + character_code = Some(info.character_code); + } if let Record::EndOfHeaders(_) = record { break; }; } - for _ in 0..max_cases { - let Some(Ok(Record::Case(data))) = reader.next() else { - break; - }; - println!("{:?}", data); + if character_code != Some(65001) { + return Ok(()); + } + let mut n = 0; + while let Some(Ok(Record::Case(data))) = reader.next() { + n += 1; + let mut strings = Vec::new(); + for value in data.iter() { + if let Value::String(UnencodedStr(s)) = value { + strings.extend_from_slice(&s[..]); + } + } + + let mut rest = &strings[..]; + let mut any_errors = false; + while let Err(error) = from_utf8(&rest) { + if !any_errors { + print!("{}: UTF-8 error", file_name.display()); + any_errors = true; + } + let start = error.valid_up_to(); + let len = match error.error_len() { + Some(len) => len, + None => rest.len() - start + }; +// print!(" {}", (start + len) % 8); + print!("["); + for i in 0..len { + print!("{:02x}", rest[i + start]); + } + print!("]"); + rest = &rest[start + len..]; + } + if any_errors { + println!(); + println!("Lossy: {}", String::from_utf8_lossy(&strings[..]).replace(char::REPLACEMENT_CHARACTER, "??????").replace(&[' ', '\0'], "")); + return Ok(()) + } + //println!("{:?}", data); } + println!("{}: read {n} records", file_name.display()); Ok(()) } -- 2.30.2