From: Ben Pfaff Date: Thu, 13 Nov 2025 16:51:52 +0000 (-0800) Subject: one xml test file works OK at least for text output X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=84ce4550ff03e394f30ab5971e0ae1c8f4485aab;p=pspp one xml test file works OK at least for text output --- diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 07d54ed734..d51db7cebb 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1937,9 +1937,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.37.5" +version = "0.38.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "memchr", "serde", diff --git a/rust/pspp/Cargo.toml b/rust/pspp/Cargo.toml index 47e4671e8c..b1fd21b0ed 100644 --- a/rust/pspp/Cargo.toml +++ b/rust/pspp/Cargo.toml @@ -30,7 +30,7 @@ libm = "0.2.11" smallstr = "0.3.0" itertools = "0.14.0" unicode-linebreak = "0.1.5" -quick-xml = { version = "0.37.2", features = ["serialize"] } +quick-xml = { version = "0.38.4", features = ["serialize"] } serde = { version = "1.0.218", features = ["derive", "rc"] } color = { version = "0.2.3", features = ["serde"] } binrw = "0.14.1" diff --git a/rust/pspp/src/output/drivers/cairo/fsm.rs b/rust/pspp/src/output/drivers/cairo/fsm.rs index e4b8002a50..8f87532850 100644 --- a/rust/pspp/src/output/drivers/cairo/fsm.rs +++ b/rust/pspp/src/output/drivers/cairo/fsm.rs @@ -332,7 +332,19 @@ impl CairoDevice<'_> { let (body, suffixes) = cell.display().split_suffixes(); let horz_align = cell.horz_align(&body); - let body = body.to_string(); + + let mut attrs = None; + let mut body = if let Some(markup) = body.markup() { + match parse_markup(markup, 0 as char) { + Ok((markup_attrs, string, _accel)) => { + attrs = Some(markup_attrs); + string.into() + } + Err(_) => String::from(markup), + } + } else { + avoid_decimal_split(body.to_string()) + }; match horz_align { HorzAlign::Decimal { offset, decimal } if !cell.rotate => { @@ -348,19 +360,6 @@ impl CairoDevice<'_> { _ => (), } - let mut attrs = None; - let mut body = if cell.font_style.markup { - match parse_markup(&body, 0 as char) { - Ok((markup_attrs, string, _accel)) => { - attrs = Some(markup_attrs); - string.into() - } - Err(_) => body, - } - } else { - avoid_decimal_split(body) - }; - if cell.font_style.underline { attrs .get_or_insert_default() diff --git a/rust/pspp/src/output/drivers/spv.rs b/rust/pspp/src/output/drivers/spv.rs index db1a9e724b..82e424b086 100644 --- a/rust/pspp/src/output/drivers/spv.rs +++ b/rust/pspp/src/output/drivers/spv.rs @@ -1337,6 +1337,17 @@ impl BinWrite for Value { ) .write_options(writer, endian, args)?; } + ValueInner::Markup(markup) => { + ( + 3u8, + SpvString(&markup.xml), // XXX + ValueMod::new(self), + SpvString(&markup.xml), + SpvString(&markup.xml), + SpvBool(true), + ) + .write_options(writer, endian, args)?; + } ValueInner::Text(text) => { ( 3u8, diff --git a/rust/pspp/src/output/pivot.rs b/rust/pspp/src/output/pivot.rs index badb4f8ea9..8f6a91e373 100644 --- a/rust/pspp/src/output/pivot.rs +++ b/rust/pspp/src/output/pivot.rs @@ -60,7 +60,7 @@ use enum_iterator::Sequence; use enum_map::{Enum, EnumMap, enum_map}; use itertools::Itertools; pub use look_xml::{Length, TableProperties}; -use quick_xml::{DeError, de::from_str}; +use quick_xml::{DeError, de::from_str, escape::resolve_xml_entity, events::Event}; use serde::{ Deserialize, Serialize, Serializer, de::Visitor, @@ -818,28 +818,28 @@ impl Category { pub fn as_group(&self) -> Option<&Group> { match self { Category::Group(group) => Some(group), - Category::Leaf(leaf) => None, + Category::Leaf(_) => None, } } pub fn as_group_mut(&mut self) -> Option<&mut Group> { match self { Category::Group(group) => Some(group), - Category::Leaf(leaf) => None, + Category::Leaf(_) => None, } } pub fn as_leaf(&self) -> Option<&Leaf> { match self { - Category::Group(group) => None, Category::Leaf(leaf) => Some(leaf), + Category::Group(_) => None, } } pub fn as_leaf_mut(&mut self) -> Option<&mut Leaf> { match self { - Category::Group(group) => None, Category::Leaf(leaf) => Some(leaf), + Category::Group(_) => None, } } @@ -884,10 +884,10 @@ impl Category { } } - fn index_path(&self, index: usize, mut path: IndexVec) -> Option { + fn index_path(&self, index: usize, path: IndexVec) -> Option { match self { Category::Group(group) => group.index_path(index, path), - Category::Leaf(leaf) if index == 0 => Some(path), + Category::Leaf(_) if index == 0 => Some(path), _ => None, } } @@ -1255,7 +1255,6 @@ pub struct FontStyle { pub bold: bool, pub italic: bool, pub underline: bool, - pub markup: bool, pub font: String, pub fg: Color, pub bg: Color, @@ -1270,7 +1269,6 @@ impl Default for FontStyle { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -1292,9 +1290,6 @@ impl FontStyle { pub fn with_underline(self, underline: bool) -> Self { Self { underline, ..self } } - pub fn with_markup(self, markup: bool) -> Self { - Self { markup, ..self } - } pub fn with_font(self, font: impl Into) -> Self { Self { font: font.into(), @@ -2400,6 +2395,7 @@ impl Value { ValueInner::String(string_value) => string_value.s.serialize(serializer), ValueInner::Variable(variable_value) => variable_value.var_name.serialize(serializer), ValueInner::Text(text_value) => text_value.localized.serialize(serializer), + ValueInner::Markup(markup) => markup.xml.serialize(serializer), /*XXX*/ ValueInner::Template(template_value) => template_value.localized.serialize(serializer), ValueInner::Empty => serializer.serialize_none(), } @@ -2496,6 +2492,9 @@ impl Value { localized, })) } + pub fn new_markup(s: impl Into) -> Self { + Self::new(ValueInner::Markup(Markup { xml: s.into() })) + } pub fn new_user_text(s: impl Into) -> Self { let s: String = s.into(); if s.is_empty() { @@ -2627,7 +2626,6 @@ impl From<&Variable> for Value { pub struct DisplayValue<'a> { inner: &'a ValueInner, - markup: bool, subscripts: &'a [String], footnotes: &'a [Arc], options: ValueOptions, @@ -2663,6 +2661,10 @@ impl<'a> DisplayValue<'a> { } } + pub fn markup(&self) -> Option<&str> { + self.inner.markup() + } + /// Returns this display split into `(body, suffixes)` where `suffixes` is /// subscripts and footnotes and `body` is everything else. pub fn split_suffixes(self) -> (Self, Self) { @@ -2674,21 +2676,11 @@ impl<'a> DisplayValue<'a> { } pub fn with_styling(mut self, styling: &'a ValueStyle) -> Self { - if let Some(font_style) = &styling.font_style { - self.markup = font_style.markup; - } self.subscripts = styling.subscripts.as_slice(); self.footnotes = styling.footnotes.as_slice(); self } - pub fn with_font_style(self, font_style: &FontStyle) -> Self { - Self { - markup: font_style.markup, - ..self - } - } - pub fn with_subscripts(self, subscripts: &'a [String]) -> Self { Self { subscripts, ..self } } @@ -2877,15 +2869,31 @@ impl Display for DisplayValue<'_> { } } - ValueInner::Text(TextValue { - localized: local, .. - }) => { - if self.markup { - dbg!(local); + ValueInner::Markup(Markup { xml }) => { + let mut reader = quick_xml::Reader::from_str(xml.as_str()); + while let Ok(event) = reader.read_event() { + match event { + Event::Text(bytes_text) => { + f.write_str(&bytes_text.decode().unwrap())?; + } + Event::GeneralRef(bytes) => { + if let Ok(entity) = bytes.decode() + && let Some(s) = resolve_xml_entity(&entity) + { + f.write_str(s)?; + } + } + Event::Eof => break, + _ => (), + } } - f.write_str(local) + Ok(()) } + ValueInner::Text(TextValue { + localized: local, .. + }) => f.write_str(local), + ValueInner::Template(TemplateValue { args, localized: local, @@ -3015,6 +3023,11 @@ pub struct VariableValue { pub variable_label: Option, } +#[derive(Clone, Debug, PartialEq, Serialize /*XXX*/)] +pub struct Markup { + pub xml: String, +} + #[derive(Clone, Debug, PartialEq)] pub struct TextValue { pub user_provided: bool, @@ -3077,6 +3090,7 @@ pub enum ValueInner { String(StringValue), Variable(VariableValue), Text(TextValue), + Markup(Markup), Template(TemplateValue), #[default] @@ -3118,6 +3132,13 @@ impl ValueInner { _ => None, } } + + fn markup(&self) -> Option<&str> { + match self { + ValueInner::Markup(markup) => Some(&markup.xml), + _ => None, + } + } } #[derive(Clone, Debug, Default, PartialEq)] @@ -3161,7 +3182,6 @@ impl ValueInner { }; DisplayValue { inner: self, - markup: false, subscripts: &[], footnotes: &[], options, diff --git a/rust/pspp/src/output/pivot/look_xml.rs b/rust/pspp/src/output/pivot/look_xml.rs index 908c322458..96086d8e85 100644 --- a/rust/pspp/src/output/pivot/look_xml.rs +++ b/rust/pspp/src/output/pivot/look_xml.rs @@ -230,7 +230,6 @@ impl CellStyle { bold: self.font_weight == FontWeight::Bold, italic: self.font_style == FontStyle::Italic, underline: self.font_underline == FontUnderline::Underline, - markup: false, font: self.font_family.clone(), fg: match data_row { RowParity::Even => self.color.unwrap_or(Color::BLACK), @@ -581,7 +580,6 @@ mod tests { bold: true, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -609,7 +607,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -637,7 +634,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -665,7 +661,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -695,7 +690,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -725,7 +719,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -753,7 +746,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, @@ -781,7 +773,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::BLACK, @@ -809,7 +800,6 @@ mod tests { bold: false, italic: false, underline: false, - markup: false, font: String::from("Sans Serif"), fg: Color::BLACK, bg: Color::WHITE, diff --git a/rust/pspp/src/output/pivot/tlo.rs b/rust/pspp/src/output/pivot/tlo.rs index 12f474f12c..b83526186a 100644 --- a/rust/pspp/src/output/pivot/tlo.rs +++ b/rust/pspp/src/output/pivot/tlo.rs @@ -371,7 +371,6 @@ impl super::AreaStyle { bold: style.weight > 400, italic: style.italic, underline: style.underline, - markup: false, font: style.font_name.string.clone(), fg: style.text_color, bg, diff --git a/rust/pspp/src/output/spv.rs b/rust/pspp/src/output/spv.rs index f2b22359d5..9512da3451 100644 --- a/rust/pspp/src/output/spv.rs +++ b/rust/pspp/src/output/spv.rs @@ -30,7 +30,7 @@ use zip::{ZipArchive, result::ZipError}; use crate::output::{ Details, Item, SpvInfo, SpvMembers, Text, page::PageSetup, - pivot::{Look, PivotTable, TableProperties, Value}, + pivot::{Look, TableProperties, Value}, spv::{ legacy_bin::LegacyBin, legacy_xml::Visualization, @@ -481,7 +481,6 @@ struct ContainerText { impl ContainerText { fn decode(&self) -> Value { - dbg!(&self.html); html::parse(&self.html) } } diff --git a/rust/pspp/src/output/spv/html.rs b/rust/pspp/src/output/spv/html.rs index ff9deb2467..baca5a5da7 100644 --- a/rust/pspp/src/output/spv/html.rs +++ b/rust/pspp/src/output/spv/html.rs @@ -22,35 +22,37 @@ fn find_element<'a>(elements: &'a [Node], name: &str) -> Option<&'a Element> { None } +fn parse_entity(s: &str) -> (char, &str) { + static ENTITIES: [(&str, char); 6] = [ + ("amp;", '&'), + ("lt;", '<'), + ("gt;", '>'), + ("apos;", '\''), + ("quot;", '"'), + ("nbsp;", '\u{00a0}'), + ]; + for (name, ch) in ENTITIES { + if let Some(rest) = s.strip_prefix(name) { + return (ch, rest); + } + } + ('&', s) +} + fn get_node_text(node: &Node, text: &mut String) { match node { Node::Text(string) => { let mut s = string.as_str(); - 'OUTER: while !s.is_empty() { + while !s.is_empty() { let amp = s.find('&').unwrap_or(s.len()); let (head, rest) = s.split_at(amp); text.push_str(head); if rest.is_empty() { break; } - - static ENTITIES: [(&str, char); 6] = [ - ("&", '&'), - ("<", '<'), - (">", '>'), - ("'", '\''), - (""", '"'), - (" ", '\u{00a0}'), - ]; - for (name, character) in ENTITIES { - if let Some(rest) = rest.strip_prefix(name) { - text.push(character); - s = rest; - continue 'OUTER; - } - } - text.push('&'); - s = &s[1..]; + let ch; + (ch, s) = parse_entity(&s[1..]); + text.push(ch); } } Node::Element(element) => get_element_text(element, text), @@ -67,14 +69,19 @@ fn get_element_text(element: &Element, text: &mut String) { fn extract_html_text(node: &Node, base_font_size: i32, s: &mut String) { match node { Node::Text(text) => { - dbg!(text); - for c in text.chars() { + let mut iter = text.chars(); + while let Some(mut c) = iter.next() { fn push_whitespace(c: char, s: &mut String) { if s.chars().next_back().is_none_or(|c| !c.is_whitespace()) { s.push(c); } } + if c == '&' { + let rest; + (c, rest) = parse_entity(iter.as_str()); + iter = rest.chars(); + } match c { '\u{00a0}' => { // U+00A0 NONBREAKING SPACE is really, really common @@ -91,6 +98,9 @@ fn extract_html_text(node: &Node, base_font_size: i32, s: &mut String) { _ if c.is_whitespace() => push_whitespace(c, s), '<' => s.push_str("<"), '>' => s.push_str(">"), + '&' => s.push_str("&"), + '\'' => s.push_str("'"), + '"' => s.push_str("""), _ => s.push(c), } } @@ -269,7 +279,6 @@ fn parse2( font_style: &mut FontStyle, ) -> Result<(), html_parser::Error> { let dom = Dom::parse(&format!("{input}"))?; - font_style.markup = true; for node in &dom.children { match node.element() { Some(head) if head.name.eq_ignore_ascii_case("head") => { @@ -310,7 +319,7 @@ pub fn parse_value(input: &str) -> Value { let mut font_style = FontStyle::default().with_size(10); let mut html = String::new(); if parse2(input, &mut html, &mut font_style).is_ok() { - Value::new_user_text(html) + Value::new_markup(html) } else { Value::new_user_text(input) } @@ -361,9 +370,8 @@ pub fn parse_paragraphs(input: &str) -> Vec { pub fn parse(input: &str) -> Value { let mut font_style = FontStyle::default().with_size(10); - let text = match Dom::parse(&format!("{input}")) { + let value = match Dom::parse(&format!("{input}")) { Ok(dom) => { - font_style.markup = true; let mut s = String::new(); for node in &dom.children { if let Node::Element(head) = node @@ -378,18 +386,15 @@ pub fn parse(input: &str) -> Value { extract_html_text(node, font_style.size, &mut s); } } - dbg!(&s); - s + Value::new_markup(s) } - _ => input.into(), + _ => Value::new_user_text(input), }; - Value::new_user_text(text).with_font_style(font_style) + value.with_font_style(font_style) } #[cfg(test)] mod tests { - use html_parser::Dom; - use crate::output::{ pivot::{FontStyle, Value}, spv::html::{parse, parse_paragraphs, parse_value}, @@ -399,12 +404,8 @@ mod tests { fn css() { assert_eq!( parse("text"), - Value::new_user_text("text").with_font_style( - FontStyle::default() - .with_size(18) - .with_bold(true) - .with_markup(true) - ) + Value::new_markup("text") + .with_font_style(FontStyle::default().with_size(18).with_bold(true)) ); } @@ -415,7 +416,7 @@ mod tests { ); assert_eq!( value, - Value::new_user_text( + Value::new_markup( r##"bold italic bold italic @@ -423,17 +424,10 @@ mod tests { big "## ) - .with_font_style(FontStyle::default().with_size(10).with_markup(true)) + .with_font_style(FontStyle::default().with_size(10)) ); } - #[test] - fn entity() { - let html = r#"Hi there!"#; - dbg!(Dom::parse(html)); - todo!() - } - #[test] fn paragraphs() { let paragraphs = parse_paragraphs( diff --git a/rust/pspp/src/output/spv/legacy_xml.rs b/rust/pspp/src/output/spv/legacy_xml.rs index 08509304f5..f44da6dfe6 100644 --- a/rust/pspp/src/output/spv/legacy_xml.rs +++ b/rust/pspp/src/output/spv/legacy_xml.rs @@ -27,7 +27,6 @@ use std::{ use chrono::{NaiveDateTime, NaiveTime}; use enum_map::{Enum, EnumMap}; use hashbrown::HashSet; -use itertools::Itertools; use ordered_float::OrderedFloat; use serde::Deserialize; @@ -42,7 +41,6 @@ use crate::{ PivotTable, RowParity, Value, ValueInner, VertAlign, }, spv::legacy_bin::DataValue, - table, }, }; @@ -365,7 +363,7 @@ impl Visualization { } for dv in take(&mut derived_variables) { - match dv.decode(&data, &series) { + match dv.decode(&series) { Ok(s) => { series.insert(&dv.id, s); } @@ -875,7 +873,6 @@ impl Visualization { // they are redundant. Ignore them. continue; }; - let dimension = &mut dims[dim_index].dimension; for index in w.include.split(';').filter_map(|s| s.parse::().ok()) { @@ -1178,11 +1175,7 @@ struct DerivedVariable { } impl DerivedVariable { - fn decode( - &self, - data: &HashMap>>, - series: &HashMap<&str, Series>, - ) -> Result { + fn decode(&self, series: &HashMap<&str, Series>) -> Result { let mut values = if self.value == "constant(0)" { let n_values = if let Some(series) = series.values().next() { series.values.len() @@ -1868,7 +1861,7 @@ impl Style { Some(SetFormatChild::NumberFormat(format)) => { Some(SignificantNumberFormat::from(format).decode()) } - Some(SetFormatChild::StringFormat(format)) => None, + Some(SetFormatChild::StringFormat(_)) => None, Some(SetFormatChild::DateTimeFormat(format)) => Some(format.decode()), Some(SetFormatChild::ElapsedTimeFormat(format)) => Some(format.decode()), None => None, @@ -1919,7 +1912,7 @@ impl Style { } if fg.is_some() || bg.is_some() { - let mut styling = value.styling_mut(); + let styling = value.styling_mut(); let font_style = styling .font_style .get_or_insert_with(|| base_style.font_style.clone()); @@ -2654,7 +2647,7 @@ impl LabelFrame { for t in labels { if let LabelChild::Text(text) = &t.child { for t in text { - if let Some(defines_reference) = t.defines_reference { + if let Some(_defines_reference) = t.defines_reference { // XXX footnote } s += &t.text; diff --git a/rust/pspp/src/output/spv/light.rs b/rust/pspp/src/output/spv/light.rs index 903e48ecef..d2f26966d0 100644 --- a/rust/pspp/src/output/spv/light.rs +++ b/rust/pspp/src/output/spv/light.rs @@ -430,7 +430,6 @@ impl Area { bold: (self.style & 1) != 0, italic: (self.style & 2) != 0, underline: self.underline, - markup: false, font: self.typeface.decode(encoding), fg: match data_row { RowParity::Even => self.fg, @@ -1400,7 +1399,6 @@ impl ValueMods { bold: font_style.bold, italic: font_style.italic, underline: font_style.underline, - markup: false, font: font_style.typeface.decode(encoding), fg: font_style.fg, bg: font_style.bg, diff --git a/rust/pspp/src/output/table.rs b/rust/pspp/src/output/table.rs index 5597ae794b..40c7c9c9f9 100644 --- a/rust/pspp/src/output/table.rs +++ b/rust/pspp/src/output/table.rs @@ -416,7 +416,6 @@ impl<'a> DrawCell<'a> { pub fn display(&self) -> DisplayValue<'a> { self.inner .display(self.value_options) - .with_font_style(&self.font_style) .with_subscripts(self.subscripts) .with_footnotes(self.footnotes) }