From 7cf9815d68a18a9ad0c60b86d6cf87b819ad69c0 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Fri, 26 Dec 2025 14:54:17 -0800
Subject: [PATCH] Implement multiparagraph values.

---
Review notes (this section is ignored by `git am` and is not part of the
commit message):

This copy of the patch was rebuilt from a rendering that had lost its line
breaks, its indentation, text written inside angle brackets, and its HTML
entities.  Changes relative to what was received:

 * Line structure and indentation were restored from the +/- markers, the
   hunk line counts, and rustfmt layout.  Spacing inside lines is best
   effort, so apply with `git apply --ignore-whitespace` and compare the
   result against the repository.
 * `Vec<Self>` was restored in `into_seq` and `from_seq`, and `&nbsp;` was
   restored in the trailing context comment.
 * NOTE(review): `from_str::<String>` in the new test is a reconstruction;
   please confirm.  The HTML fixture is reproduced as received; it has
   presumably lost its XML escaping, and perhaps an enclosing element, in
   the same way, so verify it before relying on the test.
 * NOTE(review): the author's e-mail address is missing from the From:
   header and must be re-added before `git am` will accept the message.
 * `head.push_str(&tail)` became `head.push_str(tail)`, because `tail` is
   already a reference.
 * Doc comments were added to the three new helpers; hunk offsets and the
   diffstat were adjusted to match.

 rust/pspp/src/spv/read/html.rs | 91 ++++++++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 3 deletions(-)

diff --git a/rust/pspp/src/spv/read/html.rs b/rust/pspp/src/spv/read/html.rs
index 4cc4b12caf..7f4b61c55b 100644
--- a/rust/pspp/src/spv/read/html.rs
+++ b/rust/pspp/src/spv/read/html.rs
@@ -438,6 +438,53 @@ impl Markup {
             Some(results)
         }
     }
+
+    /// Appends `other` to the end of this markup, leaving `other` holding its
+    /// default value.
+    ///
+    /// If this markup ends in a [Markup::Text] and `other` starts with one, the
+    /// two runs of text are merged into a single node rather than being kept
+    /// as adjacent nodes.
+    fn append(&mut self, other: &mut Self) {
+        let mut a = take(self).into_seq();
+        let mut b = take(other).into_seq();
+        if let Some(Markup::Text(head)) = a.last_mut()
+            && let Some(Markup::Text(tail)) = b.first()
+        {
+            head.push_str(tail);
+            a.extend(b.drain(1..));
+        } else {
+            a.append(&mut b);
+        }
+        *self = Self::from_seq(a);
+    }
+
+    /// Flattens this markup into a list of nodes: empty markup yields an empty
+    /// list, a [Markup::Seq] yields its children, and any other node yields a
+    /// list that contains just that node.
+    fn into_seq(self) -> Vec<Self> {
+        if self.is_empty() {
+            Vec::new()
+        } else if let Markup::Seq(markups) = self {
+            markups
+        } else {
+            vec![self]
+        }
+    }
+
+    /// Builds markup from a list of nodes, as the inverse of
+    /// [Markup::into_seq]: an empty list yields the default markup, a single
+    /// node is returned as is, and anything longer is wrapped in [Markup::Seq].
+    fn from_seq(seq: Vec<Self>) -> Self {
+        if seq.is_empty() {
+            Self::default()
+        } else {
+            match <[Self; 1]>::try_from(seq) {
+                Ok([singleton]) => singleton,
+                Err(multiple) => Self::Seq(multiple),
+            }
+        }
+    }
 }
 
 /// A block of styled text.
@@ -547,10 +594,16 @@ impl Document {
         }
     }
 
-    /// Returns the document converted to a [Value]. If the document contains
-    /// more than one [Block], only the first one appears in the [Value].
+    /// Returns the document converted to a [Value]. If the document has
+    /// multiple [Block]s, then they are concatenated with new-lines in between.
     pub fn into_value(self) -> Value {
-        self.0.into_iter().next().unwrap_or_default().into_value()
+        let mut iter = self.0.into_iter();
+        let mut block = iter.next().unwrap_or_default();
+        for mut additional in iter {
+            block.markup.append(&mut Markup::Text(String::from("\n")));
+            block.markup.append(&mut additional.markup);
+        }
+        block.into_value()
     }
 
     /// Returns the document converted to XHTML, except that the result will not
@@ -1098,6 +1151,38 @@ NPAR TEST
         );
     }
 
+    /// From the corpus, anonymized.
+    ///
+    /// This tests treatment of multiple paragraphs in a context where we
+    /// usually expect only one.
+    #[test]
+    fn multiparagraph_value() {
+        let text = r##"<html>
+
+
+  <head>
+
+  </head>
+  <body>
+    <p style="margin-top: 0">
+      H0:There is no association between X and Y
+    </p>
+    <p style="margin-top: 0">
+      H1:There is association between X and Y
+    </p>
+  </body>
+</html>
+"##;
+        let content = quick_xml::de::from_str::<String>(text).unwrap();
+        let html = Document::from_html(&content);
+        let s = html.into_value().display(()).to_string();
+        assert_eq!(
+            s,
+            "H0:There is no association between X and Y
+H1:There is association between X and Y"
+        );
+    }
+
     /// Checks that the `escape-html` feature is enabled in [quick_xml], since
     /// we need that to resolve `&nbsp;` and other HTML entities.
     #[test]
-- 
2.30.2