handle invalid entities compatibly rust
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 10 Jan 2026 17:44:44 +0000 (09:44 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sat, 10 Jan 2026 17:48:38 +0000 (09:48 -0800)
rust/pspp/src/spv/read/html.rs
rust/pspp/src/spv/read/tests.rs
rust/pspp/src/spv/testdata/text1.expected [new file with mode: 0644]
rust/pspp/src/spv/testdata/text1.spv [new file with mode: 0644]

index 7f4b61c55bb0e10828fcbef524d575015436cc9e..161e790becb59cdc84a7bd6e19d5e708929b42d5 100644 (file)
@@ -34,8 +34,8 @@ use html_parser::{Dom, Element, Node};
 use pango::{AttrColor, AttrInt, AttrList, AttrSize, AttrString, IsAttribute};
 use quick_xml::{
     Writer as XmlWriter,
-    escape::unescape,
-    events::{BytesText, Event},
+    escape::resolve_html5_entity,
+    events::{BytesRef, BytesText, Event},
 };
 use serde::{Deserialize, Deserializer, Serialize, ser::SerializeMap};
 
@@ -777,6 +777,38 @@ fn parse_dom(dom: &Dom) -> Vec<Block> {
     blocks
 }
 
+fn unescape(mut input: &str) -> Cow<'_, str> {
+    let mut output = String::new();
+    while let Some(amp) = input.find('&') {
+        if amp > 0 {
+            output.push_str(&input[..amp]);
+        }
+        input = &input[amp + 1..];
+        if let Some(semi) = input.find([';', '&']) {
+            let entity = &input[..semi];
+            let rest = &input[semi + 1..];
+            if let Ok(Some(c)) = BytesRef::new(entity).resolve_char_ref() {
+                output.push(c);
+                input = rest;
+            } else if let Some(resolution) = resolve_html5_entity(entity) {
+                output.push_str(resolution);
+                input = rest;
+            } else {
+                output.push('&');
+            }
+        } else {
+            output.push('&');
+        }
+    }
+    if output.is_empty() {
+        dbg!(input);
+        Cow::from(input)
+    } else {
+        output.push_str(input);
+        Cow::from(output)
+    }
+}
+
 fn parse_nodes(nodes: &[Node]) -> Markup {
     // Appends `markup` to `dst`, merging text at the end of `dst` with text
     // in `markup`.
@@ -812,10 +844,7 @@ fn parse_nodes(nodes: &[Node]) -> Markup {
                 } else {
                     text
                 };
-                add_markup(
-                    &mut retval,
-                    Markup::Text(unescape(&text).unwrap_or(Cow::from(text)).into_owned()),
-                );
+                add_markup(&mut retval, Markup::Text(unescape(&text).into_owned()));
             }
             // SPSS often starts paragraphs with an initial `<BR>` that it
             // ignores, but it does honor `<br>`.  So weird.
@@ -1013,6 +1042,31 @@ mod tests {
         );
     }
 
+    /// From the corpus, demonstrating how SPSS sometimes writes `&` instead of `&amp;`.
+    #[test]
+    fn invalid_entities() {
+        let text = r##"<xml>&lt;head>&lt;style type="text/css">p{color:0;font-family:Monospaced;font-size:14pt;font-style:normal;font-weight:normal;text-decoration:none}&lt;/style>&lt;/head>&lt;BR>Stem-and-Leaf&amp;nbsp;Plot&amp;nbsp;for&lt;br>&lt;/br>Foobar=&amp;nbsp;K(+)&lt;br>&lt;/br>&lt;br>&lt;/br>&amp;nbsp;Frequency&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;Stem&amp;nbsp;&amp;&amp;nbsp;&amp;nbsp;Leaf&lt;br>&lt;/br>&lt;br>&lt;/br>&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;1.00&amp;nbsp;Extremes&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;(=&amp;lt;4)&lt;br>&lt;/br>&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;4.00&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;0&amp;nbsp;.&amp;nbsp;&amp;nbsp;6666&lt;br>&lt;/br>&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;1.00&amp;nbsp;Extremes&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;(&amp;gt;=8)&lt;br>&lt;/br>&lt;br>&lt;/br>&amp;nbsp;Stem&amp;nbsp;width:&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;10.00&lt;br>&lt;/br>&amp;nbsp;Each&amp;nbsp;leaf:&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;1&amp;nbsp;case(s)&lt;br>&lt;/br>&lt;br>&lt;/br>&lt;br>&lt;/br></xml>
+"##;
+        let content = quick_xml::de::from_str::<String>(text).unwrap();
+        assert_eq!(
+            Document::from_html(&content).to_html(),
+            r##"<p align="left"><font face="Monospaced" size="14pt">Stem-and-Leaf Plot for
+Foobar= K(+)
+
+ Frequency    Stem &amp;  Leaf
+
+     1.00 Extremes    (=&lt;4)
+     4.00        0 .  6666
+     1.00 Extremes    (&gt;=8)
+
+ Stem width:     10.00
+ Each leaf:       1 case(s)
+
+
+</font></p>"##
+        );
+    }
+
     /// From the corpus (also included in the documentation).
     #[test]
     fn header1() {
index 40fefc6f9be96728192d3e78f05825974f289dff..9b089c520ad958dc5348a33570a4ac7ea40558a7 100644 (file)
@@ -13,6 +13,12 @@ use crate::{
     spv::SpvArchive,
 };
 
+/// Tests for compatible handling of invalid HTML entity syntax.
+///
+/// Runs `test_raw_spvfile` on the `text1` test case.  Presumably this reads
+/// `testdata/text1.spv` and compares the result against
+/// `testdata/text1.expected` (TODO confirm against `test_raw_spvfile`).
+#[test]
+fn text1() {
+    test_raw_spvfile("text1", None);
+}
+
 /// Checks that reordering categories works properly.
 #[test]
 fn light1() {
diff --git a/rust/pspp/src/spv/testdata/text1.expected b/rust/pspp/src/spv/testdata/text1.expected
new file mode 100644 (file)
index 0000000..eea2994
--- /dev/null
@@ -0,0 +1,13 @@
+Something something foo sth-7 Stem-and-Leaf Plot for
+Something= S(+)
+
+ Frequency    Stem &  Leaf
+
+     1.00 Extremes    (=<4)
+     4.00        0 .  6666
+     1.00 Extremes    (>=8)
+
+ Stem width:     10.00
+ Each leaf:       1 case(s)
+
+
diff --git a/rust/pspp/src/spv/testdata/text1.spv b/rust/pspp/src/spv/testdata/text1.spv
new file mode 100644 (file)
index 0000000..cc320d9
Binary files /dev/null and b/rust/pspp/src/spv/testdata/text1.spv differ