use pango::{AttrColor, AttrInt, AttrList, AttrSize, AttrString, IsAttribute};
use quick_xml::{
Writer as XmlWriter,
- escape::unescape,
- events::{BytesText, Event},
+ escape::resolve_html5_entity,
+ events::{BytesRef, BytesText, Event},
};
use serde::{Deserialize, Deserializer, Serialize, ser::SerializeMap};
blocks
}
+/// Replaces character and entity references in `input` with the text they
+/// stand for, without ever failing.
+///
+/// Each `&...;` sequence is first tried as a character reference via
+/// [`BytesRef::resolve_char_ref`] and then as a named entity via
+/// [`resolve_html5_entity`].  Anything that cannot be resolved is copied
+/// through literally, including the `&` itself, so malformed input (such as
+/// a bare `&` that should have been written `&amp;`) is preserved rather than
+/// rejected.
+///
+/// A reference is only recognized when it is terminated by `;`.  If another
+/// `&` appears before the `;`, the first `&` is treated as literal text and
+/// scanning resumes at the second one, so no input character is ever dropped.
+///
+/// Returns `Cow::Borrowed` when `input` contains no `&` at all, and an owned
+/// string otherwise.
+fn unescape(mut input: &str) -> Cow<'_, str> {
+    // Fast path: without any `&` there is nothing to resolve, so avoid
+    // allocating.
+    if !input.contains('&') {
+        return Cow::Borrowed(input);
+    }
+
+    let mut output = String::with_capacity(input.len());
+    while let Some(amp) = input.find('&') {
+        output.push_str(&input[..amp]);
+        input = &input[amp + 1..];
+
+        // Look for the end of the reference.  Stopping at `&` as well as `;`
+        // keeps a stray `&` from pairing with a `;` that belongs to a later
+        // reference, but only a `;` actually terminates a reference.
+        let reference = match input.find([';', '&']) {
+            Some(end) if input.as_bytes()[end] == b';' => {
+                Some((&input[..end], &input[end + 1..]))
+            }
+            _ => None,
+        };
+
+        match reference {
+            Some((entity, rest)) => {
+                if let Ok(Some(c)) = BytesRef::new(entity).resolve_char_ref() {
+                    output.push(c);
+                    input = rest;
+                } else if let Some(resolution) = resolve_html5_entity(entity) {
+                    output.push_str(resolution);
+                    input = rest;
+                } else {
+                    // Unknown reference: keep the `&` and let the loop copy
+                    // the remaining text through unchanged.
+                    output.push('&');
+                }
+            }
+            // No terminating `;` before the next `&` (or at all): the `&` is
+            // literal text.
+            None => output.push('&'),
+        }
+    }
+    output.push_str(input);
+    Cow::Owned(output)
+}
+
fn parse_nodes(nodes: &[Node]) -> Markup {
// Appends `markup` to `dst`, merging text at the end of `dst` with text
// in `markup`.
} else {
text
};
- add_markup(
- &mut retval,
- Markup::Text(unescape(&text).unwrap_or(Cow::from(text)).into_owned()),
- );
+ add_markup(&mut retval, Markup::Text(unescape(&text).into_owned()));
}
// SPSS often starts paragraphs with an initial `<BR>` that it
// ignores, but it does honor `<br>`. So weird.
);
}
+ /// From the corpus, demonstrating how SPSS sometimes writes `&` instead of `&amp;`.
+ #[test]
+ fn invalid_entities() {
+ let text = r##"<xml><head><style type="text/css">p{color:0;font-family:Monospaced;font-size:14pt;font-style:normal;font-weight:normal;text-decoration:none}</style></head><BR>Stem-and-Leaf&nbsp;Plot&nbsp;for<br></br>Foobar=&nbsp;K(+)<br></br><br></br>&nbsp;Frequency&nbsp;&nbsp;&nbsp;&nbsp;Stem&nbsp;&&nbsp;&nbsp;Leaf<br></br><br></br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1.00&nbsp;Extremes&nbsp;&nbsp;&nbsp;&nbsp;(=&lt;4)<br></br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4.00&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;0&nbsp;.&nbsp;&nbsp;6666<br></br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1.00&nbsp;Extremes&nbsp;&nbsp;&nbsp;&nbsp;(&gt;=8)<br></br><br></br>&nbsp;Stem&nbsp;width:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;10.00<br></br>&nbsp;Each&nbsp;leaf:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;case(s)<br></br><br></br><br></br></xml>
+"##;
+ let content = quick_xml::de::from_str::<String>(text).unwrap();
+ assert_eq!(
+ Document::from_html(&content).to_html(),
+ r##"<p align="left"><font face="Monospaced" size="14pt">Stem-and-Leaf Plot for
+Foobar= K(+)
+
+ Frequency Stem & Leaf
+
+ 1.00 Extremes (=<4)
+ 4.00 0 . 6666
+ 1.00 Extremes (>=8)
+
+ Stem width: 10.00
+ Each leaf: 1 case(s)
+
+
+</font></p>"##
+ );
+ }
+
/// From the corpus (also included in the documentation).
#[test]
fn header1() {