diff --git a/src/controller/strings.rs b/src/controller/strings.rs
index 9bc01fc0..ffab6e5f 100644
--- a/src/controller/strings.rs
+++ b/src/controller/strings.rs
@@ -33,7 +33,12 @@ pub fn convert_to_utf8(bytes: Vec<u8>) -> String {
         return String::new();
     }
 
-    let (encoding, _confidence, _language) = chardet::detect(&bytes);
+    let (encoding, confidence, _language) = chardet::detect(&bytes);
+    let encoding = if confidence < 0.75 {
+        "iso-8859-1".to_string() // detection confidence too low: assume Latin-1 instead
+    } else {
+        encoding
+    };
     if let Some(coder) = encoding_from_whatwg_label(chardet::charset2encoding(&encoding)) {
         if let Ok(utf8string) = coder.decode(&bytes, DecoderTrap::Replace) {
             return utf8string.to_string();
@@ -74,4 +79,22 @@ mod tests {
         let converted = convert_to_utf8(bytes.clone());
         assert_eq!(converted, utf8_version);
     }
+
+    #[test]
+    fn windows1252_is_decoded() {
+        // Bytes 0xC0..=0xFF decode to the same characters in windows-1252 and iso-8859-1.
+        let bytes: Vec<u8> = vec![
+            0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd,
+            0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
+            0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+            0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+            0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+        ];
+        let utf8_version =
+            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ".to_string();
+        assert!(String::from_utf8(bytes.clone()).is_err());
+        let converted = convert_to_utf8(bytes.clone());
+        assert_eq!(converted.len(), utf8_version.len());
+        assert_eq!(converted, utf8_version);
+    }
 }