Skip to content

Commit

Permalink
fix(encodings): Use encoding detection confidence levels.
Browse files Browse the repository at this point in the history
If the detector is not confident, force the encoding to iso-8859-1,
which is probably what it is anyway, let's be honest here. Why is this
so hard?

This probably fixes the Spanish language text encoding bug.
  • Loading branch information
ceejbot committed Apr 6, 2024
1 parent d31940a commit 06fb195
Showing 1 changed file with 24 additions and 1 deletion.
25 changes: 24 additions & 1 deletion src/controller/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,12 @@ pub fn convert_to_utf8(bytes: Vec<u8>) -> String {
return String::new();
}

let (encoding, _confidence, _language) = chardet::detect(&bytes);
let (encoding, confidence, _language) = chardet::detect(&bytes);
let encoding = if confidence < 0.75 {
"iso-8859-1".to_string() // yeah, well.
} else {
encoding
};
if let Some(coder) = encoding_from_whatwg_label(chardet::charset2encoding(&encoding)) {
if let Ok(utf8string) = coder.decode(&bytes, DecoderTrap::Replace) {
return utf8string.to_string();
Expand Down Expand Up @@ -74,4 +79,22 @@ mod tests {
let converted = convert_to_utf8(bytes.clone());
assert_eq!(converted, utf8_version);
}

#[test]
fn windows1252_is_decoded() {
    // windows-1252 and iso-8859-1 are NOT identical in general: they differ in
    // 0x80..=0x9f. Every byte used here lies in 0xc0..=0xff, where both map a
    // byte to the code point of the same value (U+00C0..=U+00FF), so the
    // expected text is the same whichever of the two encodings is chosen.
    let bytes: Vec<u8> = (0xc0..=0xff).collect();
    let utf8_version = "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

    // Guard: the input must not already be valid UTF-8, otherwise the test
    // would not exercise any transcoding at all.
    assert!(std::str::from_utf8(&bytes).is_err());

    let converted = convert_to_utf8(bytes);

    // Compare byte lengths first: a mismatch here is a shorter failure message
    // than a diff of two 64-character strings.
    assert_eq!(converted.len(), utf8_version.len());
    assert_eq!(converted, utf8_version);
}
}

0 comments on commit 06fb195

Please sign in to comment.