Add check for unicode normalization in config (#44)

This ensures that the config sections match Wikipedia's Unicode
normalization. We could also normalize every section in every article to
handle an edge case where non-normalized output is inadvertently created
as tags are joined, but I don't think that's worth it yet.

From <https://mediawiki.org/wiki/Unicode_normalization_considerations>:

> MediaWiki applies normalization form C (NFC) to Unicode text input.

> MediaWiki doesn't apply any normalization to its output, for example
> `cafe<nowiki/>́` becomes "café" (shows U+0065 U+0301 in a row,
> without precomposed characters like U+00E9 appearing).
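
As an illustration (not part of this change), here is a minimal Rust sketch of that distinction using the `unicode-normalization` crate that the new test relies on; the string values are hypothetical examples:

```rust
use unicode_normalization::{is_nfc, UnicodeNormalization};

fn main() {
    // Decomposed form: 'e' followed by U+0301 COMBINING ACUTE ACCENT,
    // like MediaWiki's un-normalized `cafe<nowiki/>́` output above.
    let decomposed = "cafe\u{301}";
    assert!(!is_nfc(decomposed));

    // NFC recomposes U+0065 U+0301 into the single code point U+00E9,
    // so the two spellings of "café" are not equal bytewise.
    let composed: String = decomposed.nfc().collect();
    assert!(is_nfc(&composed));
    assert_eq!(composed, "caf\u{E9}");
    assert_ne!(decomposed, composed);
}
```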

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>

Cargo.lock (generated)

@@ -719,6 +719,7 @@ dependencies = [
  "thiserror",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
  "url",
  "urlencoding",
 ]
@@ -1478,9 +1479,9 @@ checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]

Cargo.toml

@@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 csv = "1.2.2"
 ego-tree = "0.6.2"
-expect-test = "1.4.1"
 html5ever = "0.26.0"
 log = "0.4.18"
 markup5ever = "0.11.0"
@@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 url = "2.3.1"
 urlencoding = "2.1.2"
 
+[dev-dependencies]
+expect-test = "1.4.1"
+unicode-normalization = "0.1.23"
+
 [profile.release]
 overflow-checks = true
 lto = true

src/html.rs

@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }
 
+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });
 
 static HEADERS: Lazy<Selector> =
@@ -203,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
 }
 
 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();
@@ -488,6 +502,37 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
 
+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - [super::remove_sections]
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
 }
 
 fn expand_links(document: &mut Html) {
     let links: Vec<_> = document
         .select(&Selector::parse("a").unwrap())