Add check for unicode normalization in config (#44)
This ensures that the config sections match Wikipedia's Unicode normalization. We could also normalize every section in every article to handle an edge case where non-normalized output is inadvertently created as tags are joined, but I don't think that's worth it yet.

From <https://mediawiki.org/wiki/Unicode_normalization_considerations>:

> MediaWiki applies normalization form C (NFC) to Unicode text input.
> MediaWiki doesn't apply any normalization to its output, for example
> `cafe<nowiki/>́` becomes "café" (shows U+0065 U+0301 in a row,
> without precomposed characters like U+00E9 appearing).

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
parent 19d9f2c42a
commit cd03fed762

3 changed files with 57 additions and 8 deletions
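For illustration, a minimal sketch of the mismatch described above (not part of the commit), using the same `unicode-normalization` crate that this change adds as a dev-dependency:

```rust
use unicode_normalization::{is_nfc, UnicodeNormalization};

fn main() {
    // "café" spelled two ways: decomposed 'e' + combining acute (U+0065 U+0301),
    // and the precomposed character (U+00E9).
    let decomposed = "cafe\u{301}";
    let precomposed = "caf\u{E9}";

    // The two strings render identically but differ bytewise,
    // so a simple byte comparison treats them as distinct titles.
    assert_ne!(decomposed, precomposed);

    // MediaWiki normalizes its input to NFC, so config entries must be NFC too.
    assert!(!is_nfc(decomposed));
    assert!(is_nfc(precomposed));

    // Applying NFC reconciles the two forms.
    assert_eq!(decomposed.nfc().collect::<String>(), precomposed);
}
```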
Cargo.lock (generated, 5 changes)

```diff
@@ -719,6 +719,7 @@ dependencies = [
  "thiserror",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
  "url",
  "urlencoding",
 ]

@@ -1478,9 +1479,9 @@ checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"

 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
```
Cargo.toml (5 changes)

```diff
@@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 csv = "1.2.2"
 ego-tree = "0.6.2"
-expect-test = "1.4.1"
 html5ever = "0.26.0"
 log = "0.4.18"
 markup5ever = "0.11.0"

@@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 url = "2.3.1"
 urlencoding = "2.1.2"
 
+[dev-dependencies]
+expect-test = "1.4.1"
+unicode-normalization = "0.1.23"
+
 [profile.release]
 overflow-checks = true
 lto = true
```
src/html.rs (55 changes)

```diff
@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }
 
+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });
 
 static HEADERS: Lazy<Selector> =

@@ -203,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
 }
 
 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();
 

@@ -488,6 +502,37 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
 
+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - [super::remove_sections]
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
+
     fn expand_links(document: &mut Html) {
         let links: Vec<_> = document
             .select(&Selector::parse("a").unwrap())
```
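The new check can be run on its own with `cargo test static_config_sections_are_normalized`; on failure, the `eprintln!` output lists each offending section alongside its NFC form.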