Rewrite comments as sentences for readability

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-06-23 11:30:55 -04:00 committed by Evan Lloyd New-Schmidt
parent 8435682ddf
commit 0a0317538c
4 changed files with 13 additions and 13 deletions

View file

@ -26,11 +26,11 @@ pub fn simplify(html: &str, lang: &str) -> String {
let mut to_remove = Vec::new();
// remove sections
// Remove configured sections and all trailing elements until next section.
if let Some(bad_sections) = CONFIG.sections_to_remove.get(lang) {
for header in document.select(&HEADERS) {
// TODO: should this join all text nodes?
// TODO: Should this join all text nodes?
let Some(title) = header.text().next() else {
continue
};
@ -38,11 +38,11 @@ pub fn simplify(html: &str, lang: &str) -> String {
if bad_sections.contains(&title.trim()) {
to_remove.push(header.id());
let header_level = header.value().name();
// strip trailing nodes
// Strip trailing nodes.
for sibling in header.next_siblings() {
if let Some(element) = sibling.value().as_element() {
if element.name() == header_level {
// TODO: should this check for a higher level?
// TODO: Should this check for a higher level?
break;
}
}
@ -60,7 +60,7 @@ pub fn simplify(html: &str, lang: &str) -> String {
warn!("No sections to remove configured for lang {lang:?}");
}
// remove elements with no text that isn't whitespace
// Remove elements that contain no non-whitespace text.
for element in document
.root_element()

View file

@ -93,8 +93,8 @@ fn main() -> anyhow::Result<()> {
info!("Processing dump");
let dump = stdin().lock();
// TODO: compare different deserialization methods
// docs warn against using a reader directly, and it's slower than tar can decompress the dump
// TODO: Compare different deserialization methods.
// The docs warn against using a reader directly, and deserializing that way is slower than the rate at which tar can decompress the dump.
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
let stream = dump.lines().map(|r| {
r.map_err(anyhow::Error::new)

View file

@ -119,7 +119,7 @@ pub struct WikipediaTitleNorm {
impl WikipediaTitleNorm {
fn normalize_title(title: &str) -> String {
// TODO: compare with generator url creation
// TODO: Compare with map generator url creation; ensure it covers all cases.
title.replace(' ', "_")
}

View file

@ -3,18 +3,18 @@ use serde::Deserialize;
// TODO: consolidate into single struct
/// Deserialized Wikimedia Enterprise API Article
///
/// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
/// For all available fields, see <https://enterprise.wikimedia.com/docs/data-dictionary/>.
#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Page {
// TODO: check if CoW has a performance impact
// TODO: Check if CoW has a performance impact.
pub name: String,
pub date_modified: String,
pub in_language: Language,
#[serde(default)]
pub url: String,
pub main_entity: Option<Wikidata>,
// TODO: see what impact parsing/unescaping/allocating this has
// TODO: See what impact parsing/unescaping/allocating this has.
pub article_body: ArticleBody,
#[serde(default)]
pub redirects: Vec<Redirect>,
@ -27,12 +27,12 @@ pub struct Wikidata {
#[derive(Deserialize)]
pub struct ArticleBody {
// TODO: look into RawValue to lazily parse/allocate this:
// TODO: Look into RawValue to lazily parse/allocate this:
// https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
pub html: String,
}
#[allow(dead_code)] // TODO: reevaluate fields
#[allow(dead_code)] // TODO: Reevaluate fields.
#[derive(Deserialize)]
pub struct Redirect {
pub url: String,