Rewrite comments as sentences for readability
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
8435682ddf
commit
0a0317538c
4 changed files with 13 additions and 13 deletions
10
src/html.rs
10
src/html.rs
|
@ -26,11 +26,11 @@ pub fn simplify(html: &str, lang: &str) -> String {
|
|||
|
||||
let mut to_remove = Vec::new();
|
||||
|
||||
// remove sections
|
||||
// Remove configured sections and all trailing elements until the next section.
|
||||
|
||||
if let Some(bad_sections) = CONFIG.sections_to_remove.get(lang) {
|
||||
for header in document.select(&HEADERS) {
|
||||
// TODO: should this join all text nodes?
|
||||
// TODO: Should this join all text nodes?
|
||||
let Some(title) = header.text().next() else {
|
||||
continue
|
||||
};
|
||||
|
@ -38,11 +38,11 @@ pub fn simplify(html: &str, lang: &str) -> String {
|
|||
if bad_sections.contains(&title.trim()) {
|
||||
to_remove.push(header.id());
|
||||
let header_level = header.value().name();
|
||||
// strip trailing nodes
|
||||
// Strip trailing nodes.
|
||||
for sibling in header.next_siblings() {
|
||||
if let Some(element) = sibling.value().as_element() {
|
||||
if element.name() == header_level {
|
||||
// TODO: should this check for a higher level?
|
||||
// TODO: Should this check for a higher level?
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -60,7 +60,7 @@ pub fn simplify(html: &str, lang: &str) -> String {
|
|||
warn!("No sections to remove configured for lang {lang:?}");
|
||||
}
|
||||
|
||||
// remove elements with no text that isn't whitespace
|
||||
// Remove elements that contain no non-whitespace text.
|
||||
|
||||
for element in document
|
||||
.root_element()
|
||||
|
|
|
@ -93,8 +93,8 @@ fn main() -> anyhow::Result<()> {
|
|||
info!("Processing dump");
|
||||
let dump = stdin().lock();
|
||||
|
||||
// TODO: compare different deserialization methods
|
||||
// docs warn against using a reader directly, and it's slower than tar can decompress the dump
|
||||
// TODO: Compare different deserialization methods.
|
||||
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
|
||||
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
|
||||
let stream = dump.lines().map(|r| {
|
||||
r.map_err(anyhow::Error::new)
|
||||
|
|
|
@ -119,7 +119,7 @@ pub struct WikipediaTitleNorm {
|
|||
|
||||
impl WikipediaTitleNorm {
|
||||
fn normalize_title(title: &str) -> String {
|
||||
// TODO: compare with generator url creation
|
||||
// TODO: Compare with the map generator's URL creation and ensure this covers all cases.
|
||||
title.replace(' ', "_")
|
||||
}
|
||||
|
||||
|
|
|
@ -3,18 +3,18 @@ use serde::Deserialize;
|
|||
// TODO: Consolidate into a single struct.
|
||||
/// Deserialized Wikimedia Enterprise API Article
|
||||
///
|
||||
/// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
|
||||
/// For all available fields, see <https://enterprise.wikimedia.com/docs/data-dictionary/>.
|
||||
#[allow(dead_code)] // TODO: reevaluate fields
|
||||
#[derive(Deserialize)]
|
||||
pub struct Page {
|
||||
// TODO: check if CoW has a performance impact
|
||||
// TODO: Check if CoW has a performance impact.
|
||||
pub name: String,
|
||||
pub date_modified: String,
|
||||
pub in_language: Language,
|
||||
#[serde(default)]
|
||||
pub url: String,
|
||||
pub main_entity: Option<Wikidata>,
|
||||
// TODO: see what impact parsing/unescaping/allocating this has
|
||||
// TODO: See what impact parsing/unescaping/allocating this has.
|
||||
pub article_body: ArticleBody,
|
||||
#[serde(default)]
|
||||
pub redirects: Vec<Redirect>,
|
||||
|
@ -27,12 +27,12 @@ pub struct Wikidata {
|
|||
|
||||
#[derive(Deserialize)]
|
||||
pub struct ArticleBody {
|
||||
// TODO: look into RawValue to lazily parse/allocate this:
|
||||
// TODO: Look into RawValue to lazily parse/allocate this:
|
||||
// https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
|
||||
pub html: String,
|
||||
}
|
||||
|
||||
#[allow(dead_code)] // TODO: reevaluate fields
|
||||
#[allow(dead_code)] // TODO: Reevaluate fields.
|
||||
#[derive(Deserialize)]
|
||||
pub struct Redirect {
|
||||
pub url: String,
|
||||
|
|
Loading…
Add table
Reference in a new issue