Initial Html Processing #10
Cargo.lock (generated)

```diff
@@ -522,6 +522,7 @@ version = "0.0.0"
 dependencies = [
  "anyhow",
  "clap",
  "ego-tree",
  "env_logger",
  "log",
  "once_cell",
```
Cargo.toml

```diff
@@ -10,6 +10,7 @@ default-run = "om-wikiparser"
 [dependencies]
 anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 ego-tree = "0.6.2"
 env_logger = "0.10.0"
 log = "0.4.18"
 once_cell = "1.18.0"
```
README.md

```diff
@@ -35,6 +35,10 @@ As an example of usage with the map generator:
 # Transform intermediate files from generator.
 cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
 tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Enable backtraces in errors and panics.
+export RUST_BACKTRACE=1
+# Set log level to debug.
+export RUST_LOG=om_wikiparser=debug
 # Begin extraction.
 for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
 do
```
```diff
@@ -7,6 +7,11 @@ use std::io::{stdin, stdout, Read, Write};
 use om_wikiparser::html::simplify;

 fn main() -> anyhow::Result<()> {
+    env_logger::Builder::new()
+        .filter_level(log::LevelFilter::Info)
+        .parse_default_env()
+        .try_init()?;
+
     let mut input = String::new();
     stdin().read_to_string(&mut input)?;
```

It enables the logger and sets a default log level that can still be overridden by the standard `RUST_LOG` environment variable. I've run into trouble before when I do it in a different order: then the env var doesn't work, or it can filter higher levels but not enable lower ones.
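For illustration, a minimal sketch of that ordering concern, assuming env_logger 0.10's builder behavior as described in the comment above; `init_logger_broken` is a hypothetical name for the problematic order, not code from the PR:

```rust
use log::LevelFilter;

// Order used in the PR: set the fallback level first, then parse RUST_LOG,
// so entries from the environment take precedence over the default.
fn init_logger() -> Result<(), log::SetLoggerError> {
    env_logger::Builder::new()
        .filter_level(LevelFilter::Info) // default when RUST_LOG is unset
        .parse_default_env()             // RUST_LOG entries override the default
        .try_init()
}

// Reversed order: per the comment above, RUST_LOG can stop working here,
// or only restrict logging rather than enable more verbose levels.
fn init_logger_broken() -> Result<(), log::SetLoggerError> {
    env_logger::Builder::new()
        .parse_default_env()
        .filter_level(LevelFilter::Info)
        .try_init()
}
```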
src/html.rs

```diff
@@ -1,5 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};

+use ego_tree::NodeId;
 use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
 use serde::Deserialize;
```
Do we insert full URLs on every page now? That is an overhead; the webview on iOS and Android should work properly with relative URLs. Let's investigate and fix it in a separate issue later (a TODO may be good here too).

Is it possible to check whitespace without modification? E.g. use `is_whitespace`?

Maybe it makes sense to strip cross-wiki links to reduce the HTML size. As our articles are localized and mostly used offline, it can be a good idea.

I don't see any links at all in the existing HTML dumps... only formatting and header tags.

When looking at this more I found that the html does include a [`base`](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base) element set to `//lang.wikipedia.org/wiki/`, but when opened as a file in Firefox it assumes the scheme is `file:`, so they don't work. I'll remove this, and later, if we run into a similar problem with the webviews, setting the scheme once in the `base` element should handle it.
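As an aside, a minimal sketch of how one might inspect that `base` element, using the same `scraper` API the PR already uses; `base_href` is a hypothetical helper, not part of the PR:

```rust
use scraper::{Html, Selector};

// Returns the href of the document's `base` element, if any.
// `html` would be a page from the Enterprise HTML dumps.
fn base_href(html: &str) -> Option<String> {
    let document = Html::parse_document(html);
    let base = Selector::parse("base").unwrap();
    document
        .select(&base)
        .next()
        .and_then(|el| el.value().attr("href"))
        .map(str::to_owned)
}
```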
We could do `el.text().flat_map(str::chars).all(char::is_whitespace)`. I was going to say that working with characters might be less efficient, since the `Pattern`s can [work directly in UTF-8](https://doc.rust-lang.org/src/core/str/pattern.rs.html#374), but it looks like [the implementation of `trim`](https://doc.rust-lang.org/src/core/str/mod.rs.html#1867) also uses `char::is_whitespace`.

Yes, the scraper uses [the Extracts API](https://www.mediawiki.org/wiki/Extension:TextExtracts), which strips most of the non-text elements. Would you prefer we removed the links altogether?

Links may be useful in the future, especially if we somehow link wiki articles that are already embedded in the offline maps data. But for now they can be omitted; I didn't see them on Wiki pages in OM. Did you?

As `trim` actually returns a slice, your implementation is already optimal )
One thing I realized though is that `trim` still has to check the right side if it encounters non-whitespace characters from the left. `trim_left` is better, but both still need to construct the slice. I don't think the compiler can optimize that away.

I tried the three approaches with a selection of strings and found that the char iterator was fastest. Ultimately a micro-optimization, but still interesting 😉
```rust
const MIXED: &str = " \tcd\nfg ";
const NOT_WHITESPACE: &str = "abcdefgh";
const WHITESPACE: &str = " \t\n \t\t ";
```
```
running 9 tests
test chars_mixed ... bench: 3 ns/iter (+/- 0)
test chars_not_whitespace ... bench: 0 ns/iter (+/- 0)
test chars_whitespace ... bench: 7 ns/iter (+/- 1)
test trim_left_mixed ... bench: 4 ns/iter (+/- 1)
test trim_left_not_whitespace ... bench: 3 ns/iter (+/- 0)
test trim_left_whitespace ... bench: 9 ns/iter (+/- 0)
test trim_mixed ... bench: 10 ns/iter (+/- 1)
test trim_not_whitespace ... bench: 5 ns/iter (+/- 4)
test trim_whitespace ... bench: 12 ns/iter (+/- 0)
```
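For reference, a minimal sketch of what those three benchmarks might look like on the nightly test harness (only the `MIXED` case shown; `trim_left` is spelled `trim_start` in current Rust):

```rust
#![feature(test)]
extern crate test;

use test::{black_box, Bencher};

const MIXED: &str = " \tcd\nfg ";

// Approach 1: iterate over chars directly (the fastest in the results above).
#[bench]
fn chars_mixed(b: &mut Bencher) {
    b.iter(|| black_box(MIXED).chars().all(char::is_whitespace));
}

// Approach 2: trim from the left only, then check for emptiness.
#[bench]
fn trim_left_mixed(b: &mut Bencher) {
    b.iter(|| black_box(MIXED).trim_start().is_empty());
}

// Approach 3: trim both ends, which also has to scan from the right.
#[bench]
fn trim_mixed(b: &mut Bencher) {
    b.iter(|| black_box(MIXED).trim().is_empty());
}
```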
No, the current ones don't have any. I'll strip them from the output for now.

This is an important outcome of an interesting project: to learn something new )
```diff
@@ -51,34 +52,65 @@ pub fn simplify(html: &str, lang: &str) -> String {
             }
         }
-        for id in to_remove.drain(..) {
-            if let Some(mut node) = document.tree.get_mut(id) {
-                node.detach();
-            }
-        }
+        remove_ids(&mut document, to_remove.drain(..));
     } else {
         warn!("No sections to remove configured for lang {lang:?}");
     }

     // Remove elements with no text that isn't whitespace.
-    for element in document
+    for el in document
         .root_element()
         .descendants()
         .filter_map(ElementRef::wrap)
     {
-        if element.text().all(|t| t.trim().is_empty()) {
-            to_remove.push(element.id());
+        if is_image(&el) || is_empty_or_whitespace(&el) {
+            to_remove.push(el.id());
         }
     }
+    remove_ids(&mut document, to_remove.drain(..));

-    for id in to_remove.drain(..) {
-        if let Some(mut node) = document.tree.get_mut(id) {
-            node.detach();
-        }
-    }
+    remove_links(&mut document);

     document.html()
 }

+fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
+    for id in ids {
+        if let Some(mut node) = document.tree.get_mut(id) {
+            node.detach();
+        }
+    }
+}
+
+fn is_empty_or_whitespace(el: &ElementRef) -> bool {
+    el.text().flat_map(str::chars).all(char::is_whitespace)
+}
+
+fn is_image(el: &ElementRef) -> bool {
+    ["img", "picture"].contains(&el.value().name())
+}
+
+/// Remove all links, preserving any inner elements/text.
+fn remove_links(document: &mut Html) {
+    let links: Vec<_> = document
+        .select(&Selector::parse("a").unwrap())
+        .map(|el| el.id())
+        .collect();
+
+    for id in links {
+        let Some(mut node) = document.tree.get_mut(id) else { continue };
+        if node.parent().is_none() {
+            continue;
+        }
+
+        // Reparent children to the same location as the link node.
+        while let Some(mut child) = node.first_child() {
+            let child_id = child.id();
+            child.detach();
+            node.insert_id_before(child_id);
+        }
+
+        node.detach();
+    }
+}
+
 #[cfg(test)]
```
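The reparenting loop in `remove_links` is the subtle part. Here is a self-contained sketch of the same detach-and-splice pattern on a bare `ego_tree::Tree`, assuming ego-tree 0.6's `NodeMut` API; the element names are made up for illustration:

```rust
use ego_tree::tree;

fn main() {
    // "a" plays the role of a link element wrapping "b" and "c".
    let mut tree = tree!("root" => { "a" => { "b", "c" }, "d" });
    let a = tree.root().first_child().unwrap().id();

    let mut node = tree.get_mut(a).unwrap();
    // Hoist each child out of "a", inserting it as a sibling before "a"...
    while let Some(mut child) = node.first_child() {
        let id = child.id();
        child.detach();
        node.insert_id_before(id);
    }
    // ...then drop the now-empty "a" itself.
    node.detach();

    let children: Vec<_> = tree.root().children().map(|n| *n.value()).collect();
    assert_eq!(children, ["b", "c", "d"]);
}
```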
```diff
@@ -89,4 +121,50 @@ mod test {
     fn static_config_parses() {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
+
+    #[test]
+    fn remove_links() {
+        let html = r#"
+            <p> Some text that includes
+                <a href="Some_Page"><span id="inner-content">several</span></a>
+                <a id="second-link" href="./Another_Page">relative links</a>
+                and
+                <a href="https://example.com/page">an absolute link</a>
+                .
+            </p>
+        "#;
+
+        let anchors = Selector::parse("a").unwrap();
+        let inner_element = Selector::parse("#inner-content").unwrap();
+        let second_link = Selector::parse("#second-link").unwrap();
+
+        let mut document = Html::parse_fragment(html);
+        let links: Vec<_> = document
+            .select(&anchors)
+            .filter_map(|el| el.value().attr("href"))
+            .collect();
+
+        eprintln!("{}", document.html());
+
+        assert_eq!(
+            vec!["Some_Page", "./Another_Page", "https://example.com/page"],
+            links,
+            "Links in original html are not expected."
+        );
+
+        // Detach one of the links from the root tree (as if previously deleted)
+        // to ensure it handles orphan nodes nicely.
+        let link = document.select(&second_link).next().unwrap().id();
+        document.tree.get_mut(link).unwrap().detach();
+
+        super::remove_links(&mut document);
+
+        let links: Vec<_> = document.select(&anchors).collect();
+
+        assert!(links.is_empty(), "All links should be removed.");
+
+        assert!(
+            document.select(&inner_element).next().is_some(),
+            "Link inner elements should be preserved."
+        );
+    }
 }
```
```diff
@@ -138,6 +138,8 @@ fn write(
 }

 fn main() -> anyhow::Result<()> {
+    // Use info level by default, load overrides from `RUST_LOG` env variable.
+    // See https://docs.rs/env_logger/latest/env_logger/index.html#example
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
         .parse_default_env()
```
What does it do?