Remove pretty-printing

Whitespace behavior is different between Html::html and this
half-working pretty printer. Now the tests match the parser output
exactly.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2024-07-08 13:40:04 -04:00 committed by Evan Lloyd New-Schmidt
parent f2692d2ede
commit 3579410659
5 changed files with 32 additions and 314 deletions

View file

@ -23,8 +23,6 @@ use once_cell::sync::Lazy;
use scraper::{ElementRef, Html, Node, Selector};
use serde::Deserialize;
mod pretty;
pub use pretty::pretty_print;
use url::Url;
#[derive(Debug, Deserialize)]

View file

@ -1,128 +0,0 @@
// Based on the implementation from `htmlq`: https://github.com/mgdm/htmlq/blob/6e31bc814332b2521f0316d0ed9bf0a1c521b6e6/src/pretty_print.rs
// Available under the MIT License.
// Copyright (c) 2019 Michael Maclean
use std::{
collections::HashSet,
io::{self, Write},
str,
};
use html5ever::{
serialize::{HtmlSerializer, Serialize, SerializeOpts, Serializer, TraversalScope},
QualName,
};
use markup5ever::serialize::AttrRef;
use once_cell::sync::Lazy;
use scraper::Html;
/// Serialize `html` into a `String` by driving it through a [`PrettyPrint`]
/// serializer that wraps html5ever's `HtmlSerializer`.
///
/// # Panics
///
/// Panics if serialization returns an error or if the produced bytes are not
/// valid UTF-8.
pub fn pretty_print(html: &Html) -> String {
    let mut buffer: Vec<u8> = Vec::new();

    // Keep the serializer (and its mutable borrow of `buffer`) in its own
    // scope so the buffer is free to be read once serialization is done.
    {
        let opts = SerializeOpts {
            traversal_scope: TraversalScope::IncludeNode,
            ..Default::default()
        };
        let mut printer = PrettyPrint {
            inner: HtmlSerializer::new(&mut buffer, opts),
            indent: 0,
            previous_was_block: false,
            // Nothing has been written yet; `start_elem` clears this flag.
            at_beginning: true,
        };
        Serialize::serialize(html, &mut printer, TraversalScope::IncludeNode).unwrap();
    }

    let text = str::from_utf8(&buffer).unwrap();
    text.to_owned()
}
/// Tag names that are kept on the current line instead of being broken onto
/// their own indented line. Consulted through [`is_inline`].
static INLINE_ELEMENTS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    const NAMES: &[&str] = &[
        "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "button", "canvas", "cite",
        "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img", "input",
        "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output", "picture",
        "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small", "span",
        "strong", "sub", "sup", "svg", "template", "textarea", "time", "u", "tt", "var", "video",
        "wbr",
    ];
    NAMES.iter().copied().collect()
});
fn is_inline(name: &str) -> bool {
INLINE_ELEMENTS.contains(name)
}
/// Serializer wrapper that writes extra newlines and indentation around the
/// output of an inner `HtmlSerializer`.
struct PrettyPrint<W>
where
    W: Write,
{
    /// Number of spaces written after each inserted newline. Raised by 2 in
    /// `start_elem` and lowered by 2 in `end_elem`.
    indent: usize,
    /// Set to `true` after a non-inline element is closed and to `false`
    /// after an inline element is closed or non-blank text is written. The
    /// first `start_elem` call sets it to whether that element is non-inline.
    previous_was_block: bool,
    /// The wrapped serializer that produces the actual markup.
    inner: HtmlSerializer<W>,
    /// `true` until the first `start_elem` call; suppresses the line break
    /// before the first element.
    at_beginning: bool,
}
impl<W: Write> PrettyPrint<W> {
    /// Write a newline followed by `self.indent` spaces directly to the
    /// underlying writer, bypassing the inner serializer.
    fn break_line(&mut self) -> io::Result<()> {
        self.inner.writer.write_all(b"\n")?;
        self.inner.writer.write_all(&vec![b' '; self.indent])
    }
}

/// Forwards every event to the inner serializer, inserting line breaks and
/// indentation around elements and text; whitespace-only text is dropped.
impl<W: Write> Serializer for PrettyPrint<W> {
    /// Opens an element: optionally breaks the line first, increases the
    /// indent by 2, and emits the start tag with its attributes sorted.
    fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()>
    where
        AttrIter: Iterator<Item = AttrRef<'a>>,
    {
        // Make attribute order deterministic.
        let mut sorted_attrs: Vec<_> = attrs.collect();
        sorted_attrs.sort();

        let inline_elem = is_inline(&name.local);

        // The very first element never gets a leading break. After that, a
        // break is written before non-inline elements and before any element
        // while `previous_was_block` is set.
        let needs_break = !self.at_beginning && (self.previous_was_block || !inline_elem);
        if needs_break {
            self.break_line()?;
        }

        self.indent += 2;
        self.inner.start_elem(name, sorted_attrs.into_iter())?;

        // `previous_was_block` is only updated here for the first element.
        if self.at_beginning {
            self.at_beginning = false;
            self.previous_was_block = !inline_elem;
        }
        Ok(())
    }

    /// Closes an element: decreases the indent by 2 and, for non-inline
    /// elements, puts the end tag on its own indented line.
    fn end_elem(&mut self, name: QualName) -> io::Result<()> {
        self.indent -= 2;

        let closes_block = !is_inline(&name.local);
        if closes_block {
            self.break_line()?;
        }
        self.previous_was_block = closes_block;

        self.inner.end_elem(name)
    }

    /// Writes text, skipping it entirely when it is empty after trimming.
    /// A line break is written first when `previous_was_block` is set.
    fn write_text(&mut self, text: &str) -> io::Result<()> {
        if text.trim().is_empty() {
            return Ok(());
        }

        if self.previous_was_block {
            self.break_line()?;
        }
        self.previous_was_block = false;

        self.inner.write_text(text)
    }

    /// Passes the comment straight through to the inner serializer.
    fn write_comment(&mut self, text: &str) -> io::Result<()> {
        self.inner.write_comment(text)
    }

    /// Passes the doctype straight through to the inner serializer.
    fn write_doctype(&mut self, name: &str) -> io::Result<()> {
        self.inner.write_doctype(name)
    }

    /// Passes the processing instruction straight through to the inner
    /// serializer.
    fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> {
        self.inner.write_processing_instruction(target, data)
    }
}

File diff suppressed because one or more lines are too long

View file

@ -1,93 +1,18 @@
<p>
The <b>Crimean Mountains</b> (Crimean Tatar: <span lang="crh">Qırım dağları</span>; Ukrainian: <span lang="uk">Кримські гори</span>; Russian: <span lang="ru">Крымские горы</span>; Turkish: <i lang="tr">Yayla Dağları</i>) or <b>Yayla Mountains</b> are a range of mountains running parallel to the south-eastern coast of Crimea, between about 813 kilometers (58 miles) from the sea. Toward the west, the mountains drop steeply to the Black Sea, and to the east, they change slowly into a steppe landscape.
</p>
<p>
The Crimean Mountains consist of three subranges. The highest is the Main Range, which is subdivided into several yaylas or mountain plateaus (<i>yayla</i> or <i>yaylak</i> is Turkic for "alpine meadow"). They are:
</p>
<ul>
<li>
Baydar yayla
</li>
<li>
Ai-Petri yayla
</li>
<li>
Yalta yayla
</li>
<li>
Nikita yayla
</li>
<li>
Hurzuf yayla
</li>
<li>
Babugan yayla
</li>
<li>
Chatyr-Dag yayla
</li>
<li>
Dologorukovskaya (Subatkan) yayla
</li>
<li>
Demirci yayla
</li>
<li>
Qarabiy yayla
</li>
</ul>
<h2>
Highest peaks
</h2>
<p>
The Crimea's highest peak is the Roman-Kosh (Ukrainian: <span lang="uk">Роман-Кош</span>; Russian: <span lang="ru">Роман-Кош</span>, Crimean Tatar: <span lang="crh">Roman Qoş</span>) on the Babugan Yayla at 1,545 metres (5,069ft). Other important peaks over 1,200 metres include:
</p>
<ul>
<li>
Demir-Kapu (Ukrainian: <span lang="uk">Демір-Капу</span>, Russian: <span lang="ru">Демир-Капу</span>, Crimean Tatar: <span lang="crh">Demir Qapı</span>) 1,540 m in the Babugan Yayla;
</li>
<li>
Zeytin-Kosh (Ukrainian: <span lang="uk">Зейтин-Кош</span>; Russian: <span lang="ru">Зейтин-Кош</span>, Crimean Tatar: <span lang="crh">Zeytün Qoş</span>) 1,537 m in the Babugan Yayla;
</li>
<li>
Kemal-Egerek (Ukrainian: <span lang="uk">Кемаль-Егерек</span>, Russian: <span lang="ru">Кемаль-Эгерек</span>, Crimean Tatar: <span lang="crh">Kemal Egerek</span>) 1,529 m in the Babugan Yayla;
</li>
<li>
Eklizi-Burun (Ukrainian: <span lang="uk">Еклізі-Бурун</span>, Russian: <span lang="ru">Эклизи-Бурун</span>, Crimean Tatar: <span lang="crh">Eklizi Burun</span>) 1,527 m in the Chatyrdag Yayla;
</li>
<li>
Lapata (Ukrainian: <span lang="uk">Лапата</span>; Russian: <span lang="ru">Лапата</span>, Crimean Tatar: <span lang="crh">Lapata</span>) 1,406 m in the Yaltynska Yayla, Yalta Yaylası;
</li>
<li>
Northern Demirji (Ukrainian: <span lang="uk">Північний Демірджі</span>, Russian: <span lang="ru">Северный Демирджи</span>, Crimean Tatar: <span lang="crh">Şimaliy Demirci</span>) 1,356 m in the Demirci Yayla;
</li>
<li>
Ai-Petri (Ukrainian: <span lang="uk">Ай-Петрі</span>, Russian: <span lang="ru">Ай-Петри</span>, Crimean Tatar: <span lang="crh">Ay Petri</span>) 1,234 m in the Ay Petri Yaylası.
</li>
</ul>
<h2>
Passes and rivers
</h2>
<p>
The passes over the Crimean Mountains are: (from east to west)
</p>
<ul>
<li>
Angarskyi Pass (752m) near Perevalne, on a road from Alushta to Simferopol
</li>
<li>
Baydar Gate (503m) near Foros, connecting Baydar Valley and the sea coast
</li>
<li>
Laspi Pass (350m) near Cape Aya, on a road from Yalta to Sevastopol.
</li>
</ul>
<p>
Rivers of the Crimean Mountains include the Alma River, Chernaya River, and Salhir River on the northern slope and Uchan-su River on the southern slope which forms the Uchan-su waterfall, and the highest waterfall in Crimea.
</p>
<h2>
History
</h2>
<p>
Archaeologists have found the earliest anatomically modern humans in Europe in the Crimean Mountains' Buran-Kaya caves. The fossils are 32,000 years old, with the artifacts linked to the Gravettian culture. The fossils have cut marks suggesting a post-mortem defleshing ritual.
</p>
<p>The <b>Crimean Mountains</b> (Crimean Tatar: <span lang="crh">Qırım dağları</span>; Ukrainian: <span lang="uk">Кримські гори</span>; Russian: <span lang="ru">Крымские горы</span>; Turkish: <i lang="tr">Yayla Dağları</i>) or <b>Yayla Mountains</b> are a range of mountains running parallel to the south-eastern coast of Crimea, between about 813 kilometers (58 miles) from the sea. Toward the west, the mountains drop steeply to the Black Sea, and to the east, they change slowly into a steppe landscape.</p><p>The Crimean Mountains consist of three subranges. The highest is the Main Range, which is subdivided into several yaylas or mountain plateaus (<i>yayla</i> or <i>yaylak</i> is Turkic for "alpine meadow"). They are:</p><ul><li>Baydar yayla</li>
<li>Ai-Petri yayla</li>
<li>Yalta yayla</li>
<li>Nikita yayla</li>
<li>Hurzuf yayla</li>
<li>Babugan yayla</li>
<li>Chatyr-Dag yayla</li>
<li>Dologorukovskaya (Subatkan) yayla</li>
<li>Demirci yayla</li>
<li>Qarabiy yayla</li></ul><h2>Highest peaks</h2><p>The Crimea's highest peak is the Roman-Kosh (Ukrainian: <span lang="uk">Роман-Кош</span>; Russian: <span lang="ru">Роман-Кош</span>, Crimean Tatar: <span lang="crh">Roman Qoş</span>) on the Babugan Yayla at 1,545 metres (5,069ft). Other important peaks over 1,200 metres include:</p><ul><li>Demir-Kapu (Ukrainian: <span lang="uk">Демір-Капу</span>, Russian: <span lang="ru">Демир-Капу</span>, Crimean Tatar: <span lang="crh">Demir Qapı</span>) 1,540 m in the Babugan Yayla;</li>
<li>Zeytin-Kosh (Ukrainian: <span lang="uk">Зейтин-Кош</span>; Russian: <span lang="ru">Зейтин-Кош</span>, Crimean Tatar: <span lang="crh">Zeytün Qoş</span>) 1,537 m in the Babugan Yayla;</li>
<li>Kemal-Egerek (Ukrainian: <span lang="uk">Кемаль-Егерек</span>, Russian: <span lang="ru">Кемаль-Эгерек</span>, Crimean Tatar: <span lang="crh">Kemal Egerek</span>) 1,529 m in the Babugan Yayla;</li>
<li>Eklizi-Burun (Ukrainian: <span lang="uk">Еклізі-Бурун</span>, Russian: <span lang="ru">Эклизи-Бурун</span>, Crimean Tatar: <span lang="crh">Eklizi Burun</span>) 1,527 m in the Chatyrdag Yayla;</li>
<li>Lapata (Ukrainian: <span lang="uk">Лапата</span>; Russian: <span lang="ru">Лапата</span>, Crimean Tatar: <span lang="crh">Lapata</span>) 1,406 m in the Yaltynska Yayla, Yalta Yaylası;</li>
<li>Northern Demirji (Ukrainian: <span lang="uk">Північний Демірджі</span>, Russian: <span lang="ru">Северный Демирджи</span>, Crimean Tatar: <span lang="crh">Şimaliy Demirci</span>) 1,356 m in the Demirci Yayla;</li>
<li>Ai-Petri (Ukrainian: <span lang="uk">Ай-Петрі</span>, Russian: <span lang="ru">Ай-Петри</span>, Crimean Tatar: <span lang="crh">Ay Petri</span>) 1,234 m in the Ay Petri Yaylası.</li></ul><h2>Passes and rivers</h2><p>The passes over the Crimean Mountains are: (from east to west)</p><ul><li>Angarskyi Pass (752m) near Perevalne, on a road from Alushta to Simferopol</li>
<li>Baydar Gate (503m) near Foros, connecting Baydar Valley and the sea coast</li>
<li>Laspi Pass (350m) near Cape Aya, on a road from Yalta to Sevastopol.</li></ul><p>Rivers of the Crimean Mountains include the Alma River, Chernaya River, and Salhir River on the northern slope and Uchan-su River on the southern slope which forms the Uchan-su waterfall, and the highest waterfall in Crimea.</p><h2>History</h2><p>Archaeologists have found the earliest anatomically modern humans in Europe in the Crimean Mountains' Buran-Kaya caves. The fossils are 32,000 years old, with the artifacts linked to the Gravettian culture. The fossils have cut marks suggesting a post-mortem defleshing ritual.</p>

View file

@ -3,7 +3,7 @@
//! To update the expected output, run the test again with the env variable
//! `UPDATE_EXPECT=1` set.
//! See https://docs.rs/expect-test/ for more information.
use om_wikiparser::html::{detect_lang, pretty_print, process, process_str, HtmlError};
use om_wikiparser::html::{detect_lang, process, process_str, HtmlError};
use expect_test::{expect_file, ExpectFile};
use scraper::Html;
@ -12,7 +12,7 @@ fn check(input: &str, expect: ExpectFile) {
let html = Html::parse_document(input);
let lang = detect_lang(&html).unwrap();
let html = process(html, &lang).unwrap();
let processed = pretty_print(&html);
let processed = html.html();
expect.assert_eq(&processed);
}