Strip whitespace during parsing.
This commit is contained in:
parent
8b4684679b
commit
999a2a1fc1
10
Cargo.lock
generated
10
Cargo.lock
generated
@ -568,6 +568,7 @@ dependencies = [
|
||||
"diesel_migrations 0.99.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"dotenv 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"r2d2 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -689,6 +690,14 @@ dependencies = [
|
||||
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"either 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.3.4"
|
||||
@ -1667,6 +1676,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum hyper-tls 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9c81fa95203e2a6087242c38691a0210f23e9f3f8f944350bd676522132e2985"
|
||||
"checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d"
|
||||
"checksum iovec 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b6e8b9c2247fcf6c6a1151f1156932be5606c9fd6f55a2d7f9fc1cb29386b2f7"
|
||||
"checksum itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "947aa0b9bb417792efa3936c5dada2d680b3bc27ea6a88ffa062f4c4d86ef8c5"
|
||||
"checksum itoa 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8324a32baf01e2ae060e9de58ed0bc2320c9a2833491ee36cd3b4c414de4db8c"
|
||||
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
|
||||
"checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a"
|
||||
|
||||
@ -10,6 +10,7 @@ chrono = "0.4.0"
|
||||
derive_builder = "0.5.0"
|
||||
dotenv = "0.10.1"
|
||||
error-chain = "0.11.0"
|
||||
itertools = "0.7.4"
|
||||
lazy_static = "1.0.0"
|
||||
log = "0.3.8"
|
||||
r2d2 = "0.8.1"
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
//! A library for parsing, indexing and retrieving podcast Feeds,
|
||||
//! into and from a Database.
|
||||
|
||||
#![allow(unknown_lints)]
|
||||
#![deny(bad_style, const_err, dead_code, improper_ctypes, legacy_directory_ownership,
|
||||
non_shorthand_field_patterns, no_mangle_generic_items, overflowing_literals,
|
||||
path_statements, patterns_in_fns_without_body, plugin_as_library, private_in_public,
|
||||
@ -39,6 +40,7 @@ extern crate derive_builder;
|
||||
|
||||
extern crate ammonia;
|
||||
extern crate chrono;
|
||||
extern crate itertools;
|
||||
extern crate r2d2;
|
||||
extern crate r2d2_diesel;
|
||||
extern crate rayon;
|
||||
|
||||
@ -4,6 +4,7 @@ use rfc822_sanitizer::parse_from_rfc2822_with_fallback;
|
||||
|
||||
use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder};
|
||||
use utils::url_cleaner;
|
||||
use utils::replace_extra_spaces;
|
||||
|
||||
use errors::*;
|
||||
|
||||
@ -11,7 +12,7 @@ use errors::*;
|
||||
/// Parses a `rss::Channel` into a `NewPodcast` Struct.
|
||||
pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
|
||||
let title = chan.title().trim();
|
||||
let description = ammonia::clean(chan.description().trim());
|
||||
let description = replace_extra_spaces(&ammonia::clean(chan.description()));
|
||||
|
||||
let link = url_cleaner(chan.link());
|
||||
let x = chan.itunes_ext().map(|s| s.image());
|
||||
@ -34,7 +35,8 @@ pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
|
||||
/// Parses an `rss::Item` into a `NewEpisode` Struct.
|
||||
pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> {
|
||||
let title = item.title().map(|s| s.trim().to_owned());
|
||||
let description = item.description().map(|s| ammonia::clean(s.trim()));
|
||||
let description = item.description()
|
||||
.map(|s| replace_extra_spaces(&ammonia::clean(s)));
|
||||
let guid = item.guid().map(|s| s.value().trim().to_owned());
|
||||
|
||||
// It's kinda weird that this is an Option type.
|
||||
@ -92,7 +94,7 @@ mod tests {
|
||||
let descr = "The people behind The Intercept’s fearless reporting and incisive \
|
||||
commentary—Jeremy Scahill, Glenn Greenwald, Betsy Reed and others—discuss \
|
||||
the crucial issues of our time: national security, civil liberties, foreign \
|
||||
policy, and criminal justice. Plus interviews with artists, thinkers, and \
|
||||
policy, and criminal justice. Plus interviews with artists, thinkers, and \
|
||||
newsmakers who challenge our preconceptions about the world we live in.";
|
||||
let pd = new_podcast(&channel, 0);
|
||||
|
||||
@ -249,8 +251,10 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
i2.title(),
|
||||
Some("The Breakthrough: Behind the Scenes of Hillary Clinton’s Failed Bid for \
|
||||
President")
|
||||
Some(
|
||||
"The Breakthrough: Behind the Scenes of Hillary Clinton’s Failed Bid for \
|
||||
President"
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
i2.uri(),
|
||||
@ -298,7 +302,7 @@ mod tests {
|
||||
let descr2 = "<p>The Gnome project is about to solve one of our audience's biggest \
|
||||
Wayland’s concerns. But as the project takes on a new level of relevance, \
|
||||
decisions for the next version of Gnome have us worried about the \
|
||||
future.</p>\n\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
|
||||
future.</p>\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
|
||||
Microsoft’s sneaky move to turn Windows 10 into the “ULTIMATE LINUX \
|
||||
RUNTIME”, community news & more!</p>";
|
||||
assert_eq!(i2.title(), Some("Gnome Does it Again | LUP 213"));
|
||||
|
||||
@ -4,6 +4,7 @@ use rayon::prelude::*;
|
||||
use chrono::prelude::*;
|
||||
|
||||
use url::{Position, Url};
|
||||
use itertools::Itertools;
|
||||
|
||||
use errors::*;
|
||||
use dbqueries;
|
||||
@ -105,14 +106,20 @@ pub fn url_cleaner(s: &str) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
/// Placeholder
|
||||
// TODO: Docs
|
||||
/// Helper function that strips extra spaces and newlines, and removes all tabs.
|
||||
#[allow(match_same_arms)]
|
||||
pub fn replace_extra_spaces(s: &str) -> String {
|
||||
s.lines()
|
||||
.map(|x| x.split_whitespace().collect::<Vec<_>>().join(" "))
|
||||
.filter(|x| !x.is_empty())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
s.trim()
|
||||
.chars()
|
||||
.filter(|ch| *ch != '\t')
|
||||
.coalesce(|current, next| match (current, next) {
|
||||
('\n', '\n') => Ok('\n'),
|
||||
('\n', ' ') => Ok('\n'),
|
||||
(' ', '\n') => Ok('\n'),
|
||||
(' ', ' ') => Ok(' '),
|
||||
(_, _) => Err((current, next)),
|
||||
})
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@ -19,7 +19,7 @@ hard_tabs = false
|
||||
spaces_within_parens = false
|
||||
write_mode = "Overwrite"
|
||||
merge_derives = true
|
||||
condense_wildcard_suffixes = true
|
||||
condense_wildcard_suffixes = false
|
||||
format_strings = true
|
||||
multiline_closure_forces_block = true
|
||||
attributes_on_same_line_as_field = true
|
||||
|
||||
Loading…
Reference in New Issue
Block a user