Strip whitespace during parsing.

This commit is contained in:
Jordan Petridis 2017-12-09 10:22:09 +02:00
parent 8b4684679b
commit 999a2a1fc1
No known key found for this signature in database
GPG Key ID: CEABAD9F5683B9A6
6 changed files with 38 additions and 14 deletions

10
Cargo.lock generated
View File

@ -568,6 +568,7 @@ dependencies = [
"diesel_migrations 0.99.0 (registry+https://github.com/rust-lang/crates.io-index)",
"dotenv 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"r2d2 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -689,6 +690,14 @@ dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "itertools"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"either 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "itoa"
version = "0.3.4"
@ -1667,6 +1676,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum hyper-tls 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9c81fa95203e2a6087242c38691a0210f23e9f3f8f944350bd676522132e2985"
"checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d"
"checksum iovec 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b6e8b9c2247fcf6c6a1151f1156932be5606c9fd6f55a2d7f9fc1cb29386b2f7"
"checksum itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "947aa0b9bb417792efa3936c5dada2d680b3bc27ea6a88ffa062f4c4d86ef8c5"
"checksum itoa 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8324a32baf01e2ae060e9de58ed0bc2320c9a2833491ee36cd3b4c414de4db8c"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a"

View File

@ -10,6 +10,7 @@ chrono = "0.4.0"
derive_builder = "0.5.0"
dotenv = "0.10.1"
error-chain = "0.11.0"
itertools = "0.7.4"
lazy_static = "1.0.0"
log = "0.3.8"
r2d2 = "0.8.1"

View File

@ -10,6 +10,7 @@
//! A library for parsing, indexing and retrieving podcast Feeds,
//! into and from a Database.
#![allow(unknown_lints)]
#![deny(bad_style, const_err, dead_code, improper_ctypes, legacy_directory_ownership,
non_shorthand_field_patterns, no_mangle_generic_items, overflowing_literals,
path_statements, patterns_in_fns_without_body, plugin_as_library, private_in_public,
@ -39,6 +40,7 @@ extern crate derive_builder;
extern crate ammonia;
extern crate chrono;
extern crate itertools;
extern crate r2d2;
extern crate r2d2_diesel;
extern crate rayon;

View File

@ -4,6 +4,7 @@ use rfc822_sanitizer::parse_from_rfc2822_with_fallback;
use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder};
use utils::url_cleaner;
use utils::replace_extra_spaces;
use errors::*;
@ -11,7 +12,7 @@ use errors::*;
/// Parses a `rss::Channel` into a `NewPodcast` Struct.
pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
let title = chan.title().trim();
let description = ammonia::clean(chan.description().trim());
let description = replace_extra_spaces(&ammonia::clean(chan.description()));
let link = url_cleaner(chan.link());
let x = chan.itunes_ext().map(|s| s.image());
@ -34,7 +35,8 @@ pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
/// Parses an `rss::Item` into a `NewEpisode` Struct.
pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> {
let title = item.title().map(|s| s.trim().to_owned());
let description = item.description().map(|s| ammonia::clean(s.trim()));
let description = item.description()
.map(|s| replace_extra_spaces(&ammonia::clean(s)));
let guid = item.guid().map(|s| s.value().trim().to_owned());
// It's kinda weird, this being an Option type.
@ -92,7 +94,7 @@ mod tests {
let descr = "The people behind The Intercepts fearless reporting and incisive \
commentaryJeremy Scahill, Glenn Greenwald, Betsy Reed and othersdiscuss \
the crucial issues of our time: national security, civil liberties, foreign \
policy, and criminal justice. Plus interviews with artists, thinkers, and \
policy, and criminal justice. Plus interviews with artists, thinkers, and \
newsmakers who challenge our preconceptions about the world we live in.";
let pd = new_podcast(&channel, 0);
@ -249,8 +251,10 @@ mod tests {
assert_eq!(
i2.title(),
Some("The Breakthrough: Behind the Scenes of Hillary Clintons Failed Bid for \
President")
Some(
"The Breakthrough: Behind the Scenes of Hillary Clintons Failed Bid for \
President"
)
);
assert_eq!(
i2.uri(),
@ -298,7 +302,7 @@ mod tests {
let descr2 = "<p>The Gnome project is about to solve one of our audience's biggest \
Waylands concerns. But as the project takes on a new level of relevance, \
decisions for the next version of Gnome have us worried about the \
future.</p>\n\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
future.</p>\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
Microsofts sneaky move to turn Windows 10 into the ULTIMATE LINUX \
RUNTIME, community news &amp; more!</p>";
assert_eq!(i2.title(), Some("Gnome Does it Again | LUP 213"));

View File

@ -4,6 +4,7 @@ use rayon::prelude::*;
use chrono::prelude::*;
use url::{Position, Url};
use itertools::Itertools;
use errors::*;
use dbqueries;
@ -105,14 +106,20 @@ pub fn url_cleaner(s: &str) -> String {
}
}
/// Placeholder
// TODO: Docs
/// Helper function that strips extra spaces and newlines, and removes all tabs.
#[allow(match_same_arms)]
pub fn replace_extra_spaces(s: &str) -> String {
s.lines()
.map(|x| x.split_whitespace().collect::<Vec<_>>().join(" "))
.filter(|x| !x.is_empty())
.collect::<Vec<_>>()
.join("\n")
s.trim()
.chars()
.filter(|ch| *ch != '\t')
.coalesce(|current, next| match (current, next) {
('\n', '\n') => Ok('\n'),
('\n', ' ') => Ok('\n'),
(' ', '\n') => Ok('\n'),
(' ', ' ') => Ok(' '),
(_, _) => Err((current, next)),
})
.collect::<String>()
}
#[cfg(test)]

View File

@ -19,7 +19,7 @@ hard_tabs = false
spaces_within_parens = false
write_mode = "Overwrite"
merge_derives = true
condense_wildcard_suffixes = true
condense_wildcard_suffixes = false
format_strings = true
multiline_closure_forces_block = true
attributes_on_same_line_as_field = true