From 999a2a1fc1cf202e7ec34b8d87b8f5b3c717c4a6 Mon Sep 17 00:00:00 2001 From: Jordan Petridis Date: Sat, 9 Dec 2017 10:22:09 +0200 Subject: [PATCH] Strip whitespace during parsing. --- Cargo.lock | 10 ++++++++++ hammond-data/Cargo.toml | 1 + hammond-data/src/lib.rs | 2 ++ hammond-data/src/parser.rs | 16 ++++++++++------ hammond-data/src/utils.rs | 21 ++++++++++++++------- rustfmt.toml | 2 +- 6 files changed, 38 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f6037b3..910f55f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -568,6 +568,7 @@ dependencies = [ "diesel_migrations 0.99.0 (registry+https://github.com/rust-lang/crates.io-index)", "dotenv 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", "r2d2 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -689,6 +690,14 @@ dependencies = [ "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "itertools" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "either 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "itoa" version = "0.3.4" @@ -1667,6 +1676,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum hyper-tls 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9c81fa95203e2a6087242c38691a0210f23e9f3f8f944350bd676522132e2985" "checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d" "checksum iovec 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"b6e8b9c2247fcf6c6a1151f1156932be5606c9fd6f55a2d7f9fc1cb29386b2f7" +"checksum itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "947aa0b9bb417792efa3936c5dada2d680b3bc27ea6a88ffa062f4c4d86ef8c5" "checksum itoa 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8324a32baf01e2ae060e9de58ed0bc2320c9a2833491ee36cd3b4c414de4db8c" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a" diff --git a/hammond-data/Cargo.toml b/hammond-data/Cargo.toml index 34e41c4..c28e25f 100644 --- a/hammond-data/Cargo.toml +++ b/hammond-data/Cargo.toml @@ -10,6 +10,7 @@ chrono = "0.4.0" derive_builder = "0.5.0" dotenv = "0.10.1" error-chain = "0.11.0" +itertools = "0.7.4" lazy_static = "1.0.0" log = "0.3.8" r2d2 = "0.8.1" diff --git a/hammond-data/src/lib.rs b/hammond-data/src/lib.rs index 9828740..c562fbd 100644 --- a/hammond-data/src/lib.rs +++ b/hammond-data/src/lib.rs @@ -10,6 +10,7 @@ //! A libraty for parsing, indexing and retrieving podcast Feeds, //! into and from a Database. 
+#![allow(unknown_lints)] #![deny(bad_style, const_err, dead_code, improper_ctypes, legacy_directory_ownership, non_shorthand_field_patterns, no_mangle_generic_items, overflowing_literals, path_statements, patterns_in_fns_without_body, plugin_as_library, private_in_public, @@ -39,6 +40,7 @@ extern crate derive_builder; extern crate ammonia; extern crate chrono; +extern crate itertools; extern crate r2d2; extern crate r2d2_diesel; extern crate rayon; diff --git a/hammond-data/src/parser.rs b/hammond-data/src/parser.rs index 97043a5..9d2f8ed 100644 --- a/hammond-data/src/parser.rs +++ b/hammond-data/src/parser.rs @@ -4,6 +4,7 @@ use rfc822_sanitizer::parse_from_rfc2822_with_fallback; use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder}; use utils::url_cleaner; +use utils::replace_extra_spaces; use errors::*; @@ -11,7 +12,7 @@ use errors::*; /// Parses a `rss::Channel` into a `NewPodcast` Struct. pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast { let title = chan.title().trim(); - let description = ammonia::clean(chan.description().trim()); + let description = replace_extra_spaces(&ammonia::clean(chan.description())); let link = url_cleaner(chan.link()); let x = chan.itunes_ext().map(|s| s.image()); @@ -34,7 +35,8 @@ pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast { /// Parses an `rss::Item` into a `NewEpisode` Struct. pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> { let title = item.title().map(|s| s.trim().to_owned()); - let description = item.description().map(|s| ammonia::clean(s.trim())); + let description = item.description() + .map(|s| replace_extra_spaces(&ammonia::clean(s))); let guid = item.guid().map(|s| s.value().trim().to_owned()); // Its kinda weird this being an Option type. 
@@ -92,7 +94,7 @@ mod tests { let descr = "The people behind The Intercept’s fearless reporting and incisive \ commentary—Jeremy Scahill, Glenn Greenwald, Betsy Reed and others—discuss \ the crucial issues of our time: national security, civil liberties, foreign \ - policy, and criminal justice. Plus interviews with artists, thinkers, and \ + policy, and criminal justice. Plus interviews with artists, thinkers, and \ newsmakers who challenge our preconceptions about the world we live in."; let pd = new_podcast(&channel, 0); @@ -249,8 +251,10 @@ mod tests { assert_eq!( i2.title(), - Some("The Breakthrough: Behind the Scenes of Hillary Clinton’s Failed Bid for \ - President") + Some( + "The Breakthrough: Behind the Scenes of Hillary Clinton’s Failed Bid for \ + President" + ) ); assert_eq!( i2.uri(), @@ -298,7 +302,7 @@ mod tests { let descr2 = "

The Gnome project is about to solve one of our audience's biggest \ Wayland’s concerns. But as the project takes on a new level of relevance, \ decisions for the next version of Gnome have us worried about the \ - future.

\n\n

Plus we chat with Wimpy about the Ubuntu Rally in NYC, \ + future.

\n

Plus we chat with Wimpy about the Ubuntu Rally in NYC, \ Microsoft’s sneaky move to turn Windows 10 into the “ULTIMATE LINUX \ RUNTIME”, community news & more!

"; assert_eq!(i2.title(), Some("Gnome Does it Again | LUP 213")); diff --git a/hammond-data/src/utils.rs b/hammond-data/src/utils.rs index d297102..72ef146 100644 --- a/hammond-data/src/utils.rs +++ b/hammond-data/src/utils.rs @@ -4,6 +4,7 @@ use rayon::prelude::*; use chrono::prelude::*; use url::{Position, Url}; +use itertools::Itertools; use errors::*; use dbqueries; @@ -105,14 +106,20 @@ pub fn url_cleaner(s: &str) -> String { } } -/// Placeholder -// TODO: Docs +/// Helper function that strips extra spaces and newlines and all the tabs. +#[allow(match_same_arms)] pub fn replace_extra_spaces(s: &str) -> String { - s.lines() - .map(|x| x.split_whitespace().collect::<Vec<_>>().join(" ")) - .filter(|x| !x.is_empty()) - .collect::<Vec<_>>() - .join("\n") + s.trim() + .chars() + .filter(|ch| *ch != '\t') + .coalesce(|current, next| match (current, next) { + ('\n', '\n') => Ok('\n'), + ('\n', ' ') => Ok('\n'), + (' ', '\n') => Ok('\n'), + (' ', ' ') => Ok(' '), + (_, _) => Err((current, next)), + }) + .collect::<String>() } #[cfg(test)] diff --git a/rustfmt.toml b/rustfmt.toml index 477cd1c..7f1b8d9 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -19,7 +19,7 @@ hard_tabs = false spaces_within_parens = false write_mode = "Overwrite" merge_derives = true -condense_wildcard_suffixes = true +condense_wildcard_suffixes = false format_strings = true multiline_closure_forces_block = true attributes_on_same_line_as_field = true