Strip whitespace during parsing.

This commit is contained in:
Jordan Petridis 2017-12-09 10:22:09 +02:00
parent 8b4684679b
commit 999a2a1fc1
No known key found for this signature in database
GPG Key ID: CEABAD9F5683B9A6
6 changed files with 38 additions and 14 deletions

10
Cargo.lock generated
View File

@ -568,6 +568,7 @@ dependencies = [
"diesel_migrations 0.99.0 (registry+https://github.com/rust-lang/crates.io-index)", "diesel_migrations 0.99.0 (registry+https://github.com/rust-lang/crates.io-index)",
"dotenv 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", "dotenv 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"r2d2 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", "r2d2 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -689,6 +690,14 @@ dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "itertools"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"either 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "itoa" name = "itoa"
version = "0.3.4" version = "0.3.4"
@ -1667,6 +1676,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum hyper-tls 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9c81fa95203e2a6087242c38691a0210f23e9f3f8f944350bd676522132e2985" "checksum hyper-tls 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9c81fa95203e2a6087242c38691a0210f23e9f3f8f944350bd676522132e2985"
"checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d" "checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d"
"checksum iovec 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b6e8b9c2247fcf6c6a1151f1156932be5606c9fd6f55a2d7f9fc1cb29386b2f7" "checksum iovec 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b6e8b9c2247fcf6c6a1151f1156932be5606c9fd6f55a2d7f9fc1cb29386b2f7"
"checksum itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "947aa0b9bb417792efa3936c5dada2d680b3bc27ea6a88ffa062f4c4d86ef8c5"
"checksum itoa 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8324a32baf01e2ae060e9de58ed0bc2320c9a2833491ee36cd3b4c414de4db8c" "checksum itoa 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8324a32baf01e2ae060e9de58ed0bc2320c9a2833491ee36cd3b4c414de4db8c"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a" "checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a"

View File

@ -10,6 +10,7 @@ chrono = "0.4.0"
derive_builder = "0.5.0" derive_builder = "0.5.0"
dotenv = "0.10.1" dotenv = "0.10.1"
error-chain = "0.11.0" error-chain = "0.11.0"
itertools = "0.7.4"
lazy_static = "1.0.0" lazy_static = "1.0.0"
log = "0.3.8" log = "0.3.8"
r2d2 = "0.8.1" r2d2 = "0.8.1"

View File

@ -10,6 +10,7 @@
//! A library for parsing, indexing and retrieving podcast Feeds, //! A library for parsing, indexing and retrieving podcast Feeds,
//! into and from a Database. //! into and from a Database.
#![allow(unknown_lints)]
#![deny(bad_style, const_err, dead_code, improper_ctypes, legacy_directory_ownership, #![deny(bad_style, const_err, dead_code, improper_ctypes, legacy_directory_ownership,
non_shorthand_field_patterns, no_mangle_generic_items, overflowing_literals, non_shorthand_field_patterns, no_mangle_generic_items, overflowing_literals,
path_statements, patterns_in_fns_without_body, plugin_as_library, private_in_public, path_statements, patterns_in_fns_without_body, plugin_as_library, private_in_public,
@ -39,6 +40,7 @@ extern crate derive_builder;
extern crate ammonia; extern crate ammonia;
extern crate chrono; extern crate chrono;
extern crate itertools;
extern crate r2d2; extern crate r2d2;
extern crate r2d2_diesel; extern crate r2d2_diesel;
extern crate rayon; extern crate rayon;

View File

@ -4,6 +4,7 @@ use rfc822_sanitizer::parse_from_rfc2822_with_fallback;
use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder}; use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder};
use utils::url_cleaner; use utils::url_cleaner;
use utils::replace_extra_spaces;
use errors::*; use errors::*;
@ -11,7 +12,7 @@ use errors::*;
/// Parses a `rss::Channel` into a `NewPodcast` Struct. /// Parses a `rss::Channel` into a `NewPodcast` Struct.
pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast { pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
let title = chan.title().trim(); let title = chan.title().trim();
let description = ammonia::clean(chan.description().trim()); let description = replace_extra_spaces(&ammonia::clean(chan.description()));
let link = url_cleaner(chan.link()); let link = url_cleaner(chan.link());
let x = chan.itunes_ext().map(|s| s.image()); let x = chan.itunes_ext().map(|s| s.image());
@ -34,7 +35,8 @@ pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
/// Parses an `rss::Item` into a `NewEpisode` Struct. /// Parses an `rss::Item` into a `NewEpisode` Struct.
pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> { pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> {
let title = item.title().map(|s| s.trim().to_owned()); let title = item.title().map(|s| s.trim().to_owned());
let description = item.description().map(|s| ammonia::clean(s.trim())); let description = item.description()
.map(|s| replace_extra_spaces(&ammonia::clean(s)));
let guid = item.guid().map(|s| s.value().trim().to_owned()); let guid = item.guid().map(|s| s.value().trim().to_owned());
// Its kinda weird this being an Option type. // Its kinda weird this being an Option type.
@ -92,7 +94,7 @@ mod tests {
let descr = "The people behind The Intercepts fearless reporting and incisive \ let descr = "The people behind The Intercepts fearless reporting and incisive \
commentaryJeremy Scahill, Glenn Greenwald, Betsy Reed and othersdiscuss \ commentaryJeremy Scahill, Glenn Greenwald, Betsy Reed and othersdiscuss \
the crucial issues of our time: national security, civil liberties, foreign \ the crucial issues of our time: national security, civil liberties, foreign \
policy, and criminal justice. Plus interviews with artists, thinkers, and \ policy, and criminal justice. Plus interviews with artists, thinkers, and \
newsmakers who challenge our preconceptions about the world we live in."; newsmakers who challenge our preconceptions about the world we live in.";
let pd = new_podcast(&channel, 0); let pd = new_podcast(&channel, 0);
@ -249,8 +251,10 @@ mod tests {
assert_eq!( assert_eq!(
i2.title(), i2.title(),
Some("The Breakthrough: Behind the Scenes of Hillary Clintons Failed Bid for \ Some(
President") "The Breakthrough: Behind the Scenes of Hillary Clintons Failed Bid for \
President"
)
); );
assert_eq!( assert_eq!(
i2.uri(), i2.uri(),
@ -298,7 +302,7 @@ mod tests {
let descr2 = "<p>The Gnome project is about to solve one of our audience's biggest \ let descr2 = "<p>The Gnome project is about to solve one of our audience's biggest \
Waylands concerns. But as the project takes on a new level of relevance, \ Waylands concerns. But as the project takes on a new level of relevance, \
decisions for the next version of Gnome have us worried about the \ decisions for the next version of Gnome have us worried about the \
future.</p>\n\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \ future.</p>\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
Microsofts sneaky move to turn Windows 10 into the ULTIMATE LINUX \ Microsofts sneaky move to turn Windows 10 into the ULTIMATE LINUX \
RUNTIME, community news &amp; more!</p>"; RUNTIME, community news &amp; more!</p>";
assert_eq!(i2.title(), Some("Gnome Does it Again | LUP 213")); assert_eq!(i2.title(), Some("Gnome Does it Again | LUP 213"));

View File

@ -4,6 +4,7 @@ use rayon::prelude::*;
use chrono::prelude::*; use chrono::prelude::*;
use url::{Position, Url}; use url::{Position, Url};
use itertools::Itertools;
use errors::*; use errors::*;
use dbqueries; use dbqueries;
@ -105,14 +106,20 @@ pub fn url_cleaner(s: &str) -> String {
} }
} }
/// Placeholder /// Helper function that strips extra spaces and newlines and all the tabs.
// TODO: Docs #[allow(match_same_arms)]
pub fn replace_extra_spaces(s: &str) -> String { pub fn replace_extra_spaces(s: &str) -> String {
s.lines() s.trim()
.map(|x| x.split_whitespace().collect::<Vec<_>>().join(" ")) .chars()
.filter(|x| !x.is_empty()) .filter(|ch| *ch != '\t')
.collect::<Vec<_>>() .coalesce(|current, next| match (current, next) {
.join("\n") ('\n', '\n') => Ok('\n'),
('\n', ' ') => Ok('\n'),
(' ', '\n') => Ok('\n'),
(' ', ' ') => Ok(' '),
(_, _) => Err((current, next)),
})
.collect::<String>()
} }
#[cfg(test)] #[cfg(test)]

View File

@ -19,7 +19,7 @@ hard_tabs = false
spaces_within_parens = false spaces_within_parens = false
write_mode = "Overwrite" write_mode = "Overwrite"
merge_derives = true merge_derives = true
condense_wildcard_suffixes = true condense_wildcard_suffixes = false
format_strings = true format_strings = true
multiline_closure_forces_block = true multiline_closure_forces_block = true
attributes_on_same_line_as_field = true attributes_on_same_line_as_field = true