Strip whitespace during parsing.
This commit is contained in:
parent
8b4684679b
commit
999a2a1fc1
10
Cargo.lock
generated
10
Cargo.lock
generated
@ -568,6 +568,7 @@ dependencies = [
|
|||||||
"diesel_migrations 0.99.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"diesel_migrations 0.99.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"dotenv 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"dotenv 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
"log 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"r2d2 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"r2d2 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -689,6 +690,14 @@ dependencies = [
|
|||||||
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itertools"
|
||||||
|
version = "0.7.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"either 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itoa"
|
name = "itoa"
|
||||||
version = "0.3.4"
|
version = "0.3.4"
|
||||||
@ -1667,6 +1676,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum hyper-tls 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9c81fa95203e2a6087242c38691a0210f23e9f3f8f944350bd676522132e2985"
|
"checksum hyper-tls 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9c81fa95203e2a6087242c38691a0210f23e9f3f8f944350bd676522132e2985"
|
||||||
"checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d"
|
"checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d"
|
||||||
"checksum iovec 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b6e8b9c2247fcf6c6a1151f1156932be5606c9fd6f55a2d7f9fc1cb29386b2f7"
|
"checksum iovec 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b6e8b9c2247fcf6c6a1151f1156932be5606c9fd6f55a2d7f9fc1cb29386b2f7"
|
||||||
|
"checksum itertools 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "947aa0b9bb417792efa3936c5dada2d680b3bc27ea6a88ffa062f4c4d86ef8c5"
|
||||||
"checksum itoa 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8324a32baf01e2ae060e9de58ed0bc2320c9a2833491ee36cd3b4c414de4db8c"
|
"checksum itoa 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8324a32baf01e2ae060e9de58ed0bc2320c9a2833491ee36cd3b4c414de4db8c"
|
||||||
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
|
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
|
||||||
"checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a"
|
"checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a"
|
||||||
|
|||||||
@ -10,6 +10,7 @@ chrono = "0.4.0"
|
|||||||
derive_builder = "0.5.0"
|
derive_builder = "0.5.0"
|
||||||
dotenv = "0.10.1"
|
dotenv = "0.10.1"
|
||||||
error-chain = "0.11.0"
|
error-chain = "0.11.0"
|
||||||
|
itertools = "0.7.4"
|
||||||
lazy_static = "1.0.0"
|
lazy_static = "1.0.0"
|
||||||
log = "0.3.8"
|
log = "0.3.8"
|
||||||
r2d2 = "0.8.1"
|
r2d2 = "0.8.1"
|
||||||
|
|||||||
@ -10,6 +10,7 @@
|
|||||||
//! A library for parsing, indexing and retrieving podcast Feeds,
|
//! A library for parsing, indexing and retrieving podcast Feeds,
|
||||||
//! into and from a Database.
|
//! into and from a Database.
|
||||||
|
|
||||||
|
#![allow(unknown_lints)]
|
||||||
#![deny(bad_style, const_err, dead_code, improper_ctypes, legacy_directory_ownership,
|
#![deny(bad_style, const_err, dead_code, improper_ctypes, legacy_directory_ownership,
|
||||||
non_shorthand_field_patterns, no_mangle_generic_items, overflowing_literals,
|
non_shorthand_field_patterns, no_mangle_generic_items, overflowing_literals,
|
||||||
path_statements, patterns_in_fns_without_body, plugin_as_library, private_in_public,
|
path_statements, patterns_in_fns_without_body, plugin_as_library, private_in_public,
|
||||||
@ -39,6 +40,7 @@ extern crate derive_builder;
|
|||||||
|
|
||||||
extern crate ammonia;
|
extern crate ammonia;
|
||||||
extern crate chrono;
|
extern crate chrono;
|
||||||
|
extern crate itertools;
|
||||||
extern crate r2d2;
|
extern crate r2d2;
|
||||||
extern crate r2d2_diesel;
|
extern crate r2d2_diesel;
|
||||||
extern crate rayon;
|
extern crate rayon;
|
||||||
|
|||||||
@ -4,6 +4,7 @@ use rfc822_sanitizer::parse_from_rfc2822_with_fallback;
|
|||||||
|
|
||||||
use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder};
|
use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder};
|
||||||
use utils::url_cleaner;
|
use utils::url_cleaner;
|
||||||
|
use utils::replace_extra_spaces;
|
||||||
|
|
||||||
use errors::*;
|
use errors::*;
|
||||||
|
|
||||||
@ -11,7 +12,7 @@ use errors::*;
|
|||||||
/// Parses a `rss::Channel` into a `NewPodcast` Struct.
|
/// Parses a `rss::Channel` into a `NewPodcast` Struct.
|
||||||
pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
|
pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
|
||||||
let title = chan.title().trim();
|
let title = chan.title().trim();
|
||||||
let description = ammonia::clean(chan.description().trim());
|
let description = replace_extra_spaces(&ammonia::clean(chan.description()));
|
||||||
|
|
||||||
let link = url_cleaner(chan.link());
|
let link = url_cleaner(chan.link());
|
||||||
let x = chan.itunes_ext().map(|s| s.image());
|
let x = chan.itunes_ext().map(|s| s.image());
|
||||||
@ -34,7 +35,8 @@ pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
|
|||||||
/// Parses an `rss::Item` into a `NewEpisode` Struct.
|
/// Parses an `rss::Item` into a `NewEpisode` Struct.
|
||||||
pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> {
|
pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> {
|
||||||
let title = item.title().map(|s| s.trim().to_owned());
|
let title = item.title().map(|s| s.trim().to_owned());
|
||||||
let description = item.description().map(|s| ammonia::clean(s.trim()));
|
let description = item.description()
|
||||||
|
.map(|s| replace_extra_spaces(&ammonia::clean(s)));
|
||||||
let guid = item.guid().map(|s| s.value().trim().to_owned());
|
let guid = item.guid().map(|s| s.value().trim().to_owned());
|
||||||
|
|
||||||
// It's kinda weird this being an Option type.
|
// It's kinda weird this being an Option type.
|
||||||
@ -92,7 +94,7 @@ mod tests {
|
|||||||
let descr = "The people behind The Intercept’s fearless reporting and incisive \
|
let descr = "The people behind The Intercept’s fearless reporting and incisive \
|
||||||
commentary—Jeremy Scahill, Glenn Greenwald, Betsy Reed and others—discuss \
|
commentary—Jeremy Scahill, Glenn Greenwald, Betsy Reed and others—discuss \
|
||||||
the crucial issues of our time: national security, civil liberties, foreign \
|
the crucial issues of our time: national security, civil liberties, foreign \
|
||||||
policy, and criminal justice. Plus interviews with artists, thinkers, and \
|
policy, and criminal justice. Plus interviews with artists, thinkers, and \
|
||||||
newsmakers who challenge our preconceptions about the world we live in.";
|
newsmakers who challenge our preconceptions about the world we live in.";
|
||||||
let pd = new_podcast(&channel, 0);
|
let pd = new_podcast(&channel, 0);
|
||||||
|
|
||||||
@ -249,8 +251,10 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
i2.title(),
|
i2.title(),
|
||||||
Some("The Breakthrough: Behind the Scenes of Hillary Clinton’s Failed Bid for \
|
Some(
|
||||||
President")
|
"The Breakthrough: Behind the Scenes of Hillary Clinton’s Failed Bid for \
|
||||||
|
President"
|
||||||
|
)
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
i2.uri(),
|
i2.uri(),
|
||||||
@ -298,7 +302,7 @@ mod tests {
|
|||||||
let descr2 = "<p>The Gnome project is about to solve one of our audience's biggest \
|
let descr2 = "<p>The Gnome project is about to solve one of our audience's biggest \
|
||||||
Wayland’s concerns. But as the project takes on a new level of relevance, \
|
Wayland’s concerns. But as the project takes on a new level of relevance, \
|
||||||
decisions for the next version of Gnome have us worried about the \
|
decisions for the next version of Gnome have us worried about the \
|
||||||
future.</p>\n\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
|
future.</p>\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
|
||||||
Microsoft’s sneaky move to turn Windows 10 into the “ULTIMATE LINUX \
|
Microsoft’s sneaky move to turn Windows 10 into the “ULTIMATE LINUX \
|
||||||
RUNTIME”, community news & more!</p>";
|
RUNTIME”, community news & more!</p>";
|
||||||
assert_eq!(i2.title(), Some("Gnome Does it Again | LUP 213"));
|
assert_eq!(i2.title(), Some("Gnome Does it Again | LUP 213"));
|
||||||
|
|||||||
@ -4,6 +4,7 @@ use rayon::prelude::*;
|
|||||||
use chrono::prelude::*;
|
use chrono::prelude::*;
|
||||||
|
|
||||||
use url::{Position, Url};
|
use url::{Position, Url};
|
||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
use errors::*;
|
use errors::*;
|
||||||
use dbqueries;
|
use dbqueries;
|
||||||
@ -105,14 +106,20 @@ pub fn url_cleaner(s: &str) -> String {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Placeholder
|
/// Helper function that strips extra spaces and newlines and all the tabs.
|
||||||
// TODO: Docs
|
#[allow(match_same_arms)]
|
||||||
pub fn replace_extra_spaces(s: &str) -> String {
|
pub fn replace_extra_spaces(s: &str) -> String {
|
||||||
s.lines()
|
s.trim()
|
||||||
.map(|x| x.split_whitespace().collect::<Vec<_>>().join(" "))
|
.chars()
|
||||||
.filter(|x| !x.is_empty())
|
.filter(|ch| *ch != '\t')
|
||||||
.collect::<Vec<_>>()
|
.coalesce(|current, next| match (current, next) {
|
||||||
.join("\n")
|
('\n', '\n') => Ok('\n'),
|
||||||
|
('\n', ' ') => Ok('\n'),
|
||||||
|
(' ', '\n') => Ok('\n'),
|
||||||
|
(' ', ' ') => Ok(' '),
|
||||||
|
(_, _) => Err((current, next)),
|
||||||
|
})
|
||||||
|
.collect::<String>()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@ -19,7 +19,7 @@ hard_tabs = false
|
|||||||
spaces_within_parens = false
|
spaces_within_parens = false
|
||||||
write_mode = "Overwrite"
|
write_mode = "Overwrite"
|
||||||
merge_derives = true
|
merge_derives = true
|
||||||
condense_wildcard_suffixes = true
|
condense_wildcard_suffixes = false
|
||||||
format_strings = true
|
format_strings = true
|
||||||
multiline_closure_forces_block = true
|
multiline_closure_forces_block = true
|
||||||
attributes_on_same_line_as_field = true
|
attributes_on_same_line_as_field = true
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user