podcasts/hammond-data/src/parser.rs
2017-12-07 06:38:31 +02:00

366 lines
15 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use ammonia;
use rss::{Channel, Item};
use rfc822_sanitizer::parse_from_rfc2822_with_fallback;
use models::insertables::{NewEpisode, NewEpisodeBuilder, NewPodcast, NewPodcastBuilder};
use utils::url_cleaner;
use errors::*;
// TODO: Extend the support for parsing itunes extensions
/// Parses a `rss::Channel` into a `NewPodcast` Struct.
///
/// The title and description are trimmed, the description is additionally passed through
/// `ammonia::clean`, and both the link and the image url are passed through `url_cleaner`.
///
/// The image url is taken from the itunes extension when it provides one, and from the
/// channel's own image otherwise. `image_uri` is `None` only when neither is available.
///
/// `source_id` is stored on the resulting `NewPodcast` as is.
///
/// # Panics
///
/// Panics if `NewPodcastBuilder::build` returns an error.
pub(crate) fn new_podcast(chan: &Channel, source_id: i32) -> NewPodcast {
    let title = chan.title().trim();
    let description = ammonia::clean(chan.description().trim());
    let link = url_cleaner(chan.link());

    // Prefer the itunes artwork, but still fall back to the channel image when the
    // itunes extension is present without an image of its own.
    let image_uri = chan.itunes_ext()
        .and_then(|ext| ext.image())
        .map(|url| url_cleaner(url))
        .or_else(|| chan.image().map(|img| url_cleaner(img.url())));

    NewPodcastBuilder::default()
        .title(title)
        .description(description)
        .link(link)
        .image_uri(image_uri)
        .source_id(source_id)
        .build()
        .unwrap()
}
/// Parses an `rss::Item` into a `NewEpisode` Struct.
///
/// * The title, description and guid are trimmed; the description is additionally passed
///   through `ammonia::clean`.
/// * The episode uri is the enclosure url when the item has an enclosure, and the item link
///   otherwise. Either one is passed through `url_cleaner`.
/// * The publication date is re-serialized as rfc2822. A missing date is treated as epoch 0,
///   and a date that fails to parse yields no `published_date` and an `epoch` of 0.
/// * The length is parsed from the enclosure, falling back to 0 when it is not a number.
///
/// `parent_id` is stored on the resulting `NewEpisode` as its `podcast_id`.
///
/// # Errors
///
/// Returns an error when the item has neither an enclosure nor a link.
///
/// # Panics
///
/// Panics if `NewEpisodeBuilder::build` returns an error.
pub(crate) fn new_episode(item: &Item, parent_id: i32) -> Result<NewEpisode> {
    let title = item.title().map(|s| s.trim().to_owned());
    let description = item.description().map(|s| ammonia::clean(s.trim()));
    let guid = item.guid().map(|s| s.value().trim().to_owned());

    // Its kinda weird this being an Option type.
    // Rss 2.0 specified that it's optional.
    // Though the db schema has a requirement of episode uri being Unique && Not Null.
    // TODO: Restructure
    let uri = match item.enclosure().map(|e| e.url()).or_else(|| item.link()) {
        Some(url) => url_cleaner(url),
        None => bail!("No url specified for the item."),
    };

    let date = parse_from_rfc2822_with_fallback(
        // Default to rfc2822 representation of epoch 0.
        item.pub_date().unwrap_or("Thu, 1 Jan 1970 00:00:00 +0000"),
    );

    // Should treat information from the rss feeds as invalid by default.
    // Case: Thu, 05 Aug 2016 06:00:00 -0400  <-- Actually that was friday.
    let (pub_date, epoch) = match date {
        Ok(parsed) => (
            Some(parsed.to_rfc2822()),
            saturating_epoch(parsed.timestamp()),
        ),
        Err(_) => (None, 0),
    };

    let length = item.enclosure().map(|x| x.length().parse().unwrap_or(0));

    Ok(
        NewEpisodeBuilder::default()
            .title(title)
            .uri(uri)
            .description(description)
            .length(length)
            .published_date(pub_date)
            .epoch(epoch)
            .guid(guid)
            .podcast_id(parent_id)
            .build()
            .unwrap()
    )
}

/// Narrows a unix timestamp to `i32`, saturating at the bounds instead of wrapping around.
fn saturating_epoch(timestamp: i64) -> i32 {
    if timestamp > i64::from(i32::max_value()) {
        i32::max_value()
    } else if timestamp < i64::from(i32::min_value()) {
        i32::min_value()
    } else {
        // In range, so the cast is lossless.
        timestamp as i32
    }
}
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::BufReader;
    use rss::Channel;
    use super::*;

    // NOTE(review): a few of the expected strings below read as if a typographic apostrophe
    // went missing (e.g. "Intercepts fearless", "didnt know") -- confirm them against the
    // fixture feeds before trusting a failure here.

    /// Opens the feed fixture at `path` and parses it into a `Channel`.
    ///
    /// Panics when the file cannot be opened or parsed.
    fn load_feed(path: &str) -> Channel {
        let reader = BufReader::new(File::open(path).unwrap());
        Channel::read_from(reader).unwrap()
    }

    #[test]
    fn test_new_podcast_intercepted() {
        let feed = load_feed("tests/feeds/Intercepted.xml");

        let expected_description =
            "The people behind The Intercepts fearless reporting and incisive \
             commentary—Jeremy Scahill, Glenn Greenwald, Betsy Reed and others—discuss \
             the crucial issues of our time: national security, civil liberties, foreign \
             policy, and criminal justice. Plus interviews with artists, thinkers, and \
             newsmakers who challenge our preconceptions about the world we live in.";
        let expected_image =
            "http://static.megaphone.fm/podcasts/d5735a50-d904-11e6-8532-73c7de466ea6/image/\
             uploads_2F1484252190700-qhn5krasklbce3dh-a797539282700ea0298a3a26f7e49b0b_\
             2FIntercepted_COVER%2B_281_29.png";

        let podcast = new_podcast(&feed, 0);

        assert_eq!(podcast.title(), "Intercepted with Jeremy Scahill");
        assert_eq!(podcast.link(), "https://theintercept.com/podcasts");
        assert_eq!(podcast.description(), expected_description);
        assert_eq!(podcast.image_uri(), Some(expected_image));
    }

    #[test]
    fn test_new_podcast_breakthrough() {
        let feed = load_feed("tests/feeds/TheBreakthrough.xml");

        let expected_description =
            "Latest Articles and Investigations from ProPublica, an independent, \
             non-profit newsroom that produces investigative journalism in the public \
             interest.";

        let podcast = new_podcast(&feed, 0);

        assert_eq!(podcast.title(), "The Breakthrough");
        assert_eq!(podcast.link(), "http://www.propublica.org/podcast");
        assert_eq!(podcast.description(), expected_description);
        assert_eq!(
            podcast.image_uri(),
            Some("http://www.propublica.org/images/podcast_logo_2.png")
        );
    }

    #[test]
    fn test_new_podcast_lup() {
        let feed = load_feed("tests/feeds/LinuxUnplugged.xml");

        let expected_description =
            "An open show powered by community LINUX Unplugged takes the best attributes \
             of open collaboration and focuses them into a weekly lifestyle show about \
             Linux.";

        let podcast = new_podcast(&feed, 0);

        assert_eq!(podcast.title(), "LINUX Unplugged Podcast");
        assert_eq!(podcast.link(), "http://www.jupiterbroadcasting.com/");
        assert_eq!(podcast.description(), expected_description);
        assert_eq!(
            podcast.image_uri(),
            Some("http://www.jupiterbroadcasting.com/images/LASUN-Badge1400.jpg")
        );
    }

    #[test]
    fn test_new_podcast_r4explanation() {
        let feed = load_feed("tests/feeds/R4Explanation.xml");

        let podcast = new_podcast(&feed, 0);
        let expected_description = "A weekly discussion of Rust RFCs";

        assert_eq!(podcast.title(), "Request For Explanation");
        assert_eq!(
            podcast.link(),
            "https://request-for-explanation.github.io/podcast/"
        );
        assert_eq!(podcast.description(), expected_description);
        assert_eq!(
            podcast.image_uri(),
            Some("https://request-for-explanation.github.io/podcast/podcast.png")
        );
    }

    #[test]
    fn test_new_episode_intercepted() {
        let feed = load_feed("tests/feeds/Intercepted.xml");

        // Most recent episode in the fixture.
        let first_item = feed.items().first().unwrap();
        let expected_first =
            "NSA whistleblower Edward Snowden discusses the massive Equifax data breach \
             and allegations of Russian interference in the US election. Commentator \
             Shaun King explains his call for a boycott of the NFL and talks about his \
             campaign to bring violent neo-Nazis to justice. Rapper Open Mike Eagle \
             performs.";
        let first = new_episode(first_item, 0).unwrap();

        assert_eq!(first.title(), Some("The Super Bowl of Racism"));
        assert_eq!(first.uri(), "http://traffic.megaphone.fm/PPY6458293736.mp3");
        assert_eq!(first.description(), Some(expected_first));
        assert_eq!(first.length(), Some(66738886));
        assert_eq!(first.guid(), Some("7df4070a-9832-11e7-adac-cb37b05d5e24"));
        assert_eq!(
            first.published_date(),
            Some("Wed, 13 Sep 2017 10:00:00 +0000")
        );
        assert_eq!(first.epoch(), 1505296800);

        // Second item of the same feed.
        let second_item = feed.items().get(1).unwrap();
        let second = new_episode(second_item, 0).unwrap();
        let expected_second =
            "This week on Intercepted: Jeremy gives an update on the aftermath of \
             Blackwaters 2007 massacre of Iraqi civilians. Intercept reporter Lee Fang \
             lays out how a network of libertarian think tanks called the Atlas Network \
             is insidiously shaping political infrastructure in Latin America. We speak \
             with attorney and former Hugo Chavez adviser Eva Golinger about the \
             Venezuela\'s political turmoil.And we hear Claudia Lizardo of the \
             Caracas-based band, La Pequeña Revancha, talk about her music and hopes for \
             Venezuela.";

        assert_eq!(
            second.title(),
            Some("Atlas Golfed — U.S.-Backed Think Tanks Target Latin America")
        );
        assert_eq!(second.uri(), "http://traffic.megaphone.fm/FL5331443769.mp3");
        assert_eq!(second.description(), Some(expected_second));
        assert_eq!(second.length(), Some(67527575));
        assert_eq!(second.guid(), Some("7c207a24-e33f-11e6-9438-eb45dcf36a1d"));
        assert_eq!(
            second.published_date(),
            Some("Wed, 9 Aug 2017 10:00:00 +0000")
        );
        assert_eq!(second.epoch(), 1502272800);
    }

    #[test]
    fn test_new_episode_breakthrough() {
        let feed = load_feed("tests/feeds/TheBreakthrough.xml");

        // Most recent episode in the fixture.
        let first_item = feed.items().first().unwrap();
        let expected_first =
            "<p>A reporter finds that homes meant to replace New Yorks troubled \
             psychiatric hospitals might be just as bad.</p>";
        let first = new_episode(first_item, 0).unwrap();

        assert_eq!(
            first.title(),
            Some("The Breakthrough: Hopelessness and Exploitation Inside Homes for Mentally Ill")
        );
        assert_eq!(
            first.uri(),
            "http://tracking.feedpress.it/link/10581/6726758/20170908-cliff-levy.mp3"
        );
        assert_eq!(first.description(), Some(expected_first));
        assert_eq!(first.length(), Some(33396551));
        assert_eq!(
            first.guid(),
            Some(
                "https://www.propublica.org/podcast/\
                 the-breakthrough-hopelessness-exploitation-homes-for-mentally-ill#134472"
            )
        );
        assert_eq!(
            first.published_date(),
            Some("Fri, 8 Sep 2017 12:00:00 +0000")
        );
        assert_eq!(first.epoch(), 1504872000);

        // Second item of the same feed.
        let second_item = feed.items().get(1).unwrap();
        let second = new_episode(second_item, 0).unwrap();
        let expected_second =
            "<p>Jonathan Allen and Amie Parnes didnt know their book would be called \
             Shattered, or that their extraordinary access would let them chronicle \
             the mounting signs of a doomed campaign.</p>";

        assert_eq!(
            second.title(),
            Some("The Breakthrough: Behind the Scenes of Hillary Clintons Failed Bid for \
                  President")
        );
        assert_eq!(
            second.uri(),
            "http://tracking.feedpress.it/link/10581/6726759/16_JohnAllen-CRAFT.mp3"
        );
        assert_eq!(second.description(), Some(expected_second));
        assert_eq!(second.length(), Some(17964071));
        assert_eq!(
            second.guid(),
            Some(
                "https://www.propublica.\
                 org/podcast/the-breakthrough-hillary-clinton-failed-presidential-bid#133721"
            )
        );
        assert_eq!(
            second.published_date(),
            Some("Fri, 25 Aug 2017 12:00:00 +0000")
        );
        assert_eq!(second.epoch(), 1503662400);
    }

    #[test]
    fn test_new_episode_lup() {
        let feed = load_feed("tests/feeds/LinuxUnplugged.xml");

        // Most recent episode in the fixture.
        let first_item = feed.items().first().unwrap();
        let expected_first =
            "Audit your network with a couple of easy commands on Kali Linux. Chris \
             decides to blow off a little steam by attacking his IoT devices, Wes has \
             the scope on Equifax blaming open source &amp; the Beard just saved the \
             show. Its a really packed episode!";
        let first = new_episode(first_item, 0).unwrap();

        assert_eq!(
            first.title(),
            Some("Hacking Devices with Kali Linux | LUP 214")
        );
        assert_eq!(
            first.uri(),
            "http://www.podtrac.com/pts/redirect.mp3/traffic.libsyn.com/jnite/lup-0214.mp3"
        );
        assert_eq!(first.description(), Some(expected_first));
        assert_eq!(first.length(), Some(46479789));
        assert_eq!(first.guid(), Some("78A682B4-73E8-47B8-88C0-1BE62DD4EF9D"));
        assert_eq!(
            first.published_date(),
            Some("Tue, 12 Sep 2017 22:24:42 -0700")
        );
        assert_eq!(first.epoch(), 1505280282);

        // Second item of the same feed.
        let second_item = feed.items().get(1).unwrap();
        let second = new_episode(second_item, 0).unwrap();
        let expected_second =
            "<p>The Gnome project is about to solve one of our audience's biggest \
             Waylands concerns. But as the project takes on a new level of relevance, \
             decisions for the next version of Gnome have us worried about the \
             future.</p>\n\n<p>Plus we chat with Wimpy about the Ubuntu Rally in NYC, \
             Microsofts sneaky move to turn Windows 10 into the “ULTIMATE LINUX \
             RUNTIME”, community news &amp; more!</p>";

        assert_eq!(second.title(), Some("Gnome Does it Again | LUP 213"));
        assert_eq!(
            second.uri(),
            "http://www.podtrac.com/pts/redirect.mp3/traffic.libsyn.com/jnite/lup-0213.mp3"
        );
        assert_eq!(second.description(), Some(expected_second));
        assert_eq!(second.length(), Some(36544272));
        assert_eq!(second.guid(), Some("1CE57548-B36C-4F14-832A-5D5E0A24E35B"));
        assert_eq!(
            second.published_date(),
            Some("Tue, 5 Sep 2017 20:57:27 -0700")
        );
        assert_eq!(second.epoch(), 1504670247);
    }

    #[test]
    fn test_new_episode_r4expanation() {
        let feed = load_feed("tests/feeds/R4Explanation.xml");

        // Item at index 9 of the fixture.
        let ninth_item = feed.items().get(9).unwrap();
        let expected_ninth =
            "This week we look at <a \
             href=\"https://github.com/rust-lang/rfcs/pull/2094\" rel=\"noopener \
             noreferrer\">RFC 2094</a> \"Non-lexical lifetimes\"";
        let ninth = new_episode(ninth_item, 0).unwrap();

        assert_eq!(ninth.title(), Some("Episode #9 - A Once in a Lifetime RFC"));
        assert_eq!(
            ninth.uri(),
            "http://request-for-explanation.github.\
             io/podcast/ep9-a-once-in-a-lifetime-rfc/episode.mp3"
        );
        assert_eq!(ninth.description(), Some(expected_ninth));
        assert_eq!(ninth.length(), Some(15077388));
        assert_eq!(
            ninth.guid(),
            Some("https://request-for-explanation.github.io/podcast/ep9-a-once-in-a-lifetime-rfc/")
        );
        assert_eq!(
            ninth.published_date(),
            Some("Mon, 28 Aug 2017 15:00:00 -0700")
        );
        assert_eq!(ninth.epoch(), 1503957600);

        // Item at index 8 of the fixture.
        let eighth_item = feed.items().get(8).unwrap();
        let eighth = new_episode(eighth_item, 0).unwrap();
        let expected_eighth =
            "This week we look at <a href=\"https://github.com/rust-lang/rfcs/pull/2071\" \
             rel=\"noopener noreferrer\">RFC 2071</a> \"Add impl Trait type alias and \
             variable declarations\"";

        assert_eq!(eighth.title(), Some("Episode #8 - An Existential Crisis"));
        assert_eq!(
            eighth.uri(),
            "http://request-for-explanation.github.io/podcast/ep8-an-existential-crisis/episode.\
             mp3"
        );
        assert_eq!(eighth.description(), Some(expected_eighth));
        assert_eq!(eighth.length(), Some(13713219));
        assert_eq!(
            eighth.guid(),
            Some("https://request-for-explanation.github.io/podcast/ep8-an-existential-crisis/")
        );
        assert_eq!(
            eighth.published_date(),
            Some("Tue, 15 Aug 2017 17:00:00 -0700")
        );
        assert_eq!(eighth.epoch(), 1502841600);
    }
}