From 07d3135d91bd2972996c2c141f20f734ed7c538c Mon Sep 17 00:00:00 2001 From: Jordan Petridis Date: Thu, 28 Sep 2017 13:47:41 +0300 Subject: [PATCH] Factored out rfc822 sanitization to its own crate, also fucked up a bit some of the NewEpisode and Episode api's. --- Cargo.toml | 1 + src/feedparser.rs | 134 +++++++++++++++------------------------------- src/index_feed.rs | 6 ++- src/lib.rs | 1 + src/models.rs | 9 ++-- 5 files changed, 55 insertions(+), 96 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 921fd72..bca9f82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Jordan Petridis "] [dependencies] +rfc822_sanitizer = "0.1.0" rayon = "0.8.2" regex = "0.2" error-chain = "0.11.0" diff --git a/src/feedparser.rs b/src/feedparser.rs index 81262fc..41e7984 100644 --- a/src/feedparser.rs +++ b/src/feedparser.rs @@ -1,10 +1,8 @@ use rss::{Channel, Item}; -use chrono::DateTime; -use regex; +use rfc822_sanitizer::parse_from_rfc2822_with_fallback; use models; use errors::*; -use std::collections::HashMap; pub fn parse_podcast(chan: &Channel, source_id: i32) -> Result { let title = chan.title().to_owned(); @@ -31,32 +29,11 @@ pub fn parse_podcast(chan: &Channel, source_id: i32) -> Result(item: &'a Item, parent_id: i32) -> Result> { - let weekdays = vec![ - "Mon,", "Tue,", "Wed,", "Thu,", "Fri,", "Sat,", "Sun,", "Monday,", "Tuesday,", - "Wednesday,", "Thursday,", "Friday,", "Saturday,", "Sunday,", - ]; - - let mut months = HashMap::new(); - months.insert("January", "Jan"); - months.insert("February", "Feb"); - months.insert("March", "Mar"); - months.insert("April ", "Apr"); - months.insert("May", "May"); - months.insert("June", "Jun"); - months.insert("July", "Jul"); - months.insert("August", "Aug"); - months.insert("September", "Sep"); - months.insert("October", "Oct"); - months.insert("November", "Nov"); - months.insert("December", "Dec"); - let title = item.title(); - let description = item.description(); let guid = item.guid().map(|x| x.value()); let mut uri = item.enclosure().map(|x| x.url()); - if uri == None { uri = item.link(); } @@ -66,66 +43,17 @@ pub fn parse_episode<'a>(item: &'a Item, parent_id: i32) -> Result Some(foo.to_rfc2822()), + Err(_) => None, + }; - let epoch = match pub_date { - Some(abc) => { - let mut foo = String::from(abc); - - // Pad HH:MM:SS with exta zero if needed. - let re = regex::Regex::new(r"(\d{1,2}):(\d{1,2}):(\d{1,2})")?; - // hours, minutes, seconds = cap[1], cap[2], cap[3] - let cap = re.captures(&abc).unwrap(); - let mut newtime = Vec::new(); - - cap.iter().skip(1).for_each(|x| if let Some(y) = x { - if y.end() - y.start() == 1 { - // warn!("{}", y.as_str()); - newtime.push(format!("0{}", y.as_str())); - } else { - newtime.push(y.as_str().to_string()); - } - }); - - let ntime = &newtime.join(":"); - foo = foo.replace(cap.get(0).unwrap().as_str(), ntime); - debug!("{:?}", foo); - - // Weekday name is not required for rfc2822 - // for stable, replace for_each with map and add - // .fold((), |(),_|()) or .collect() - weekdays.iter().for_each(|x| if foo.starts_with(x) { - // TODO: handle to lower etc. - // For sure someone has a weird feed with the day in lowercase - foo = format!("{}", &foo[x.len()..]); - foo = foo.trim().to_string(); - }); - - // Replace long month names with 3 letter Abr. - months.iter().for_each(|(k, v)| if abc.contains(k) { - foo = foo.replace(k, v); - }); - - // See #102, https://github.com/chronotope/chrono/issues/102 - // Handle -0000 as +0000 - if abc.ends_with("-0000") { - foo = format!("{}+0000", &abc[..abc.len() - 5]); - } - - let date = DateTime::parse_from_rfc2822(&foo); - - match date { - Ok(bar) => bar.timestamp() as i32, - Err(baz) => { - // info!("{}", foo); - error!("Error while trying to parse \"{}\" as date.", foo); - error!("Error: {}", baz); - debug!("Falling back to default 0"); - 0 - } - } - } - _ => 0, + let epoch = match date { + Ok(foo) => foo.timestamp() as i32, + Err(_) => 0, }; let length = item.enclosure() @@ -245,7 +173,10 @@ mod tests { assert_eq!(i.description, Some(descr)); assert_eq!(i.length, Some(66738886)); assert_eq!(i.guid, Some("7df4070a-9832-11e7-adac-cb37b05d5e24")); - assert_eq!(i.published_date, Some("Wed, 13 Sep 2017 10:00:00 -0000")); + assert_eq!( + i.published_date, + Some("Wed, 13 Sep 2017 10:00:00 +0000".to_string()) + ); assert_eq!(i.epoch, 1505296800); let second = channel.items().iter().nth(1).unwrap(); @@ -267,7 +198,10 @@ mod tests { assert_eq!(i2.description, Some(descr2)); assert_eq!(i2.length, Some(67527575)); assert_eq!(i2.guid, Some("7c207a24-e33f-11e6-9438-eb45dcf36a1d")); - assert_eq!(i2.published_date, Some("Wed, 09 Aug 2017 10:00:00 -0000")); + assert_eq!( + i2.published_date, + Some("Wed, 9 Aug 2017 10:00:00 +0000".to_string()) + ); assert_eq!(i2.epoch, 1502272800); } @@ -300,7 +234,10 @@ mod tests { the-breakthrough-hopelessness-exploitation-homes-for-mentally-ill#134472", ) ); - assert_eq!(i.published_date, Some("Fri, 08 Sep 2017 12:00:00 +0000")); + assert_eq!( + i.published_date, + Some("Fri, 8 Sep 2017 12:00:00 +0000".to_string()) + ); assert_eq!(i.epoch, 1504872000); let second = channel.items().iter().nth(1).unwrap(); @@ -330,7 +267,10 @@ mod tests { org/podcast/the-breakthrough-hillary-clinton-failed-presidential-bid#133721", ) ); - assert_eq!(i2.published_date, Some("Fri, 25 Aug 2017 12:00:00 +0000")); + assert_eq!( + i2.published_date, + Some("Fri, 25 Aug 2017 12:00:00 +0000".to_string()) + ); assert_eq!(i2.epoch, 1503662400); } @@ -354,7 +294,10 @@ mod tests { assert_eq!(i.description, Some(descr)); assert_eq!(i.length, Some(46479789)); assert_eq!(i.guid, Some("78A682B4-73E8-47B8-88C0-1BE62DD4EF9D")); - assert_eq!(i.published_date, Some("Tue, 12 Sep 2017 22:24:42 -0700")); + assert_eq!( + i.published_date, + Some("Tue, 12 Sep 2017 22:24:42 -0700".to_string()) + ); assert_eq!(i.epoch, 1505280282); let second = channel.items().iter().nth(1).unwrap(); @@ -375,7 +318,10 @@ mod tests { assert_eq!(i2.description, Some(descr2)); assert_eq!(i2.length, Some(36544272)); assert_eq!(i2.guid, Some("1CE57548-B36C-4F14-832A-5D5E0A24E35B")); - assert_eq!(i2.published_date, Some("Tue, 05 Sep 2017 20:57:27 -0700")); + assert_eq!( + i2.published_date, + Some("Tue, 5 Sep 2017 20:57:27 -0700".to_string()) + ); assert_eq!(i2.epoch, 1504670247); } @@ -404,7 +350,10 @@ mod tests { i.guid, Some("https://request-for-explanation.github.io/podcast/ep9-a-once-in-a-lifetime-rfc/",) ); - assert_eq!(i.published_date, Some("Mon, 28 Aug 2017 15:00:00 PDT")); + assert_eq!( + i.published_date, + Some("Mon, 28 Aug 2017 15:00:00 -0700".to_string()) + ); assert_eq!(i.epoch, 1503957600); let second = channel.items().iter().nth(8).unwrap(); @@ -427,7 +376,10 @@ mod tests { i2.guid, Some("https://request-for-explanation.github.io/podcast/ep8-an-existential-crisis/",) ); - assert_eq!(i2.published_date, Some("Tue, 15 Aug 2017 17:00:00 PDT")); + assert_eq!( + i2.published_date, + Some("Tue, 15 Aug 2017 17:00:00 -0700".to_string()) + ); assert_eq!(i2.epoch, 1502841600); } } diff --git a/src/index_feed.rs b/src/index_feed.rs index 61db6b9..30ee40c 100644 --- a/src/index_feed.rs +++ b/src/index_feed.rs @@ -60,10 +60,12 @@ fn index_podcast(con: &SqliteConnection, pd: &NewPodcast) -> Result<()> { fn index_episode(con: &SqliteConnection, ep: &NewEpisode) -> Result<()> { match dbqueries::load_episode(con, &ep.uri.unwrap()) { - Ok(mut foo) => if foo.title() != ep.title || foo.published_date() != ep.published_date { + Ok(mut foo) => if foo.title() != ep.title + || foo.published_date() != ep.published_date.as_ref().map(|x| x.as_str()) + { foo.set_title(ep.title); foo.set_description(ep.description); - foo.set_published_date(ep.published_date); + foo.set_published_date(ep.published_date.clone()); foo.set_guid(ep.guid); foo.set_length(ep.length); foo.set_epoch(ep.epoch); diff --git a/src/lib.rs b/src/lib.rs index e4a554f..70139d3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,7 @@ extern crate hyper; extern crate rayon; extern crate regex; extern crate reqwest; +extern crate rfc822_sanitizer; extern crate rss; extern crate time; extern crate xdg; diff --git a/src/models.rs b/src/models.rs index 5b21f28..5522c8d 100644 --- a/src/models.rs +++ b/src/models.rs @@ -65,8 +65,10 @@ impl Episode { self.published_date.as_ref().map(|s| s.as_str()) } - pub fn set_published_date(&mut self, value: Option<&str>) { - self.published_date = value.map(|x| x.to_string()); + // FIXME: make the setter accept &str again + pub fn set_published_date(&mut self, value: Option) { + // self.published_date = value.map(|x| x.to_string()); + self.published_date = value; } pub fn guid(&self) -> Option<&str> { @@ -226,7 +228,8 @@ pub struct NewEpisode<'a> { pub uri: Option<&'a str>, pub local_uri: Option<&'a str>, pub description: Option<&'a str>, - pub published_date: Option<&'a str>, + // FIXME: make it &str again + pub published_date: Option, pub length: Option, pub guid: Option<&'a str>, pub epoch: i32,