From f9b34bbd5041d8f16832c9b232dac0c54620f9f2 Mon Sep 17 00:00:00 2001 From: Jordan Petridis Date: Thu, 10 May 2018 18:17:19 +0300 Subject: [PATCH] h-data: Initial implementation of an OPML parser and importer. This is not really compliant with the OPML spec and there does not seem to be an OPML crate sadly. There are edge-cases that are not handled but will only be addressed if a problem is reported. --- Cargo.lock | 11 +++ hammond-data/Cargo.toml | 2 + hammond-data/src/errors.rs | 9 +++ hammond-data/src/lib.rs | 7 ++ hammond-data/src/opml.rs | 144 +++++++++++++++++++++++++++++++++++++ 5 files changed, 173 insertions(+) create mode 100644 hammond-data/src/opml.rs diff --git a/Cargo.lock b/Cargo.lock index efb816d..4b8cd7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -685,6 +685,7 @@ dependencies = [ "hyper-tls 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", + "maplit 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "native-tls 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "pretty_assertions 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -699,6 +700,7 @@ dependencies = [ "tokio-core 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)", "url 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)", "xdg 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -2141,6 +2143,14 @@ name = "xdg" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "xml-rs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.1 
(registry+https://github.com/rust-lang/crates.io-index)", +] + [metadata] "checksum adler32 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6cbd0b9af8587c72beadc9f72d35b9fbb070982c9e6203e46e93f10df25f8f45" "checksum aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "d6531d44de723825aa81398a6415283229725a00fa30713812ab9323faa82fc4" @@ -2377,3 +2387,4 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" "checksum ws2_32-sys 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" "checksum xdg 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a66b7c2281ebde13cf4391d70d4c7e5946c3c25e72a7b859ca8f677dcd0b0c61" +"checksum xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3c1cb601d29fe2c2ac60a2b2e5e293994d87a1f6fa9687a31a15270f909be9c2" diff --git a/hammond-data/Cargo.toml b/hammond-data/Cargo.toml index 665351c..b8779df 100644 --- a/hammond-data/Cargo.toml +++ b/hammond-data/Cargo.toml @@ -16,6 +16,7 @@ rfc822_sanitizer = "0.3.3" rss = "1.5.0" url = "1.7.0" xdg = "2.1.0" +xml-rs = "0.7.0" futures = "0.1.21" hyper = "0.11.25" tokio-core = "0.1.17" @@ -43,6 +44,7 @@ rand = "0.4.2" tempdir = "0.3.7" criterion = "0.2.3" pretty_assertions = "0.5.1" +maplit = "1.0.1" [[bench]] name = "bench" diff --git a/hammond-data/src/errors.rs b/hammond-data/src/errors.rs index 6a0dd21..c2ea15f 100644 --- a/hammond-data/src/errors.rs +++ b/hammond-data/src/errors.rs @@ -5,6 +5,7 @@ use hyper; use native_tls; use rss; use url; +use xml; use std::io; @@ -49,6 +50,8 @@ pub enum DataError { IOError(#[cause] io::Error), #[fail(display = "RSS Error: {}", _0)] RssError(#[cause] rss::Error), + #[fail(display = "XML Reader Error: {}", _0)] + 
XmlReaderError(#[cause] xml::reader::Error), #[fail(display = "Error: {}", _0)] Bail(String), #[fail(display = "{}", _0)] @@ -115,6 +118,12 @@ impl From for DataError { } } +impl From for DataError { + fn from(err: xml::reader::Error) -> Self { + DataError::XmlReaderError(err) + } +} + impl From for DataError { fn from(err: String) -> Self { DataError::Bail(err) diff --git a/hammond-data/src/lib.rs b/hammond-data/src/lib.rs index 3c087b0..13847d5 100644 --- a/hammond-data/src/lib.rs +++ b/hammond-data/src/lib.rs @@ -29,6 +29,10 @@ #[macro_use] extern crate pretty_assertions; +#[cfg(test)] +#[macro_use] +extern crate maplit; + #[macro_use] extern crate derive_builder; #[macro_use] @@ -58,6 +62,8 @@ extern crate rss; extern crate tokio_core; extern crate url; extern crate xdg; +#[allow(unused)] +extern crate xml; pub mod database; #[allow(missing_docs)] @@ -66,6 +72,7 @@ pub mod dbqueries; pub mod errors; mod feed; pub(crate) mod models; +pub mod opml; mod parser; pub mod pipeline; mod schema; diff --git a/hammond-data/src/opml.rs b/hammond-data/src/opml.rs new file mode 100644 index 0000000..9da756a --- /dev/null +++ b/hammond-data/src/opml.rs @@ -0,0 +1,144 @@ +//! FIXME: Docs + +// #![allow(unused)] + +use errors::DataError; +use models::Source; +use xml::reader; + +use std::collections::HashSet; +use std::io::Read; + +// use std::fs::{File, OpenOptions}; +// use std::io::BufReader; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +// FIXME: Make it a Diesel model +/// Represents an `outline` XML element as per the `OPML` [specification][spec]; +/// sub-elements not related to `RSS` are omitted. +/// +/// [spec]: http://dev.opml.org/spec2.html +pub struct Opml { + title: String, + description: String, + url: String, +} + +/// Import feed URLs from a reader `R` into the `Source` table. 
+pub fn opml_import<R: Read>(reader: R) -> Result<Vec<Result<Source, DataError>>, DataError> {
+    let feeds = extract_sources(reader)?;
+    Ok(feeds
+        .iter()
+        .map(|opml| Source::from_url(&opml.url)) // one `Result` per extracted feed
+        .collect())
+}
+
+/// Extracts the `outline` elements from a reader `R` and returns a `HashSet` of `Opml` structs.
+pub fn extract_sources<R: Read>(reader: R) -> Result<HashSet<Opml>, reader::Error> {
+    let mut list = HashSet::new();
+    let parser = reader::EventReader::new(reader);
+
+    parser
+        .into_iter()
+        .map(|e| match e {
+            Ok(reader::XmlEvent::StartElement {
+                name, attributes, ..
+            }) => {
+                if name.local_name == "outline" {
+                    let mut title = String::new(); // stays empty when the attribute is absent
+                    let mut url = String::new();
+                    let mut description = String::new();
+
+                    attributes.into_iter().for_each(|attribute| {
+                        match attribute.name.local_name.as_str() {
+                            "title" => title = attribute.value,
+                            "xmlUrl" => url = attribute.value,
+                            "description" => description = attribute.value,
+                            _ => {} // any other attribute is not stored
+                        }
+                    });
+
+                    let feed = Opml {
+                        title,
+                        description,
+                        url,
+                    };
+                    list.insert(feed); // identical outlines collapse into one set entry
+                }
+                Ok(())
+            }
+            Err(err) => Err(err), // the first reader error short-circuits the `collect` below
+            _ => Ok(()),          // every other XML event is skipped
+        })
+        .collect::<Result<Vec<_>, reader::Error>>()?;
+
+    Ok(list)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::Local;
+
+    #[test]
+    fn test_extract() {
+        let int_title = String::from("Intercepted with Jeremy Scahill");
+        let int_url = String::from("https://feeds.feedburner.com/InterceptedWithJeremyScahill");
+        let int_desc =
+            String::from(
+                "The people behind The Intercept’s fearless reporting and incisive \
+                 commentary—Jeremy Scahill, Glenn Greenwald, Betsy Reed and others—discuss the \
+                 crucial issues of our time: national security, civil liberties, foreign policy, \
+                 and criminal justice. Plus interviews with artists, thinkers, and newsmakers \
+                 who challenge our preconceptions about the world we live in.",
+            );
+
+        let dec_title = String::from("Deconstructed with Mehdi Hasan");
+        let dec_url = String::from("https://rss.prod.firstlook.media/deconstructed/podcast.rss");
+        let dec_desc = String::from(
+            "Journalist Mehdi Hasan is known around the world for his televised takedowns of \
+             presidents and prime ministers. In this new podcast from The Intercept, Mehdi \
+             unpacks a game-changing news event of the week while challenging the conventional \
+             wisdom. As a Brit, a Muslim and an immigrant based in Donald Trump's Washington \
+             D.C., Mehdi gives a refreshingly provocative perspective on the ups and downs of \
+             American—and global—politics.",
+        );
+
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let sample1 = format!(
+            "<?xml version=\"1.0\" encoding=\"UTF-8\"?> \
+             <opml version=\"2.0\"> \
+               <head> \
+                 <title>Test OPML File</title> \
+                 <dateCreated>{}</dateCreated> \
+                 <docs>http://www.opml.org/spec2</docs> \
+               </head> \
+               <body> \
+                 <outline type=\"rss\" title=\"{}\" description=\"{}\" xmlUrl=\"{}\"/> \
+                 <outline type=\"rss\" title=\"{}\" description=\"{}\" xmlUrl=\"{}\"/> \
+               </body> \
+             </opml>",
+            Local::now().format("%a, %d %b %Y %T %Z"),
+            int_title,
+            int_desc,
+            int_url,
+            dec_title,
+            dec_desc,
+            dec_url,
+        );
+
+        let map = hashset![
+            Opml {
+                title: int_title,
+                description: int_desc,
+                url: int_url
+            },
+            Opml {
+                title: dec_title,
+                description: dec_desc,
+                url: dec_url
+            },
+        ];
+        assert_eq!(extract_sources(sample1.as_bytes()).unwrap(), map);
+    }
+}