h-data: Initial implementation of an OPML parser and importer.

This is not really compliant with the OPML spec, and sadly there
does not seem to be an OPML crate. There are edge cases
that are not handled; they will only be addressed if a problem is reported.
This commit is contained in:
Jordan Petridis 2018-05-10 18:17:19 +03:00
parent f06dbd0562
commit f9b34bbd50
No known key found for this signature in database
GPG Key ID: CEABAD9F5683B9A6
5 changed files with 173 additions and 0 deletions

11
Cargo.lock generated
View File

@ -685,6 +685,7 @@ dependencies = [
"hyper-tls 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"maplit 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"native-tls 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"pretty_assertions 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -699,6 +700,7 @@ dependencies = [
"tokio-core 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)",
"url 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"xdg 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -2141,6 +2143,14 @@ name = "xdg"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "xml-rs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum adler32 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6cbd0b9af8587c72beadc9f72d35b9fbb070982c9e6203e46e93f10df25f8f45"
"checksum aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "d6531d44de723825aa81398a6415283229725a00fa30713812ab9323faa82fc4"
@ -2377,3 +2387,4 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
"checksum ws2_32-sys 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e"
"checksum xdg 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a66b7c2281ebde13cf4391d70d4c7e5946c3c25e72a7b859ca8f677dcd0b0c61"
"checksum xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3c1cb601d29fe2c2ac60a2b2e5e293994d87a1f6fa9687a31a15270f909be9c2"

View File

@ -16,6 +16,7 @@ rfc822_sanitizer = "0.3.3"
rss = "1.5.0"
url = "1.7.0"
xdg = "2.1.0"
xml-rs = "0.7.0"
futures = "0.1.21"
hyper = "0.11.25"
tokio-core = "0.1.17"
@ -43,6 +44,7 @@ rand = "0.4.2"
tempdir = "0.3.7"
criterion = "0.2.3"
pretty_assertions = "0.5.1"
maplit = "1.0.1"
[[bench]]
name = "bench"

View File

@ -5,6 +5,7 @@ use hyper;
use native_tls;
use rss;
use url;
use xml;
use std::io;
@ -49,6 +50,8 @@ pub enum DataError {
IOError(#[cause] io::Error),
#[fail(display = "RSS Error: {}", _0)]
RssError(#[cause] rss::Error),
#[fail(display = "XML Reader Error: {}", _0)]
XmlReaderError(#[cause] xml::reader::Error),
#[fail(display = "Error: {}", _0)]
Bail(String),
#[fail(display = "{}", _0)]
@ -115,6 +118,12 @@ impl From<rss::Error> for DataError {
}
}
impl From<xml::reader::Error> for DataError {
fn from(err: xml::reader::Error) -> Self {
DataError::XmlReaderError(err)
}
}
impl From<String> for DataError {
fn from(err: String) -> Self {
DataError::Bail(err)

View File

@ -29,6 +29,10 @@
#[macro_use]
extern crate pretty_assertions;
#[cfg(test)]
#[macro_use]
extern crate maplit;
#[macro_use]
extern crate derive_builder;
#[macro_use]
@ -58,6 +62,8 @@ extern crate rss;
extern crate tokio_core;
extern crate url;
extern crate xdg;
#[allow(unused)]
extern crate xml;
pub mod database;
#[allow(missing_docs)]
@ -66,6 +72,7 @@ pub mod dbqueries;
pub mod errors;
mod feed;
pub(crate) mod models;
pub mod opml;
mod parser;
pub mod pipeline;
mod schema;

144
hammond-data/src/opml.rs Normal file
View File

@ -0,0 +1,144 @@
//! FIXME: Docs
// #![allow(unused)]
use errors::DataError;
use models::Source;
use xml::reader;
use std::collections::HashSet;
use std::io::Read;
// use std::fs::{File, OpenOptions};
// use std::io::BufReader;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
// FIXME: Make it a Diesel model
/// Represents an `outline` xml element as per the `OPML` [specification][spec].
/// Attributes that are not `RSS` related are omitted.
///
/// [spec]: http://dev.opml.org/spec2.html
pub struct Opml {
    // Value of the outline's `title` attribute; empty when the attribute is absent.
    title: String,
    // Value of the outline's `description` attribute; empty when the attribute is absent.
    description: String,
    // Value of the outline's `xmlUrl` attribute, i.e. the feed url.
    url: String,
}
/// Imports the feed urls found in the OPML document read from `reader`.
///
/// Every extracted outline's url is handed to `Source::from_url`; the outcome of
/// each of those calls is returned as its own entry in the resulting `Vec`.
///
/// # Errors
///
/// Returns a `DataError` if the document could not be parsed as XML.
pub fn opml_import<R: Read>(reader: R) -> Result<Vec<Result<Source, DataError>>, DataError> {
    let outlines = extract_sources(reader)?;

    let mut imported = Vec::with_capacity(outlines.len());
    for outline in &outlines {
        imported.push(Source::from_url(&outline.url));
    }

    Ok(imported)
}
/// Extracts the feed `outline` elements from a reader `R` and returns a `HashSet` of `Opml` structs.
///
/// Only the `title`, `description` and `xmlUrl` attributes of each `outline` are kept;
/// a missing `title` or `description` is stored as an empty string. Outlines that carry
/// no `xmlUrl` (e.g. folders/categories that merely group other outlines) do not describe
/// a feed and are skipped. Identical outlines are de-duplicated by the returned set.
///
/// # Errors
///
/// Returns the first `reader::Error` reported by the XML parser; parsing stops there.
pub fn extract_sources<R: Read>(reader: R) -> Result<HashSet<Opml>, reader::Error> {
    let mut list = HashSet::new();
    let parser = reader::EventReader::new(reader);

    for event in parser {
        // Propagate malformed XML immediately instead of collecting throwaway unit values.
        if let reader::XmlEvent::StartElement {
            name, attributes, ..
        } = event?
        {
            if name.local_name != "outline" {
                continue;
            }

            let mut title = String::new();
            let mut url = String::new();
            let mut description = String::new();

            for attribute in attributes {
                match attribute.name.local_name.as_str() {
                    "title" => title = attribute.value,
                    "xmlUrl" => url = attribute.value,
                    "description" => description = attribute.value,
                    _ => {}
                }
            }

            // Without a feed url there is nothing to import; this also keeps
            // grouping-only outlines from turning into bogus empty-url entries.
            if url.is_empty() {
                continue;
            }

            list.insert(Opml {
                title,
                description,
                url,
            });
        }
    }

    Ok(list)
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Local;

    /// Builds a small OPML document with two `rss` outlines and checks that
    /// `extract_sources` returns exactly those two entries.
    #[test]
    fn test_extract() {
        // First fixture feed.
        let intercepted_title = String::from("Intercepted with Jeremy Scahill");
        let intercepted_url =
            String::from("https://feeds.feedburner.com/InterceptedWithJeremyScahill");
        let intercepted_desc =
            String::from(
                "The people behind The Intercepts fearless reporting and incisive \
                 commentaryJeremy Scahill, Glenn Greenwald, Betsy Reed and othersdiscuss the \
                 crucial issues of our time: national security, civil liberties, foreign policy, \
                 and criminal justice. Plus interviews with artists, thinkers, and newsmakers \
                 who challenge our preconceptions about the world we live in.",
            );

        // Second fixture feed.
        let deconstructed_title = String::from("Deconstructed with Mehdi Hasan");
        let deconstructed_url =
            String::from("https://rss.prod.firstlook.media/deconstructed/podcast.rss");
        let deconstructed_desc = String::from(
            "Journalist Mehdi Hasan is known around the world for his televised takedowns of \
             presidents and prime ministers. In this new podcast from The Intercept, Mehdi \
             unpacks a game-changing news event of the week while challenging the conventional \
             wisdom. As a Brit, a Muslim and an immigrant based in Donald Trump's Washington \
             D.C., Mehdi gives a refreshingly provocative perspective on the ups and downs of \
             Americanand globalpolitics.",
        );

        // Timestamp placed in the document's `dateCreated` head element.
        let date_created = Local::now().format("%a, %d %b %Y %T %Z");

        #[cfg_attr(rustfmt, rustfmt_skip)]
        let document = format!(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?> \
             <opml version=\"2.0\"> \
             <head> \
             <title>Test OPML File</title> \
             <dateCreated>{}</dateCreated> \
             <docs>http://www.opml.org/spec2</docs> \
             </head> \
             <body> \
             <outline type=\"rss\" title=\"{}\" description=\"{}\" xmlUrl=\"{}\"/> \
             <outline type=\"rss\" title=\"{}\" description=\"{}\" xmlUrl=\"{}\"/> \
             </body> \
             </opml>",
            date_created,
            intercepted_title,
            intercepted_desc,
            intercepted_url,
            deconstructed_title,
            deconstructed_desc,
            deconstructed_url,
        );

        // The fixture strings are moved into the expected set only after the
        // document has been rendered from them.
        let expected = hashset![
            Opml {
                title: intercepted_title,
                description: intercepted_desc,
                url: intercepted_url
            },
            Opml {
                title: deconstructed_title,
                description: deconstructed_desc,
                url: deconstructed_url
            },
        ];

        let extracted = extract_sources(document.as_bytes()).unwrap();
        assert_eq!(extracted, expected);
    }
}