Add support for XPath selectors
parent
b8e24a248e
commit
fd0ef3caa9
|
@ -13,4 +13,5 @@ url = "2.1"
|
||||||
http = "0.2"
|
http = "0.2"
|
||||||
nipper = "0.1.8"
|
nipper = "0.1.8"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
serde = "1.0"
|
serde = "1.0"
|
||||||
|
cssifier = "0.1.2"
|
|
@ -3,21 +3,39 @@ pub trait SelectorBase {
|
||||||
|
|
||||||
fn html(&self) -> String;
|
fn html(&self) -> String;
|
||||||
|
|
||||||
fn css<T: SelectorBase>(&self, css_selector: &'static str) -> Vec<T> {
|
// fn css<'a, T: SelectorBase>(&self, css_selector: &'a str) -> Vec<T> {
|
||||||
|
fn css<'a>(&self, css_selector: &'a str) -> Vec<Selector> {
|
||||||
let html = nipper::Document::from(self.html().as_str());
|
let html = nipper::Document::from(self.html().as_str());
|
||||||
|
|
||||||
let mut output = vec![];
|
let mut output = vec![];
|
||||||
|
|
||||||
for item in html.select(css_selector).iter() {
|
for item in html.select(css_selector).iter() {
|
||||||
output.push(T::from_html(item.html().to_string()))
|
output.push(Selector::from_html(item.html().to_string()))
|
||||||
}
|
}
|
||||||
output
|
output
|
||||||
}
|
}
|
||||||
|
|
||||||
fn css_once<T: SelectorBase>(&self, css_selector: &'static str) -> Option<T> {
|
fn css_once<'a>(&self, css_selector: &'a str) -> Option<Selector> {
|
||||||
self.css(css_selector).pop()
|
self.css(css_selector).pop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn xpath(&self, xpath: &'static str) -> Vec<Selector> {
|
||||||
|
match cssifier::cssifier(xpath) {
|
||||||
|
Some(css_selector) => {
|
||||||
|
if css_selector == "" {
|
||||||
|
Vec::default()
|
||||||
|
} else {
|
||||||
|
self.css(css_selector.as_str())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => Vec::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn xpath_once(&self, xpath: &'static str) -> Option<Selector> {
|
||||||
|
self.xpath(xpath).pop()
|
||||||
|
}
|
||||||
|
|
||||||
fn content(&self) -> String {
|
fn content(&self) -> String {
|
||||||
let html = nipper::Document::from(self.html().as_str());
|
let html = nipper::Document::from(self.html().as_str());
|
||||||
html.select("body > *")
|
html.select("body > *")
|
||||||
|
|
|
@ -3,6 +3,7 @@ use http::StatusCode;
|
||||||
use raspa::request::{Request, RequestBase};
|
use raspa::request::{Request, RequestBase};
|
||||||
use raspa::selector::{Selector, SelectorBase};
|
use raspa::selector::{Selector, SelectorBase};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
// use std::collections::HashMap;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn plain_text_selector() {
|
fn plain_text_selector() {
|
||||||
|
@ -17,12 +18,9 @@ fn plain_text_selector() {
|
||||||
"
|
"
|
||||||
.to_string();
|
.to_string();
|
||||||
let sel = Selector::from_html(html);
|
let sel = Selector::from_html(html);
|
||||||
assert_eq!(sel.css::<Selector>("h1")[0].html(), "<h1>hello world</h1>");
|
assert_eq!(sel.css("h1")[0].html(), "<h1>hello world</h1>");
|
||||||
assert_eq!(sel.css::<Selector>("#text")[0].content(), "good bye");
|
assert_eq!(sel.css("#text")[0].content(), "good bye");
|
||||||
assert_eq!(
|
assert_eq!(sel.css_once("body > a").unwrap().content(), "simple text");
|
||||||
sel.css_once::<Selector>("body > a").unwrap().content(),
|
|
||||||
"simple text"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -30,7 +28,7 @@ fn simple_request() {
|
||||||
let req: Request = RequestBase::new("https://httpbin.org/").unwrap();
|
let req: Request = RequestBase::new("https://httpbin.org/").unwrap();
|
||||||
let resp = req.launch();
|
let resp = req.launch();
|
||||||
assert_eq!(resp.status_code, StatusCode::OK);
|
assert_eq!(resp.status_code, StatusCode::OK);
|
||||||
assert!(resp.css::<Selector>("h2")[0].html().contains("httpbin.org"));
|
assert!(resp.css("h2")[0].html().contains("httpbin.org"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -55,29 +53,57 @@ fn complex_selectors() {
|
||||||
"
|
"
|
||||||
.to_string();
|
.to_string();
|
||||||
let sel = Selector::from_html(html);
|
let sel = Selector::from_html(html);
|
||||||
assert_eq!(
|
assert_eq!(sel.css_once("p").unwrap().attr("id").unwrap(), "text");
|
||||||
sel.css_once::<Selector>("p").unwrap().attr("id").unwrap(),
|
assert_eq!(sel.css("a")[0].attr("href").unwrap(), "http://google.com");
|
||||||
"text"
|
for node in sel.css("ul li").iter() {
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
sel.css::<Selector>("a")[0].attr("href").unwrap(),
|
|
||||||
"http://google.com"
|
|
||||||
);
|
|
||||||
for node in sel.css::<Selector>("ul li").iter() {
|
|
||||||
let text = node.content();
|
let text = node.content();
|
||||||
assert_eq!(node.attr("class").unwrap(), "item");
|
assert_eq!(node.attr("class").unwrap(), "item");
|
||||||
assert!(node.attr("id").unwrap().contains(&text));
|
assert!(node.attr("id").unwrap().contains(&text));
|
||||||
}
|
}
|
||||||
|
|
||||||
let div = sel.css_once::<Selector>("div").unwrap();
|
let div = sel.css_once("div").unwrap();
|
||||||
for node in div.css::<Selector>("a").iter() {
|
for node in div.css("a").iter() {
|
||||||
if node.attr("href").unwrap() == "#" {
|
if node.attr("href").unwrap() == "#" {
|
||||||
assert_eq!(node.content(), "non link");
|
assert_eq!(node.content(), "non link");
|
||||||
} else {
|
} else {
|
||||||
assert_eq!(node.content(), "link");
|
assert_eq!(node.content(), "link");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert!(sel.css_once::<Selector>("h1").is_none());
|
assert!(sel.css_once("h1").is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn xpath_test() {
|
||||||
|
let html = "
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<p id='text'>good bye</p>
|
||||||
|
<a href='http://google.com'>simple text</a>
|
||||||
|
<div class='container'>
|
||||||
|
<a href='#'>first text</a>
|
||||||
|
<a href='http://localhost'>link</a>
|
||||||
|
<a href='#'>non link</a>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"
|
||||||
|
.to_string();
|
||||||
|
let sel = Selector::from_html(html);
|
||||||
|
assert_eq!(
|
||||||
|
sel.xpath_once("//div/a[1]").unwrap().content(),
|
||||||
|
"first text"
|
||||||
|
);
|
||||||
|
assert_eq!(sel.xpath("//*[@id='text']")[0].content(), "good bye");
|
||||||
|
assert_eq!(
|
||||||
|
sel.xpath("//a[contains(@href, 'localhost')]")[0].content(),
|
||||||
|
"link"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
sel.xpath_once("//div[@class='container']/a[3]")
|
||||||
|
.unwrap()
|
||||||
|
.content(),
|
||||||
|
"non link"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -90,8 +116,22 @@ fn simple_json_test() {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_post_request() {
|
fn simple_post_request() {
|
||||||
let mut req = Request::new("https://httpbin.org/post").unwrap();
|
let resp: Value = Request::new("https://httpbin.org/post")
|
||||||
req.method(Method::POST);
|
.unwrap()
|
||||||
let resp: Value = req.launch().to_json().expect("cannot parse json");
|
.method(Method::POST)
|
||||||
|
.launch()
|
||||||
|
.to_json()
|
||||||
|
.expect("cannot parse json");
|
||||||
assert_eq!(resp["url"].as_str().unwrap(), "https://httpbin.org/post");
|
assert_eq!(resp["url"].as_str().unwrap(), "https://httpbin.org/post");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// #[test]
|
||||||
|
// fn complex_post_request() {
|
||||||
|
// let form = HashMap::new();
|
||||||
|
// let attrs = HashMap::new();
|
||||||
|
//
|
||||||
|
// let resp = Request::new("https://httpbin.org/post")
|
||||||
|
// .unwrap()
|
||||||
|
// .method(Method::POST)
|
||||||
|
// .add_attrs()
|
||||||
|
// }
|
||||||
|
|
Loading…
Reference in New Issue