librs_gemini/src/scraper.rs

219 lines
6.8 KiB
Rust

use crate::category::Category;
use crate::crate_info::CrateInfo;
use crate::search_item::SearchItem;
use crate::{consts, version::Version};
use cached::proc_macro::cached;
use htmd::HtmlToMarkdown;
use raspa::{
request::{Request, RequestBase},
selector::SelectorBase,
};
/// Fetches the page at [`consts::HOME_URL`] and extracts the crate counter
/// together with the category listing.
///
/// The result is memoised through `#[cached(time = 62400)]` (the `cached`
/// crate interprets `time` as a lifetime in seconds).
///
/// # Returns
///
/// A tuple of:
/// * the number parsed from the third space-separated word of the first
///   `.inner-col p` element, after removing `.` and `,` characters;
/// * one [`Category`] per `ul.cat > li` element that contains an `h3`.
///
/// # Panics
///
/// Panics when the request cannot be built, or when any of the elements,
/// attributes or numbers the parser expects is missing or malformed.
#[cached(time = 62400)]
pub fn get_home_page() -> (u32, Vec<Category>) {
    let page = Request::new(consts::HOME_URL).unwrap().launch();

    // The counter is the third word of the intro paragraph; separator
    // characters ('.' and ',') are dropped before the number is parsed.
    let intro = page.css_once(".inner-col p").unwrap().content();
    let digits: String = intro
        .split(' ')
        .nth(2)
        .unwrap()
        .chars()
        .filter(|&ch| ch != '.' && ch != ',')
        .collect();
    let total_crates_indexed: u32 = digits.parse().unwrap();

    let categories: Vec<Category> = page
        .css("ul.cat > li")
        .iter()
        .filter_map(|item| {
            // List items without a heading are skipped.
            let heading = item.css_once("h3")?;
            let name = heading.content();

            // The slug is the link target without its leading '/'.
            let anchor = item.css_once("a").unwrap();
            let slug = anchor
                .attr("href")
                .unwrap()
                .strip_prefix('/')
                .unwrap()
                .to_string();

            let description = item.css_once("span.desc").unwrap().content();

            let mut crates: Vec<String> = item
                .css("ul.crates > li")
                .iter()
                .map(|entry| entry.css_once("a").unwrap().content())
                .collect();

            // The last entry is not kept as a crate name: its first word is
            // parsed as the count of additional crates (presumably an
            // "N more" link -- confirm against the live markup).
            let trailing = crates.pop().unwrap();
            let more_count = trailing.split(' ').next().unwrap().parse().unwrap();

            Some(Category {
                name,
                slug,
                description,
                crates,
                more_count,
            })
        })
        .collect();

    (total_crates_indexed, categories)
}
/// Queries [`consts::CRITERIA_SEARCH_URL`] with `criteria` as the `q`
/// parameter and parses the result list.
///
/// The result is memoised through `#[cached(time = 3600)]` (the `cached`
/// crate interprets `time` as a lifetime in seconds).
///
/// # Returns
///
/// * `None` when the response contains a `.notfound` element.
/// * `Some(items)` otherwise, with one [`SearchItem`] per
///   `.inner-col ol li` element, built from its `h4` text, trimmed `.desc`
///   text, `.version` text and the text of every `.k` element.
///
/// # Panics
///
/// Panics when the request cannot be built or when a result entry lacks an
/// `h4`, `.desc` or `.version` element.
#[cached(time = 3600)]
pub fn search_by_criteria(criteria: String) -> Option<Vec<SearchItem>> {
    let page = Request::new(consts::CRITERIA_SEARCH_URL)
        .unwrap()
        .add_params(vec![("q", criteria)])
        .launch();

    // A `.notfound` marker means there is nothing to parse.
    if page.css_once(".notfound").is_some() {
        return None;
    }

    let mut hits = Vec::new();
    for entry in page.css(".inner-col ol li").iter() {
        let name = entry.css_once("h4").unwrap().content();
        let raw_description = entry.css_once(".desc").unwrap().content();
        let description = raw_description.trim().into();
        let version = entry.css_once(".version").unwrap().content();
        let tags: Vec<String> = entry.css(".k").iter().map(|tag| tag.content()).collect();
        hits.push(SearchItem::new(name, description, version, tags));
    }
    Some(hits)
}
/// Fetches `<CRATE_INFO_URL>/<crate_name>` and scrapes the crate's details.
///
/// The result is memoised through `#[cached(time = 3600)]` (the `cached`
/// crate interprets `time` as a lifetime in seconds).
///
/// # Returns
///
/// * `None` when the response contains a `.notfound` element.
/// * `Some(CrateInfo)` otherwise, holding the description, owner, version
///   list (first entry duplicated as `latest_version`), the README converted
///   to gemtext when a `.readme` element exists, the licence when present,
///   the repository / API reference / home page links taken from the header
///   navigation, and the dependency names.
///
/// # Panics
///
/// Panics when the request cannot be built, when a required element is
/// missing, when a version row has fewer text lines than the parser reads,
/// when no version row survives filtering, or when the README conversion
/// fails.
#[cached(time = 3600)]
pub fn search_crate(crate_name: String) -> Option<CrateInfo> {
    let page = Request::new(format!("{}/{}", consts::CRATE_INFO_URL, crate_name))
        .unwrap()
        .launch();

    // A `.notfound` marker means there is no crate page to parse.
    if page.css_once(".notfound").is_some() {
        return None;
    }

    // Description: trimmed, with newlines replaced by spaces.
    let raw_description = page.css_once(".desc").unwrap().content();
    let description = raw_description.trim().replace('\n', " ").into();

    // Owner: text of the `span` inside the first `a.owner` link.
    let owner_links = page.css("a.owner");
    let owner = owner_links
        .iter()
        .next()
        .unwrap()
        .css_once("span")
        .unwrap()
        .content();

    let versions: Vec<Version> = page
        .css("#versions tr")
        .iter()
        .filter_map(|row| {
            // Rows containing a `del` element are left out
            // (presumably yanked releases -- confirm against the markup).
            if row.css_once("del").is_some() {
                return None;
            }

            let parsed = if row.css_once(".new").is_some() {
                // Rows flagged `.new`: the version is the third line of the
                // link text, the date the third line of the text of the
                // first element matched by `*`.
                let link_text = row.css_once("a").unwrap().content();
                let number = link_text.lines().nth(2).unwrap().trim().to_string();
                let row_text = row.css_once("*").unwrap().content();
                let released = row_text.lines().nth(2).unwrap().trim().to_string();
                Version::new(number, released)
            } else {
                // Other rows: the version is the second text line, unless
                // that line contains "new", in which case the following line
                // is used. The date sits two lines after the version line.
                let row_text = row.css_once("*").unwrap().content();
                let mut lines = row_text.lines();
                let second = lines.nth(1).unwrap();
                let number = if second.contains("new") {
                    lines.next().unwrap()
                } else {
                    second
                };
                let number = number.to_string();
                let released = lines.nth(1).unwrap().to_string();
                Version::new(number, released)
            };
            Some(parsed)
        })
        .collect();

    // The first surviving row is taken as the latest version.
    let latest_version = versions.first().cloned().unwrap();

    // README: HTML -> Markdown (skipping headings level 1 and media tags),
    // then Markdown -> gemtext.
    let readme = page.css_once(".readme").map(|section| {
        let skipped = vec![
            "h1",
            "img",
            "picture",
            "figcaption",
            "source",
            "figure",
            "svg",
        ];
        let converter = HtmlToMarkdown::builder().skip_tags(skipped).build();
        let markdown = converter.convert(section.html().as_ref()).unwrap();
        md2gemtext::convert(&markdown)
    });

    let license = page
        .css_once("b[property='license']")
        .map(|node| node.content());

    // Header navigation links are classified by keywords in their markup;
    // a later matching entry overwrites an earlier one of the same kind.
    let mut git_repo = None;
    let mut api_reference = None;
    let mut website = None;
    for nav_item in page.css("header nav li") {
        let anchor = nav_item.css_once("a").unwrap();
        let target = anchor.attr("href");
        let markup = anchor.html();
        let slot = if markup.contains("Git") || markup.contains("Repository") {
            &mut git_repo
        } else if markup.contains("API reference") {
            &mut api_reference
        } else if markup.contains("Home") {
            &mut website
        } else {
            continue;
        };
        *slot = target;
    }

    let dependencies: Vec<String> = page
        .css("#deps li[property='requirements'] a[title*='1']")
        .iter()
        .map(|dep| dep.content())
        .collect();

    Some(CrateInfo {
        name: crate_name,
        description,
        owner,
        latest_version,
        versions,
        readme,
        license,
        git_repo,
        api_reference,
        website,
        dependencies,
    })
}