Add support to videos
parent
db797e2c34
commit
583690a445
|
@ -10,4 +10,5 @@ chrono = { version = "0.4.26", features = ["serde"] }
|
|||
diesel = { version = "2.1.0", features = ["sqlite", "chrono", "serde_json"] }
|
||||
dotenv = "0.15.0"
|
||||
serde = "1.0.164"
|
||||
serde_json = "1.0.99"
|
||||
serde_derive = "1.0.183"
|
||||
serde_json = "1.0.99"
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
[package]
|
||||
name = "juunil-crawler"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
raspa = { git = "https://git.kirbylife.dev/kirbylife/raspa" }
|
||||
juunil = { path = "../" }
|
||||
chrono = { version = "0.4.26", features = ["serde"] }
|
||||
dotenv = "0.15.0"
|
||||
serde = "1.0.183"
|
||||
serde_json = "1.0.104"
|
|
@ -0,0 +1,154 @@
|
|||
use chrono::{NaiveDate, NaiveDateTime};
|
||||
use dotenv::dotenv;
|
||||
use juunil::controllers::{images, posts, videos};
|
||||
use raspa::request::{Request, RequestBase};
|
||||
use raspa::selector::{Selector, SelectorBase};
|
||||
use serde_json::Value;
|
||||
|
||||
/// Base URL of the Twitter syndication timeline endpoint; `get_posts` appends the screen name.
const URL_BASE: &str = "https://syndication.twitter.com/srv/timeline-profile/screen-name/";
|
||||
/// `chrono` format string used to parse a tweet's `created_at` value.
const DATE_FMT: &str = "%a %b %d %H:%M:%S %z %Y";
|
||||
|
||||
/// Data extracted from a single tweet by `get_post_info`.
#[derive(Debug)]
|
||||
struct PostInfo {
|
||||
/// Parsed from the tweet's `conversation_id_str` field.
id: i64,
|
||||
/// The tweet's `full_text` with media short links removed and other short links expanded.
description: String,
|
||||
/// `media_url_https` of every `photo` media item.
images: Vec<String>,
|
||||
/// One URL per `video` media item, taken from its `video_info.variants` list.
videos: Vec<String>,
|
||||
/// Parsed from the tweet's `created_at` field.
datetime: NaiveDateTime,
|
||||
}
|
||||
|
||||
fn build_timestamp_from_str(raw_date: String) -> NaiveDateTime {
|
||||
NaiveDateTime::parse_from_str(&raw_date, DATE_FMT).unwrap()
|
||||
}
|
||||
|
||||
fn get_post_info(content: Value) -> PostInfo {
|
||||
let id = content["conversation_id_str"]
|
||||
.as_str()
|
||||
.unwrap()
|
||||
.parse::<i64>()
|
||||
.unwrap();
|
||||
|
||||
let mut description = content["full_text"].as_str().unwrap().to_string();
|
||||
let datetime = build_timestamp_from_str(content["created_at"].as_str().unwrap().to_string());
|
||||
let mut images: Vec<String> = vec![];
|
||||
let mut videos: Vec<String> = vec![];
|
||||
|
||||
let media_items = if content["extended_entities"]["media"].is_array() {
|
||||
content["extended_entities"]["media"].as_array().unwrap()
|
||||
} else {
|
||||
content["entities"]["media"].as_array().unwrap()
|
||||
};
|
||||
|
||||
for media in media_items {
|
||||
let min_url = media["url"].as_str().unwrap();
|
||||
description = description.replace(min_url, "");
|
||||
|
||||
let media_type = media["type"].as_str().unwrap();
|
||||
|
||||
match media_type {
|
||||
"photo" => images.push(media["media_url_https"].as_str().unwrap().to_string()),
|
||||
"video" => {
|
||||
let mut bitrate = 0;
|
||||
let mut index = 0;
|
||||
for (i, variant) in media["video_info"]["variants"]
|
||||
.as_array()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.enumerate()
|
||||
{
|
||||
if variant["bitrate"].is_number() {
|
||||
let temp_bitrate = variant["bitrate"].as_u64().unwrap();
|
||||
if temp_bitrate > bitrate {
|
||||
bitrate = temp_bitrate;
|
||||
index = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
videos.push(
|
||||
media["video_info"]["variants"][index]["url"]
|
||||
.as_str()
|
||||
.unwrap()
|
||||
.to_string(),
|
||||
);
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
}
|
||||
description = description.trim().to_string();
|
||||
|
||||
for url in content["entities"]["urls"].as_array().unwrap() {
|
||||
let min_url = url["url"].as_str().unwrap();
|
||||
let max_url = url["expanded_url"].as_str().unwrap();
|
||||
description = description.replace(min_url, max_url);
|
||||
}
|
||||
|
||||
PostInfo {
|
||||
id,
|
||||
description,
|
||||
images,
|
||||
videos,
|
||||
datetime,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_posts() -> Vec<PostInfo> {
|
||||
let tw_user: &str = &std::env::var("TW_USER")
|
||||
.expect("Could not load the environment variable \"TW_USER\", add it to your .env");
|
||||
let auth_token: &str = &std::env::var("AUTH_TOKEN")
|
||||
.expect("Could not load the environment variable \"AUTH_TOKEN\", add it to your .env");
|
||||
|
||||
let resp = if cfg!(debug_assertions) {
|
||||
Selector::from_html(include_str!("../test2.html"))
|
||||
} else {
|
||||
let req = Request::new(format!("{}{}", URL_BASE, tw_user)).unwrap();
|
||||
let res = req.add_cookies(vec![("auth_token", auth_token)]).launch();
|
||||
Selector::from_html(res.html())
|
||||
};
|
||||
|
||||
let raw_json = resp
|
||||
.xpath_once("//script[@id=\"__NEXT_DATA__\"]")
|
||||
.unwrap()
|
||||
.html();
|
||||
println!("{raw_json}");
|
||||
let raw_json: &[u8] = raw_json.as_ref();
|
||||
|
||||
let data: Value = serde_json::from_slice(&raw_json[51..raw_json.len() - 9])
|
||||
.expect("The JSON could'nt be deserialized");
|
||||
let tws = data["props"]["pageProps"]["timeline"]["entries"]
|
||||
.as_array()
|
||||
.unwrap()
|
||||
.iter()
|
||||
// Remove all the RT statuses
|
||||
.filter(|x| {
|
||||
!x["content"]["tweet"]["full_text"]
|
||||
.as_str()
|
||||
.unwrap()
|
||||
.starts_with("RT")
|
||||
})
|
||||
// Remove all the reply tweets
|
||||
.filter(|x| !x["content"]["tweet"]["in_reply_to_status_id_str"].is_string())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut output = vec![];
|
||||
|
||||
for tw in tws {
|
||||
let content = tw["content"]["tweet"].clone();
|
||||
let post_info = get_post_info(content);
|
||||
|
||||
output.push(post_info);
|
||||
}
|
||||
output
|
||||
}
|
||||
|
||||
fn main() {
|
||||
dotenv().ok();
|
||||
|
||||
let latest_posts = get_posts();
|
||||
println!("{latest_posts:#?}");
|
||||
|
||||
for post in latest_posts {
|
||||
posts::add_post(post.id, post.description, post.datetime);
|
||||
images::add_images(post.id, post.images);
|
||||
videos::add_videos(post.id, post.videos);
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,12 @@
|
|||
[package]
|
||||
name = "juunil-server"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
juunil = { path = "../" }
|
||||
chrono = { version = "0.4.26", features = ["serde"] }
|
||||
dotenv = "0.15.0"
|
||||
rocket = { version = "=0.5.0-rc.3", features = ["json"] }
|
||||
serde = "1.0.164"
|
||||
serde_json = "1.0.99"
|
|
@ -0,0 +1,27 @@
|
|||
#[macro_use]
|
||||
extern crate rocket;
|
||||
|
||||
use dotenv::dotenv;
|
||||
use juunil::controllers::posts;
|
||||
use juunil::models::PostWithMedia;
|
||||
use rocket::serde::json::Json;
|
||||
use serde::Serialize;
|
||||
|
||||
/// Response body of the `index` route: the list of posts wrapped under a `posts` key.
#[derive(Serialize)]
|
||||
struct Posts {
|
||||
/// Posts as returned by `posts::get_all_posts_with_media`.
posts: Vec<PostWithMedia>,
|
||||
}
|
||||
|
||||
#[get("/")]
|
||||
fn index() -> Json<Posts> {
|
||||
let posts = posts::get_all_posts_with_media();
|
||||
|
||||
Json(Posts { posts })
|
||||
}
|
||||
|
||||
#[launch]
|
||||
fn rocket() -> _ {
|
||||
dotenv().ok();
|
||||
|
||||
rocket::build().mount("/", routes![index])
|
||||
}
|
|
@ -1,2 +1,3 @@
|
|||
DROP TABLE image;
|
||||
DROP TABLE video;
|
||||
DROP TABLE post;
|
||||
|
|
|
@ -11,3 +11,10 @@ CREATE TABLE image (
|
|||
url TEXT NOT NULL,
|
||||
FOREIGN KEY (post_id) REFERENCES post(id)
|
||||
);
|
||||
|
||||
CREATE TABLE video (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
|
||||
post_id BIGINT NOT NULL,
|
||||
url TEXT NOT NULL,
|
||||
FOREIGN KEY (post_id) REFERENCES post(id)
|
||||
);
|
||||
|
|
|
@ -0,0 +1,171 @@
|
|||
use chrono::prelude::Utc;
|
||||
use chrono::NaiveDateTime;
|
||||
use diesel::sqlite::SqliteConnection;
|
||||
use diesel::Connection;
|
||||
use diesel::ExpressionMethods;
|
||||
use diesel::QueryDsl;
|
||||
use diesel::RunQueryDsl;
|
||||
use diesel::SelectableHelper;
|
||||
|
||||
use std::env;
|
||||
|
||||
pub fn establish_connection() -> SqliteConnection {
|
||||
let database_url = env::var("DATABASE_URL").expect("DATABASE_URL env var not seted");
|
||||
|
||||
SqliteConnection::establish(&database_url)
|
||||
.expect(&format!("Error connecting to {}", database_url))
|
||||
}
|
||||
|
||||
pub mod posts {
|
||||
use crate::controllers::*;
|
||||
use crate::models::{Post, PostWithMedia};
|
||||
use crate::schema::post;
|
||||
|
||||
pub fn add_post(post_id: i64, description: String, posted_date: NaiveDateTime) {
|
||||
let mut conn = establish_connection();
|
||||
|
||||
let crawled_date = Utc::now().naive_utc();
|
||||
|
||||
let new_post = Post {
|
||||
id: post_id,
|
||||
description,
|
||||
posted_date,
|
||||
crawled_date,
|
||||
};
|
||||
|
||||
diesel::insert_into(post::table)
|
||||
.values(&new_post)
|
||||
.execute(&mut conn)
|
||||
.expect("Error adding the post");
|
||||
}
|
||||
|
||||
pub fn get_post(post_id: i64) -> Post {
|
||||
use crate::schema::post::dsl::*;
|
||||
|
||||
let mut conn = establish_connection();
|
||||
|
||||
post.find(post_id)
|
||||
.first(&mut conn)
|
||||
.expect("Error getting the post")
|
||||
}
|
||||
|
||||
pub fn get_post_with_media(post_id: i64) -> PostWithMedia {
|
||||
let images = images::get_images_strings(post_id);
|
||||
let videos = videos::get_videos_strings(post_id);
|
||||
let post = get_post(post_id);
|
||||
|
||||
PostWithMedia {
|
||||
id: post.id,
|
||||
description: post.description,
|
||||
posted_date: post.posted_date,
|
||||
crawled_date: post.crawled_date,
|
||||
images,
|
||||
videos,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_all_posts_with_media() -> Vec<PostWithMedia> {
|
||||
use crate::schema::post::dsl::*;
|
||||
|
||||
let mut conn = establish_connection();
|
||||
|
||||
let posts = post
|
||||
.order(crawled_date.asc())
|
||||
.load(&mut conn)
|
||||
.expect("Error getting the posts");
|
||||
|
||||
let mut output = vec![];
|
||||
|
||||
for post_without_media in posts {
|
||||
let mut new_post = PostWithMedia::from_post(&post_without_media);
|
||||
let images = images::get_images_strings(new_post.id);
|
||||
let videos = videos::get_videos_strings(new_post.id);
|
||||
new_post.images = images;
|
||||
new_post.videos = videos;
|
||||
output.push(new_post);
|
||||
}
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
pub mod images {
|
||||
use crate::controllers::*;
|
||||
use crate::models::{Image, NewImage};
|
||||
use crate::schema::image;
|
||||
|
||||
pub fn add_image(post_id: i64, url: String) {
|
||||
let mut conn = establish_connection();
|
||||
|
||||
let new_image = NewImage { post_id, url };
|
||||
|
||||
diesel::insert_into(image::table)
|
||||
.values(&new_image)
|
||||
.execute(&mut conn)
|
||||
.expect("Error adding the image");
|
||||
}
|
||||
|
||||
pub fn add_images(post_id: i64, urls: Vec<String>) {
|
||||
for url in urls {
|
||||
add_image(post_id, url);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_images(post: i64) -> Vec<Image> {
|
||||
use crate::schema::image::dsl::*;
|
||||
let mut conn = establish_connection();
|
||||
|
||||
image
|
||||
.filter(post_id.eq(post))
|
||||
.select(Image::as_select())
|
||||
.load(&mut conn)
|
||||
.expect("Error getting the images")
|
||||
}
|
||||
|
||||
pub fn get_images_strings(post: i64) -> Vec<String> {
|
||||
get_images(post)
|
||||
.iter()
|
||||
.map(|image| image.url.clone())
|
||||
.collect::<Vec<String>>()
|
||||
}
|
||||
}
|
||||
|
||||
pub mod videos {
|
||||
use crate::controllers::*;
|
||||
use crate::models::{NewVideo, Video};
|
||||
use crate::schema::video;
|
||||
|
||||
pub fn add_video(post_id: i64, url: String) {
|
||||
let mut conn = establish_connection();
|
||||
|
||||
let new_video = NewVideo { post_id, url };
|
||||
|
||||
diesel::insert_into(video::table)
|
||||
.values(&new_video)
|
||||
.execute(&mut conn)
|
||||
.expect("Error adding the video");
|
||||
}
|
||||
|
||||
pub fn add_videos(post_id: i64, urls: Vec<String>) {
|
||||
for url in urls {
|
||||
add_video(post_id, url);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_videos(post: i64) -> Vec<Video> {
|
||||
use crate::schema::video::dsl::*;
|
||||
let mut conn = establish_connection();
|
||||
|
||||
video
|
||||
.filter(post_id.eq(post))
|
||||
.select(Video::as_select())
|
||||
.load(&mut conn)
|
||||
.expect("Error getting the videos")
|
||||
}
|
||||
|
||||
pub fn get_videos_strings(post: i64) -> Vec<String> {
|
||||
get_videos(post)
|
||||
.iter()
|
||||
.map(|video| video.url.clone())
|
||||
.collect::<Vec<String>>()
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
use chrono::NaiveDateTime;
|
||||
use diesel::prelude::*;
|
||||
use serde_derive::Serialize;
|
||||
|
||||
/// A row of the `post` table; used both for reading and for inserting.
#[derive(Identifiable, Queryable, Selectable, Insertable, Debug)]
|
||||
#[diesel(table_name = crate::schema::post)]
|
||||
#[diesel(check_for_backend(diesel::sqlite::Sqlite))]
|
||||
pub struct Post {
|
||||
/// Primary key, supplied by the caller on insert (not auto-generated here).
pub id: i64,
|
||||
/// Text of the post.
pub description: String,
|
||||
/// Publication date supplied by the caller of `posts::add_post`.
pub posted_date: NaiveDateTime,
|
||||
/// Set to the current UTC time by `posts::add_post`.
pub crawled_date: NaiveDateTime,
|
||||
}
|
||||
|
||||
/// A row of the `image` table as read from the database; belongs to a [`Post`].
#[derive(Identifiable, Queryable, Selectable, Associations, Debug)]
|
||||
#[diesel(belongs_to(Post))]
|
||||
#[diesel(table_name = crate::schema::image)]
|
||||
#[diesel(check_for_backend(diesel::sqlite::Sqlite))]
|
||||
pub struct Image {
|
||||
/// Primary key of the image row.
pub id: i32,
|
||||
/// Id of the post this image belongs to.
pub post_id: i64,
|
||||
/// URL of the image.
pub url: String,
|
||||
}
|
||||
|
||||
/// Insert payload for the `image` table; it carries no `id` field.
#[derive(Insertable, Associations, Debug)]
|
||||
#[diesel(belongs_to(Post))]
|
||||
#[diesel(table_name = crate::schema::image)]
|
||||
pub struct NewImage {
|
||||
/// Id of the post this image belongs to.
pub post_id: i64,
|
||||
/// URL of the image.
pub url: String,
|
||||
}
|
||||
|
||||
/// A row of the `video` table as read from the database; belongs to a [`Post`].
#[derive(Identifiable, Queryable, Selectable, Associations, Debug)]
|
||||
#[diesel(belongs_to(Post))]
|
||||
#[diesel(table_name = crate::schema::video)]
|
||||
#[diesel(check_for_backend(diesel::sqlite::Sqlite))]
|
||||
pub struct Video {
|
||||
/// Primary key of the video row (`AUTOINCREMENT` in the migration).
pub id: i32,
|
||||
/// Id of the post this video belongs to.
pub post_id: i64,
|
||||
/// URL of the video.
pub url: String,
|
||||
}
|
||||
|
||||
/// Insert payload for the `video` table; `id` is omitted because the table auto-increments it.
#[derive(Insertable, Associations, Debug)]
|
||||
#[diesel(belongs_to(Post))]
|
||||
#[diesel(table_name = crate::schema::video)]
|
||||
pub struct NewVideo {
|
||||
/// Id of the post this video belongs to.
pub post_id: i64,
|
||||
/// URL of the video.
pub url: String,
|
||||
}
|
||||
|
||||
/// Serializable view of a [`Post`] combined with the URLs of its images and videos.
#[derive(Serialize)]
|
||||
pub struct PostWithMedia {
|
||||
/// Copied from [`Post::id`].
pub id: i64,
|
||||
/// Copied from [`Post::description`].
pub description: String,
|
||||
/// Copied from [`Post::posted_date`].
pub posted_date: NaiveDateTime,
|
||||
/// Copied from [`Post::crawled_date`].
pub crawled_date: NaiveDateTime,
|
||||
/// URLs of the images attached to the post.
pub images: Vec<String>,
|
||||
/// URLs of the videos attached to the post.
pub videos: Vec<String>,
|
||||
}
|
||||
|
||||
impl PostWithMedia {
|
||||
pub fn from_post(post: &Post) -> Self {
|
||||
PostWithMedia {
|
||||
id: post.id,
|
||||
description: post.description.clone(),
|
||||
posted_date: post.posted_date,
|
||||
crawled_date: post.crawled_date,
|
||||
images: vec![],
|
||||
videos: vec![],
|
||||
}
|
||||
}
|
||||
}
|
|
@ -17,9 +17,19 @@ diesel::table! {
|
|||
}
|
||||
}
|
||||
|
||||
// Diesel schema for the `video` table created by the migration: primary key `id`,
// owning `post_id`, and the video `url`.
diesel::table! {
|
||||
video (id) {
|
||||
id -> Integer,
|
||||
post_id -> BigInt,
|
||||
url -> Text,
|
||||
}
|
||||
}
|
||||
|
||||
// Declare that `image` and `video` join to `post` through their `post_id` column.
diesel::joinable!(image -> post (post_id));
|
||||
diesel::joinable!(video -> post (post_id));
|
||||
|
||||
// Allow the three tables to be referenced together in a single query.
diesel::allow_tables_to_appear_in_same_query!(
|
||||
image,
|
||||
post,
|
||||
video,
|
||||
);
|
||||
|
|
Loading…
Reference in New Issue