From 583690a4456c1a2f995acf5c6db9d9c721bbd8bb Mon Sep 17 00:00:00 2001 From: kirbylife Date: Tue, 8 Aug 2023 00:07:41 -0600 Subject: [PATCH] Add support to videos --- Cargo.toml | 3 +- juunil-crawler/Cargo.toml | 12 + juunil-crawler/src/main.rs | 154 + juunil-crawler/test.html | 4237 +++++++++++++++++ juunil-crawler/test2.html | 2622 ++++++++++ juunil-server/Cargo.toml | 12 + juunil-server/src/main.rs | 27 + .../2023-08-07-063249_create_posts/down.sql | 1 + .../2023-08-07-063249_create_posts/up.sql | 7 + src/controllers.rs | 171 + src/models.rs | 72 + src/schema.rs | 10 + 12 files changed, 7327 insertions(+), 1 deletion(-) create mode 100644 juunil-crawler/Cargo.toml create mode 100644 juunil-crawler/src/main.rs create mode 100644 juunil-crawler/test.html create mode 100644 juunil-crawler/test2.html create mode 100644 juunil-server/Cargo.toml create mode 100644 juunil-server/src/main.rs create mode 100644 src/controllers.rs create mode 100644 src/models.rs diff --git a/Cargo.toml b/Cargo.toml index 137bd75..167f53d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,4 +10,5 @@ chrono = { version = "0.4.26", features = ["serde"] } diesel = { version = "2.1.0", features = ["sqlite", "chrono", "serde_json"] } dotenv = "0.15.0" serde = "1.0.164" -serde_json = "1.0.99" \ No newline at end of file +serde_derive = "1.0.183" +serde_json = "1.0.99" diff --git a/juunil-crawler/Cargo.toml b/juunil-crawler/Cargo.toml new file mode 100644 index 0000000..ee0dd17 --- /dev/null +++ b/juunil-crawler/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "juunil-crawler" +version = "0.1.0" +edition = "2021" + +[dependencies] +raspa = { git = "https://git.kirbylife.dev/kirbylife/raspa" } +juunil = { path = "../" } +chrono = { version = "0.4.26", features = ["serde"] } +dotenv = "0.15.0" +serde = "1.0.183" +serde_json = "1.0.104" diff --git a/juunil-crawler/src/main.rs b/juunil-crawler/src/main.rs new file mode 100644 index 0000000..d3c1642 --- /dev/null +++ b/juunil-crawler/src/main.rs @@ -0,0 
+1,205 @@
+use chrono::{NaiveDate, NaiveDateTime};
+use dotenv::dotenv;
+use juunil::controllers::{images, posts, videos};
+use raspa::request::{Request, RequestBase};
+use raspa::selector::{Selector, SelectorBase};
+use serde_json::Value;
+
+/// Syndication endpoint; the profile's screen name is appended to it.
+const URL_BASE: &str = "https://syndication.twitter.com/srv/timeline-profile/screen-name/";
+/// Layout of the `created_at` field, e.g. `Tue Aug 08 06:07:41 +0000 2023`.
+const DATE_FMT: &str = "%a %b %d %H:%M:%S %z %Y";
+
+/// Data extracted from one timeline entry, ready to be stored.
+#[derive(Debug)]
+struct PostInfo {
+    id: i64,
+    description: String,
+    images: Vec<String>,
+    videos: Vec<String>,
+    datetime: NaiveDateTime,
+}
+
+/// Parses a `created_at` value into a UTC timestamp.
+///
+/// The value is parsed as an offset-aware date and then normalised to UTC,
+/// so the `%z` part of the input is honoured rather than discarded.
+///
+/// # Panics
+/// Panics if `raw_date` does not match [`DATE_FMT`].
+fn build_timestamp_from_str(raw_date: String) -> NaiveDateTime {
+    chrono::DateTime::parse_from_str(&raw_date, DATE_FMT)
+        .expect("unexpected `created_at` format")
+        .naive_utc()
+}
+
+/// Returns the URL of the highest-bitrate variant of a video media item.
+///
+/// When no variant carries a numeric `bitrate`, the first listed variant is
+/// used; `None` is returned if there is no usable variant at all.
+fn best_video_url(media: &Value) -> Option<&str> {
+    let variants = media["video_info"]["variants"].as_array()?;
+    variants
+        .iter()
+        .filter_map(|variant| Some((variant["bitrate"].as_u64()?, variant)))
+        .max_by_key(|(bitrate, _)| *bitrate)
+        .map(|(_, variant)| variant)
+        .or_else(|| variants.first())
+        .and_then(|variant| variant["url"].as_str())
+}
+
+/// Builds a [`PostInfo`] from the `tweet` object of a timeline entry.
+///
+/// Media short links are removed from the text and the remaining short links
+/// are replaced by their expanded form. A tweet without media or without
+/// links yields empty media lists and an otherwise untouched text.
+///
+/// # Panics
+/// Panics if `conversation_id_str`, `full_text` or `created_at` is missing
+/// or malformed.
+fn get_post_info(content: Value) -> PostInfo {
+    let id = content["conversation_id_str"]
+        .as_str()
+        .expect("tweet without `conversation_id_str`")
+        .parse::<i64>()
+        .expect("`conversation_id_str` is not an integer");
+
+    let mut description = content["full_text"]
+        .as_str()
+        .expect("tweet without `full_text`")
+        .to_string();
+    let datetime = build_timestamp_from_str(
+        content["created_at"]
+            .as_str()
+            .expect("tweet without `created_at`")
+            .to_string(),
+    );
+    let mut images: Vec<String> = vec![];
+    let mut videos: Vec<String> = vec![];
+
+    // Prefer `extended_entities` and fall back to `entities`. A missing
+    // media array is treated as an empty list instead of panicking.
+    let media_items: &[Value] = content["extended_entities"]["media"]
+        .as_array()
+        .or_else(|| content["entities"]["media"].as_array())
+        .map(Vec::as_slice)
+        .unwrap_or_default();
+
+    for media in media_items {
+        // The short link of the media is noise in the stored text.
+        if let Some(min_url) = media["url"].as_str() {
+            description = description.replace(min_url, "");
+        }
+
+        match media["type"].as_str() {
+            Some("photo") => {
+                if let Some(url) = media["media_url_https"].as_str() {
+                    images.push(url.to_string());
+                }
+            }
+            Some("video") => {
+                if let Some(url) = best_video_url(media) {
+                    videos.push(url.to_string());
+                }
+            }
+            _ => {}
+        }
+    }
+    description = description.trim().to_string();
+
+    // Expand the remaining short links; `urls` may be absent.
+    for url in content["entities"]["urls"].as_array().into_iter().flatten() {
+        if let (Some(min_url), Some(max_url)) =
+            (url["url"].as_str(), url["expanded_url"].as_str())
+        {
+            description = description.replace(min_url, max_url);
+        }
+    }
+
+    PostInfo {
+        id,
+        description,
+        images,
+        videos,
+        datetime,
+    }
+}
+
+/// Loads the profile timeline and returns every original post found in it.
+///
+/// Debug builds read the bundled `test2.html` fixture; release builds
+/// download the page for `TW_USER`, sending `AUTH_TOKEN` as a cookie.
+/// Retweets and replies are skipped.
+///
+/// # Panics
+/// Panics if `TW_USER` or `AUTH_TOKEN` is not set, or if the page does not
+/// contain a parsable `__NEXT_DATA__` JSON payload.
+fn get_posts() -> Vec<PostInfo> {
+    let tw_user: &str = &std::env::var("TW_USER")
+        .expect("Could not load the environment variable \"TW_USER\", add it to your .env");
+    let auth_token: &str = &std::env::var("AUTH_TOKEN")
+        .expect("Could not load the environment variable \"AUTH_TOKEN\", add it to your .env");
+
+    let resp = if cfg!(debug_assertions) {
+        Selector::from_html(include_str!("../test2.html"))
+    } else {
+        let req = Request::new(format!("{}{}", URL_BASE, tw_user)).unwrap();
+        let res = req.add_cookies(vec![("auth_token", auth_token)]).launch();
+        Selector::from_html(res.html())
+    };
+
+    let raw_json = resp
+        .xpath_once("//script[@id=\"__NEXT_DATA__\"]")
+        .unwrap()
+        .html();
+    println!("{raw_json}");
+    let raw_json: &[u8] = raw_json.as_ref();
+
+    // Cut the JSON document out of the serialized `<script>` element by its
+    // outermost braces instead of fixed byte offsets, which break as soon
+    // as the tag's attributes change.
+    let start = raw_json
+        .iter()
+        .position(|&byte| byte == b'{')
+        .expect("no JSON object in the `__NEXT_DATA__` script");
+    let end = raw_json
+        .iter()
+        .rposition(|&byte| byte == b'}')
+        .expect("no JSON object in the `__NEXT_DATA__` script");
+    let data: Value = serde_json::from_slice(&raw_json[start..=end])
+        .expect("The JSON could'nt be deserialized");
+
+    data["props"]["pageProps"]["timeline"]["entries"]
+        .as_array()
+        .expect("timeline without `entries`")
+        .iter()
+        .map(|entry| &entry["content"]["tweet"])
+        // Remove all the RT statuses ("RT @user: ..."); entries without any
+        // text are dropped too instead of panicking on them.
+        .filter(|tweet| {
+            tweet["full_text"]
+                .as_str()
+                .map_or(false, |text| !text.starts_with("RT @"))
+        })
+        // Remove all the reply tweets
+        .filter(|tweet| !tweet["in_reply_to_status_id_str"].is_string())
+        .map(|tweet| get_post_info(tweet.clone()))
+        .collect()
+}
+
+/// Crawls the timeline and stores every post together with its media.
+fn main() {
+    dotenv().ok();
+
+    let latest_posts = get_posts();
+    println!("{latest_posts:#?}");
+
+    for post in latest_posts {
+        // NOTE(review): `add_post` panics when the insert fails - presumably
+        // also for a post stored by a previous run; confirm before running
+        // the crawler repeatedly against the same database.
+        posts::add_post(post.id, post.description, post.datetime);
+        images::add_images(post.id, post.images);
+        videos::add_videos(post.id, post.videos);
+    }
+}
diff --git a/juunil-crawler/test.html b/juunil-crawler/test.html
new file mode 100644
index 0000000..748d62b
--- /dev/null
+++ b/juunil-crawler/test.html
@@ -0,0 +1,4237 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + \ No newline at end of file diff --git a/juunil-crawler/test2.html b/juunil-crawler/test2.html new file mode 100644 index 0000000..13993a0 --- /dev/null +++ b/juunil-crawler/test2.html @@ -0,0 +1,2622 @@ + + + + + + + + + + + + + + +
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/juunil-server/Cargo.toml b/juunil-server/Cargo.toml
new file mode 100644
index 0000000..af0b6b9
--- /dev/null
+++ b/juunil-server/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "juunil-server"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+juunil = { path = "../" }
+chrono = { version = "0.4.26", features = ["serde"] }
+dotenv = "0.15.0"
+rocket = { version = "=0.5.0-rc.3", features = ["json"] }
+serde = { version = "1.0.164", features = ["derive"] }
+serde_json = "1.0.99"
diff --git a/juunil-server/src/main.rs b/juunil-server/src/main.rs
new file mode 100644
index 0000000..8b1e45e
--- /dev/null
+++ b/juunil-server/src/main.rs
@@ -0,0 +1,30 @@
+#[macro_use]
+extern crate rocket;
+
+use dotenv::dotenv;
+use juunil::controllers::posts;
+use juunil::models::PostWithMedia;
+use rocket::serde::json::Json;
+use serde::Serialize;
+
+/// JSON envelope returned by the index route.
+#[derive(Serialize)]
+struct Posts {
+    posts: Vec<PostWithMedia>,
+}
+
+/// Returns every stored post, with its media, as JSON.
+#[get("/")]
+fn index() -> Json<Posts> {
+    let posts = posts::get_all_posts_with_media();
+
+    Json(Posts { posts })
+}
+
+/// Loads `.env` and builds the Rocket instance serving [`index`] at `/`.
+#[launch]
+fn rocket() -> _ {
+    dotenv().ok();
+
+    rocket::build().mount("/", routes![index])
+}
diff --git a/migrations/2023-08-07-063249_create_posts/down.sql b/migrations/2023-08-07-063249_create_posts/down.sql
index 90de6dd..e682395 100644
--- a/migrations/2023-08-07-063249_create_posts/down.sql
+++ b/migrations/2023-08-07-063249_create_posts/down.sql
@@ -1,2 +1,3 @@
 DROP TABLE image;
+DROP TABLE video;
 DROP TABLE post;
diff --git a/migrations/2023-08-07-063249_create_posts/up.sql b/migrations/2023-08-07-063249_create_posts/up.sql
index cb22c6e..cc0187b 100644
--- a/migrations/2023-08-07-063249_create_posts/up.sql
+++ b/migrations/2023-08-07-063249_create_posts/up.sql
@@ -11,3 +11,10 @@ CREATE TABLE image (
     url TEXT NOT NULL,
     FOREIGN KEY (post_id) REFERENCES post(id)
 );
+
+CREATE TABLE video (
+    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+    post_id BIGINT NOT NULL,
+    url TEXT NOT NULL,
+    FOREIGN KEY (post_id) REFERENCES post(id)
+);
diff --git a/src/controllers.rs b/src/controllers.rs
new file mode 100644
index 0000000..629241c
--- /dev/null
+++ b/src/controllers.rs
@@ -0,0 +1,206 @@
+use chrono::prelude::Utc;
+use chrono::NaiveDateTime;
+use diesel::sqlite::SqliteConnection;
+use diesel::Connection;
+use diesel::ExpressionMethods;
+use diesel::QueryDsl;
+use diesel::RunQueryDsl;
+use diesel::SelectableHelper;
+
+use std::env;
+
+/// Opens a new SQLite connection to the database named by `DATABASE_URL`.
+///
+/// # Panics
+/// Panics if `DATABASE_URL` is not set or the connection cannot be opened.
+pub fn establish_connection() -> SqliteConnection {
+    let database_url = env::var("DATABASE_URL").expect("DATABASE_URL env var not seted");
+
+    // `unwrap_or_else` builds the message only on failure, unlike
+    // `expect(&format!(..))`, which formats it on every call.
+    SqliteConnection::establish(&database_url)
+        .unwrap_or_else(|err| panic!("Error connecting to {}: {:?}", database_url, err))
+}
+
+/// Queries and inserts for the `post` table.
+pub mod posts {
+    use crate::controllers::*;
+    use crate::models::{Post, PostWithMedia};
+    use crate::schema::post;
+
+    /// Stores a post, stamping it with the current UTC time as crawl date.
+    ///
+    /// # Panics
+    /// Panics if the insert fails.
+    pub fn add_post(post_id: i64, description: String, posted_date: NaiveDateTime) {
+        let mut conn = establish_connection();
+
+        let crawled_date = Utc::now().naive_utc();
+
+        let new_post = Post {
+            id: post_id,
+            description,
+            posted_date,
+            crawled_date,
+        };
+
+        diesel::insert_into(post::table)
+            .values(&new_post)
+            .execute(&mut conn)
+            .expect("Error adding the post");
+    }
+
+    /// Loads the post with the given id.
+    ///
+    /// # Panics
+    /// Panics if the post does not exist or the query fails.
+    pub fn get_post(post_id: i64) -> Post {
+        use crate::schema::post::dsl::*;
+
+        let mut conn = establish_connection();
+
+        post.find(post_id)
+            .first(&mut conn)
+            .expect("Error getting the post")
+    }
+
+    /// Loads a post together with the URLs of its images and videos.
+    ///
+    /// # Panics
+    /// Panics if the post does not exist or any query fails.
+    pub fn get_post_with_media(post_id: i64) -> PostWithMedia {
+        // Look the post up first so an unknown id fails before the media
+        // tables are queried for nothing.
+        let post = get_post(post_id);
+        let images = images::get_images_strings(post_id);
+        let videos = videos::get_videos_strings(post_id);
+
+        PostWithMedia {
+            id: post.id,
+            description: post.description,
+            posted_date: post.posted_date,
+            crawled_date: post.crawled_date,
+            images,
+            videos,
+        }
+    }
+
+    /// Loads every post, oldest crawl first, with its media URLs attached.
+    ///
+    /// # Panics
+    /// Panics if any query fails.
+    pub fn get_all_posts_with_media() -> Vec<PostWithMedia> {
+        use crate::schema::post::dsl::*;
+
+        let mut conn = establish_connection();
+
+        let posts: Vec<Post> = post
+            .order(crawled_date.asc())
+            .load(&mut conn)
+            .expect("Error getting the posts");
+
+        posts
+            .iter()
+            .map(|post_without_media| {
+                let mut new_post = PostWithMedia::from_post(post_without_media);
+                new_post.images = images::get_images_strings(new_post.id);
+                new_post.videos = videos::get_videos_strings(new_post.id);
+                new_post
+            })
+            .collect()
+    }
+}
+
+/// Queries and inserts for the `image` table.
+pub mod images {
+    use crate::controllers::*;
+    use crate::models::{Image, NewImage};
+    use crate::schema::image;
+
+    /// Stores one image URL for the given post.
+    ///
+    /// # Panics
+    /// Panics if the insert fails.
+    pub fn add_image(post_id: i64, url: String) {
+        let mut conn = establish_connection();
+
+        let new_image = NewImage { post_id, url };
+
+        diesel::insert_into(image::table)
+            .values(&new_image)
+            .execute(&mut conn)
+            .expect("Error adding the image");
+    }
+
+    /// Stores every URL in `urls` as an image of the given post.
+    pub fn add_images(post_id: i64, urls: Vec<String>) {
+        for url in urls {
+            add_image(post_id, url);
+        }
+    }
+
+    /// Loads the image rows that belong to the given post.
+    ///
+    /// # Panics
+    /// Panics if the query fails.
+    pub fn get_images(post: i64) -> Vec<Image> {
+        use crate::schema::image::dsl::*;
+        let mut conn = establish_connection();
+
+        image
+            .filter(post_id.eq(post))
+            .select(Image::as_select())
+            .load(&mut conn)
+            .expect("Error getting the images")
+    }
+
+    /// Returns only the URLs of the images that belong to the given post.
+    pub fn get_images_strings(post: i64) -> Vec<String> {
+        // Consume the rows so each URL is moved out rather than cloned.
+        get_images(post)
+            .into_iter()
+            .map(|image| image.url)
+            .collect()
+    }
+}
+
+pub mod videos {
+    use crate::controllers::*;
+    use crate::models::{NewVideo, Video};
+    use crate::schema::video;
+
+    pub fn add_video(post_id: i64, url: String) {
+        let mut conn = establish_connection();
+
+        let new_video = NewVideo { post_id, url };
+
+        diesel::insert_into(video::table)
+            .values(&new_video)
+            .execute(&mut conn)
+            .expect("Error adding the video");
+    }
+
+    pub fn add_videos(post_id: i64, urls: Vec<String>) {
+        for url in urls {
+            add_video(post_id, url);
+        }
+    }
+
+    pub fn get_videos(post: i64) -> Vec