Add support for videos

main
kirbylife 2023-08-08 00:07:41 -06:00
parent db797e2c34
commit 583690a445
12 changed files with 7327 additions and 1 deletions

View File

@ -10,4 +10,5 @@ chrono = { version = "0.4.26", features = ["serde"] }
diesel = { version = "2.1.0", features = ["sqlite", "chrono", "serde_json"] }
dotenv = "0.15.0"
serde = "1.0.164"
serde_json = "1.0.99"
serde_derive = "1.0.183"
serde_json = "1.0.99"

View File

@ -0,0 +1,12 @@
[package]
name = "juunil-crawler"
version = "0.1.0"
edition = "2021"
[dependencies]
raspa = { git = "https://git.kirbylife.dev/kirbylife/raspa" }
juunil = { path = "../" }
chrono = { version = "0.4.26", features = ["serde"] }
dotenv = "0.15.0"
serde = "1.0.183"
serde_json = "1.0.104"

View File

@ -0,0 +1,154 @@
use chrono::{NaiveDate, NaiveDateTime};
use dotenv::dotenv;
use juunil::controllers::{images, posts, videos};
use raspa::request::{Request, RequestBase};
use raspa::selector::{Selector, SelectorBase};
use serde_json::Value;
const URL_BASE: &str = "https://syndication.twitter.com/srv/timeline-profile/screen-name/";
const DATE_FMT: &str = "%a %b %d %H:%M:%S %z %Y";
/// One crawled post, flattened from a tweet JSON object by `get_post_info`.
#[derive(Debug)]
struct PostInfo {
/// Numeric id, parsed from the tweet's `conversation_id_str` field.
id: i64,
/// Tweet text with media short links removed and other short links expanded.
description: String,
/// `media_url_https` of every attached photo.
images: Vec<String>,
/// One variant URL per attached video (the variant with the highest reported bitrate).
videos: Vec<String>,
/// Creation time parsed from the tweet's `created_at` field using `DATE_FMT`.
datetime: NaiveDateTime,
}
/// Parses a `created_at` string laid out as `DATE_FMT` into a `NaiveDateTime`.
///
/// The returned value is naive, i.e. it carries no timezone information.
///
/// # Panics
///
/// Panics if `raw_date` does not match `DATE_FMT`.
fn build_timestamp_from_str(raw_date: String) -> NaiveDateTime {
    let parsed = NaiveDateTime::parse_from_str(raw_date.as_str(), DATE_FMT);
    parsed.unwrap()
}
/// Picks the URL of the best variant from a `video_info.variants` JSON value.
///
/// The variant with the highest `bitrate` wins (the first one on ties). When no
/// variant reports a positive bitrate the first variant is used. Returns `None`
/// when there are no variants or the chosen one has no string `url`.
fn best_video_variant_url(variants: &Value) -> Option<&str> {
    let variants = variants.as_array()?;
    let mut best = variants.first()?;
    let mut best_bitrate = 0;
    for variant in variants {
        // Variants without an unsigned integer bitrate (e.g. playlists) are skipped.
        if let Some(bitrate) = variant["bitrate"].as_u64() {
            if bitrate > best_bitrate {
                best_bitrate = bitrate;
                best = variant;
            }
        }
    }
    best["url"].as_str()
}

/// Flattens one tweet JSON object into a [`PostInfo`].
///
/// Reads `conversation_id_str`, `full_text`, `created_at`, the media list
/// (`extended_entities.media`, falling back to `entities.media`) and
/// `entities.urls`. Media short links are stripped from the text and the
/// remaining short links are replaced by their expanded form.
///
/// A tweet without any media list, without `entities.urls`, or with a video
/// that has no usable variant is accepted; the corresponding output is simply
/// left empty.
///
/// # Panics
///
/// Panics if `conversation_id_str`, `full_text` or `created_at` is missing or
/// malformed, if a media item lacks a string `url` or `type`, if a photo lacks
/// `media_url_https`, or if a URL entity lacks `url` or `expanded_url`.
fn get_post_info(content: Value) -> PostInfo {
    let id = content["conversation_id_str"]
        .as_str()
        .expect("tweet has no `conversation_id_str`")
        .parse::<i64>()
        .expect("`conversation_id_str` is not a valid i64");
    let mut description = content["full_text"]
        .as_str()
        .expect("tweet has no `full_text`")
        .to_string();
    let datetime = build_timestamp_from_str(
        content["created_at"]
            .as_str()
            .expect("tweet has no `created_at`")
            .to_string(),
    );

    let mut images: Vec<String> = vec![];
    let mut videos: Vec<String> = vec![];

    // Text-only tweets carry no media array at all, so fall back to an empty list
    // instead of panicking.
    let media_items: &[Value] = content["extended_entities"]["media"]
        .as_array()
        .or_else(|| content["entities"]["media"].as_array())
        .map(Vec::as_slice)
        .unwrap_or(&[]);

    for media in media_items {
        // The short link of the media is part of the text; drop it.
        let min_url = media["url"].as_str().expect("media item has no `url`");
        description = description.replace(min_url, "");

        match media["type"].as_str().expect("media item has no `type`") {
            "photo" => images.push(
                media["media_url_https"]
                    .as_str()
                    .expect("photo has no `media_url_https`")
                    .to_string(),
            ),
            "video" => {
                if let Some(url) = best_video_variant_url(&media["video_info"]["variants"]) {
                    videos.push(url.to_string());
                }
            }
            _ => {}
        }
    }

    description = description.trim().to_string();

    // Expand the remaining short links; a tweet without URL entities is fine.
    if let Some(urls) = content["entities"]["urls"].as_array() {
        for url in urls {
            let min_url = url["url"].as_str().expect("url entity has no `url`");
            let max_url = url["expanded_url"]
                .as_str()
                .expect("url entity has no `expanded_url`");
            description = description.replace(min_url, max_url);
        }
    }

    PostInfo {
        id,
        description,
        images,
        videos,
        datetime,
    }
}
/// Downloads the profile timeline of `TW_USER` and returns its original posts.
///
/// Debug builds parse the bundled `../test2.html` instead of doing a request;
/// the `TW_USER` and `AUTH_TOKEN` environment variables are read in both cases.
/// The JSON payload is taken from the `__NEXT_DATA__` script element. Statuses
/// whose text starts with `RT`, replies, and entries without a `full_text` are
/// left out.
///
/// # Panics
///
/// Panics if an environment variable is missing, the request cannot be built,
/// the script element is absent or malformed, the JSON cannot be parsed, the
/// timeline has no `entries` array, or a kept tweet is rejected by
/// `get_post_info`.
fn get_posts() -> Vec<PostInfo> {
    let tw_user: &str = &std::env::var("TW_USER")
        .expect("Could not load the environment variable \"TW_USER\", add it to your .env");
    let auth_token: &str = &std::env::var("AUTH_TOKEN")
        .expect("Could not load the environment variable \"AUTH_TOKEN\", add it to your .env");
    let resp = if cfg!(debug_assertions) {
        Selector::from_html(include_str!("../test2.html"))
    } else {
        let req = Request::new(format!("{}{}", URL_BASE, tw_user)).unwrap();
        let res = req.add_cookies(vec![("auth_token", auth_token)]).launch();
        Selector::from_html(res.html())
    };
    let raw_json = resp
        .xpath_once("//script[@id=\"__NEXT_DATA__\"]")
        .unwrap()
        .html();
    println!("{raw_json}");
    let raw_json: &[u8] = raw_json.as_ref();

    // The payload sits between the end of the opening `<script ...>` tag and the
    // last `</script>`. Locating both avoids depending on the exact length of the
    // serialized opening tag.
    const CLOSING_TAG: &[u8] = b"</script>";
    let json_start = raw_json
        .iter()
        .position(|&byte| byte == b'>')
        .map(|idx| idx + 1)
        .expect("The __NEXT_DATA__ element has no opening tag");
    let json_end = raw_json
        .windows(CLOSING_TAG.len())
        .rposition(|window| window == CLOSING_TAG)
        .expect("The __NEXT_DATA__ element has no closing tag");
    let payload = raw_json
        .get(json_start..json_end)
        .expect("The __NEXT_DATA__ element is malformed");
    let data: Value =
        serde_json::from_slice(payload).expect("The JSON could'nt be deserialized");

    data["props"]["pageProps"]["timeline"]["entries"]
        .as_array()
        .unwrap()
        .iter()
        .map(|entry| &entry["content"]["tweet"])
        // Remove all the RT statuses (and entries that carry no tweet text at all)
        .filter(|tweet| {
            tweet["full_text"]
                .as_str()
                .map_or(false, |text| !text.starts_with("RT"))
        })
        // Remove all the reply tweets
        .filter(|tweet| !tweet["in_reply_to_status_id_str"].is_string())
        .map(|tweet| get_post_info(tweet.clone()))
        .collect()
}
fn main() {
dotenv().ok();
let latest_posts = get_posts();
println!("{latest_posts:#?}");
for post in latest_posts {
posts::add_post(post.id, post.description, post.datetime);
images::add_images(post.id, post.images);
videos::add_videos(post.id, post.videos);
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
[package]
name = "juunil-server"
version = "0.1.0"
edition = "2021"
[dependencies]
juunil = { path = "../" }
chrono = { version = "0.4.26", features = ["serde"] }
dotenv = "0.15.0"
rocket = { version = "=0.5.0-rc.3", features = ["json"] }
serde = "1.0.164"
serde_json = "1.0.99"

View File

@ -0,0 +1,27 @@
#[macro_use]
extern crate rocket;
use dotenv::dotenv;
use juunil::controllers::posts;
use juunil::models::PostWithMedia;
use rocket::serde::json::Json;
use serde::Serialize;
/// Response body of the `index` route: all posts under a single `posts` key.
#[derive(Serialize)]
struct Posts {
/// Every stored post together with its media URLs.
posts: Vec<PostWithMedia>,
}
/// `GET /`: returns every stored post, with its media URLs, as JSON.
#[get("/")]
fn index() -> Json<Posts> {
    let payload = Posts {
        posts: posts::get_all_posts_with_media(),
    };
    Json(payload)
}
/// Builds the Rocket instance for the server, with the `index` route mounted at `/`.
#[launch]
fn rocket() -> _ {
// Load variables from a `.env` file; the result is discarded, so a failure is ignored.
dotenv().ok();
rocket::build().mount("/", routes![index])
}

View File

@ -1,2 +1,3 @@
-- Revert the schema. The tables that reference post through post_id
-- (image, video) are dropped before post itself.
DROP TABLE image;
DROP TABLE video;
DROP TABLE post;

View File

@ -11,3 +11,10 @@ CREATE TABLE image (
url TEXT NOT NULL,
FOREIGN KEY (post_id) REFERENCES post(id)
);
-- One row per video URL attached to a post.
CREATE TABLE video (
-- Surrogate key generated by SQLite.
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
-- Id of the owning row in post.
post_id BIGINT NOT NULL,
-- Location of the video file.
url TEXT NOT NULL,
FOREIGN KEY (post_id) REFERENCES post(id)
);

171
src/controllers.rs 100644
View File

@ -0,0 +1,171 @@
use chrono::prelude::Utc;
use chrono::NaiveDateTime;
use diesel::sqlite::SqliteConnection;
use diesel::Connection;
use diesel::ExpressionMethods;
use diesel::QueryDsl;
use diesel::RunQueryDsl;
use diesel::SelectableHelper;
use std::env;
/// Opens a new SQLite connection to the database named by `DATABASE_URL`.
///
/// # Panics
///
/// Panics if the `DATABASE_URL` environment variable is not set or if the
/// connection cannot be established.
pub fn establish_connection() -> SqliteConnection {
    let database_url = env::var("DATABASE_URL").expect("DATABASE_URL env var not seted");
    // Build the panic message lazily: `expect(&format!(..))` would allocate it on
    // every call, including the successful ones.
    SqliteConnection::establish(&database_url)
        .unwrap_or_else(|err| panic!("Error connecting to {}: {:?}", database_url, err))
}
/// Queries and inserts for the `post` table.
pub mod posts {
    use crate::controllers::*;
    use crate::models::{Post, PostWithMedia};
    use crate::schema::post;

    /// Inserts a post, stamping it with the current UTC time as `crawled_date`.
    ///
    /// # Panics
    ///
    /// Panics if the connection or the insert fails.
    pub fn add_post(post_id: i64, description: String, posted_date: NaiveDateTime) {
        let mut conn = establish_connection();
        let crawled_date = Utc::now().naive_utc();
        let new_post = Post {
            id: post_id,
            description,
            posted_date,
            crawled_date,
        };
        diesel::insert_into(post::table)
            .values(&new_post)
            .execute(&mut conn)
            .expect("Error adding the post");
    }

    /// Loads the post with the given id.
    ///
    /// # Panics
    ///
    /// Panics if the connection fails or no such post can be loaded.
    pub fn get_post(post_id: i64) -> Post {
        use crate::schema::post::dsl::*;
        let mut conn = establish_connection();
        post.find(post_id)
            .first(&mut conn)
            .expect("Error getting the post")
    }

    /// Loads the post with the given id together with its image and video URLs.
    ///
    /// # Panics
    ///
    /// Panics if the post or its media cannot be loaded.
    pub fn get_post_with_media(post_id: i64) -> PostWithMedia {
        let stored = get_post(post_id);
        attach_media(&stored)
    }

    /// Loads every post, ordered by ascending `crawled_date`, each with its
    /// image and video URLs.
    ///
    /// # Panics
    ///
    /// Panics if the posts or their media cannot be loaded.
    pub fn get_all_posts_with_media() -> Vec<PostWithMedia> {
        use crate::schema::post::dsl::*;
        let mut conn = establish_connection();
        let stored: Vec<Post> = post
            .order(crawled_date.asc())
            .load(&mut conn)
            .expect("Error getting the posts");
        stored.iter().map(attach_media).collect()
    }

    /// Converts a post and fills in the image and video URLs stored for it.
    fn attach_media(stored: &Post) -> PostWithMedia {
        let mut full = PostWithMedia::from_post(stored);
        full.images = images::get_images_strings(full.id);
        full.videos = videos::get_videos_strings(full.id);
        full
    }
}
/// Queries and inserts for the `image` table.
pub mod images {
    use crate::controllers::*;
    use crate::models::{Image, NewImage};
    use crate::schema::image;

    /// Inserts one image row through an already open connection.
    fn insert_image(conn: &mut SqliteConnection, post_id: i64, url: String) {
        let new_image = NewImage { post_id, url };
        diesel::insert_into(image::table)
            .values(&new_image)
            .execute(conn)
            .expect("Error adding the image");
    }

    /// Stores one image URL for the given post.
    ///
    /// # Panics
    ///
    /// Panics if the connection or the insert fails.
    pub fn add_image(post_id: i64, url: String) {
        let mut conn = establish_connection();
        insert_image(&mut conn, post_id, url);
    }

    /// Stores every URL in `urls` for the given post, in order.
    ///
    /// All rows go through a single connection instead of opening one per URL.
    ///
    /// # Panics
    ///
    /// Panics if the connection or any insert fails.
    pub fn add_images(post_id: i64, urls: Vec<String>) {
        // Nothing to store: do not even open a connection.
        if urls.is_empty() {
            return;
        }
        let mut conn = establish_connection();
        for url in urls {
            insert_image(&mut conn, post_id, url);
        }
    }

    /// Loads the image rows whose `post_id` equals `post`.
    ///
    /// # Panics
    ///
    /// Panics if the connection or the query fails.
    pub fn get_images(post: i64) -> Vec<Image> {
        use crate::schema::image::dsl::*;
        let mut conn = establish_connection();
        image
            .filter(post_id.eq(post))
            .select(Image::as_select())
            .load(&mut conn)
            .expect("Error getting the images")
    }

    /// Loads only the URLs of the images stored for `post`.
    pub fn get_images_strings(post: i64) -> Vec<String> {
        // The rows are owned, so the URLs can be moved out instead of cloned.
        get_images(post).into_iter().map(|row| row.url).collect()
    }
}
/// Queries and inserts for the `video` table.
pub mod videos {
    use crate::controllers::*;
    use crate::models::{NewVideo, Video};
    use crate::schema::video;

    /// Inserts one video row through an already open connection.
    fn insert_video(conn: &mut SqliteConnection, post_id: i64, url: String) {
        let new_video = NewVideo { post_id, url };
        diesel::insert_into(video::table)
            .values(&new_video)
            .execute(conn)
            .expect("Error adding the video");
    }

    /// Stores one video URL for the given post.
    ///
    /// # Panics
    ///
    /// Panics if the connection or the insert fails.
    pub fn add_video(post_id: i64, url: String) {
        let mut conn = establish_connection();
        insert_video(&mut conn, post_id, url);
    }

    /// Stores every URL in `urls` for the given post, in order.
    ///
    /// All rows go through a single connection instead of opening one per URL.
    ///
    /// # Panics
    ///
    /// Panics if the connection or any insert fails.
    pub fn add_videos(post_id: i64, urls: Vec<String>) {
        // Nothing to store: do not even open a connection.
        if urls.is_empty() {
            return;
        }
        let mut conn = establish_connection();
        for url in urls {
            insert_video(&mut conn, post_id, url);
        }
    }

    /// Loads the video rows whose `post_id` equals `post`.
    ///
    /// # Panics
    ///
    /// Panics if the connection or the query fails.
    pub fn get_videos(post: i64) -> Vec<Video> {
        use crate::schema::video::dsl::*;
        let mut conn = establish_connection();
        video
            .filter(post_id.eq(post))
            .select(Video::as_select())
            .load(&mut conn)
            .expect("Error getting the videos")
    }

    /// Loads only the URLs of the videos stored for `post`.
    pub fn get_videos_strings(post: i64) -> Vec<String> {
        // The rows are owned, so the URLs can be moved out instead of cloned.
        get_videos(post).into_iter().map(|row| row.url).collect()
    }
}

72
src/models.rs 100644
View File

@ -0,0 +1,72 @@
use chrono::NaiveDateTime;
use diesel::prelude::*;
use serde_derive::Serialize;
/// A row of the `post` table; used both for reading and for inserting.
#[derive(Identifiable, Queryable, Selectable, Insertable, Debug)]
#[diesel(table_name = crate::schema::post)]
#[diesel(check_for_backend(diesel::sqlite::Sqlite))]
pub struct Post {
/// Primary key, supplied by the caller on insert.
pub id: i64,
/// Text of the post.
pub description: String,
/// When the post was published.
pub posted_date: NaiveDateTime,
/// When the post was stored by the crawler.
pub crawled_date: NaiveDateTime,
}
/// A row of the `image` table as read from the database; belongs to a `Post`.
#[derive(Identifiable, Queryable, Selectable, Associations, Debug)]
#[diesel(belongs_to(Post))]
#[diesel(table_name = crate::schema::image)]
#[diesel(check_for_backend(diesel::sqlite::Sqlite))]
pub struct Image {
/// Primary key of the image row.
pub id: i32,
/// Id of the post this image is attached to.
pub post_id: i64,
/// Location of the image.
pub url: String,
}
/// Insert form of an `image` row: the same columns as `Image` minus `id`
/// (presumably generated by the database; confirm against the migration).
#[derive(Insertable, Associations, Debug)]
#[diesel(belongs_to(Post))]
#[diesel(table_name = crate::schema::image)]
pub struct NewImage {
/// Id of the post this image is attached to.
pub post_id: i64,
/// Location of the image.
pub url: String,
}
/// A row of the `video` table as read from the database; belongs to a `Post`.
#[derive(Identifiable, Queryable, Selectable, Associations, Debug)]
#[diesel(belongs_to(Post))]
#[diesel(table_name = crate::schema::video)]
#[diesel(check_for_backend(diesel::sqlite::Sqlite))]
pub struct Video {
/// Primary key of the video row.
pub id: i32,
/// Id of the post this video is attached to.
pub post_id: i64,
/// Location of the video.
pub url: String,
}
/// Insert form of a `video` row: the same columns as `Video` minus `id`,
/// which the `video` table declares as `AUTOINCREMENT`.
#[derive(Insertable, Associations, Debug)]
#[diesel(belongs_to(Post))]
#[diesel(table_name = crate::schema::video)]
pub struct NewVideo {
/// Id of the post this video is attached to.
pub post_id: i64,
/// Location of the video.
pub url: String,
}
/// Serializable view of a post: the columns of `Post` plus its media URLs.
#[derive(Serialize)]
pub struct PostWithMedia {
/// Id of the post.
pub id: i64,
/// Text of the post.
pub description: String,
/// When the post was published.
pub posted_date: NaiveDateTime,
/// When the post was stored by the crawler.
pub crawled_date: NaiveDateTime,
/// URLs of the images attached to the post.
pub images: Vec<String>,
/// URLs of the videos attached to the post.
pub videos: Vec<String>,
}
impl PostWithMedia {
pub fn from_post(post: &Post) -> Self {
PostWithMedia {
id: post.id,
description: post.description.clone(),
posted_date: post.posted_date,
crawled_date: post.crawled_date,
images: vec![],
videos: vec![],
}
}
}

View File

@ -17,9 +17,19 @@ diesel::table! {
}
}
// Diesel description of the `video` table, keyed by `id`.
diesel::table! {
video (id) {
id -> Integer,
post_id -> BigInt,
url -> Text,
}
}
// `image` and `video` both join to `post` through their `post_id` column.
diesel::joinable!(image -> post (post_id));
diesel::joinable!(video -> post (post_id));
// Lets the three tables be used together in a single query.
diesel::allow_tables_to_appear_in_same_query!(
image,
post,
video,
);