├── .gitignore ├── src ├── util │ ├── pgs │ │ ├── mod.rs │ │ └── reindex.rs │ ├── lnx │ │ ├── mod.rs │ │ ├── post.rs │ │ └── reindex.rs │ ├── mod.rs │ └── interval_lock.rs ├── imageboard │ ├── mod.rs │ └── model.rs ├── storage │ ├── asagi │ │ ├── common.sql │ │ ├── storage │ │ │ ├── mod.rs │ │ │ ├── s3.rs │ │ │ └── filesystem.rs │ │ ├── triggers_v2.sql │ │ ├── thread.rs │ │ ├── stats.rs │ │ ├── db_metrics.rs │ │ ├── boards.sql │ │ └── triggers.sql │ ├── mod.rs │ ├── search_pg │ │ ├── arena.rs │ │ ├── posts.sql │ │ ├── builder.rs │ │ └── mod.rs │ └── search_lnx │ │ ├── posts.json │ │ ├── post.rs │ │ ├── builder.rs │ │ └── mod.rs ├── feed │ ├── feed.rs │ ├── mod.rs │ └── feed_all.rs ├── config.rs ├── api.rs └── main.rs ├── torako.service ├── LICENSE ├── ASAGI.md ├── Cargo.toml ├── README.md ├── .github └── workflows │ └── release.yml └── Torako.sample.toml /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Torako.toml -------------------------------------------------------------------------------- /src/util/pgs/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "pgs-reindex")] 2 | mod reindex; 3 | 4 | #[cfg(feature = "pgs-reindex")] 5 | pub use reindex::reindex; 6 | -------------------------------------------------------------------------------- /src/imageboard/mod.rs: -------------------------------------------------------------------------------- 1 | mod board_stream; 2 | mod model; 3 | 4 | pub use board_stream::{BoardStream, Metrics}; 5 | pub use model::{CatalogPage, CatalogThread, Post, Thread}; 6 | -------------------------------------------------------------------------------- /src/util/lnx/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "lnx-reindex")] 2 | mod reindex; 3 | #[cfg(feature = "lnx-reindex")] 4 | pub mod post; 5 | 6 | #[cfg(feature = "lnx-reindex")] 7 | pub use reindex::reindex; 8 | -------------------------------------------------------------------------------- /src/storage/asagi/common.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `index_counters` ( 2 | `id` varchar(50) NOT NULL, 3 | `val` int(10) NOT NULL, 4 | PRIMARY KEY (`id`) 5 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -------------------------------------------------------------------------------- /src/storage/asagi/storage/mod.rs: -------------------------------------------------------------------------------- 1 | use super::Error; 2 | 3 | mod backblaze; 4 | mod filesystem; 5 | mod s3; 6 | 7 | pub use backblaze::Backblaze; 8 | pub use filesystem::FileSystem; 9 | pub use s3::S3; 10 | -------------------------------------------------------------------------------- /src/util/mod.rs: -------------------------------------------------------------------------------- 1 | use clap::ArgMatches; 2 | 3 | pub mod interval_lock; 4 | pub mod pgs; 5 | pub mod lnx; 6 | 7 | pub fn boo<'a>(_: &ArgMatches<'a>) -> i32 { 8 | println!("boo"); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /torako.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Torako Imageboard Archiver 3 | Documentation=http://github.com/miyachan/torako 4 | After=network.target 5 | 6 | [Service] 7 | Type=simple 8 | ExecStart=/usr/local/bin/torako -c /etc/Torako.toml 9 | Restart=on-failure 10 | 
RestartSec=30 11 | TimeoutStopSec=180 12 | 13 | [Install] 14 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /src/storage/mod.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | 3 | pub mod asagi; 4 | pub mod search_lnx; 5 | pub mod search_pg; 6 | 7 | pub trait MetricsProvider: Sync + Send { 8 | fn name(&self) -> &'static str; 9 | fn metrics( 10 | &self, 11 | ) -> Pin<Box<dyn std::future::Future<Output = Box<dyn erased_serde::Serialize + Send>> + Send>>; 12 | } 13 | -------------------------------------------------------------------------------- /src/storage/search_pg/arena.rs: -------------------------------------------------------------------------------- 1 | use std::cell::UnsafeCell; 2 | 3 | pub struct Arena<T> { 4 | // For soundness, maybe inner should be protected by a Mutex? 5 | // ex. How do you prevent Arena from being mutated across threads? 6 | inner: UnsafeCell<Vec<T>>, 7 | } 8 | 9 | impl<T> Arena<T> { 10 | pub fn new(capacity: usize) -> Self { 11 | Self { 12 | inner: UnsafeCell::new(Vec::with_capacity(capacity)), 13 | } 14 | } 15 | 16 | pub fn alloc<'a>(&'a self, v: T) -> &'a T { 17 | let inner = unsafe { &mut *self.inner.get() }; 18 | if inner.len() >= inner.capacity() { 19 | panic!("'rena is in a bad state. The 'rena must be explicitly sized to hold no more than the number of objects it was allocated with.") 20 | } 21 | inner.push(v); 22 | let len = inner.len(); 23 | &inner[len - 1] 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/storage/asagi/triggers_v2.sql: -------------------------------------------------------------------------------- 1 | DROP TRIGGER IF EXISTS `before_ins_%%BOARD%%`; 2 | DROP TRIGGER IF EXISTS `after_ins_%%BOARD%%`; 3 | 4 | DROP PROCEDURE IF EXISTS `delete_thread_%%BOARD%%`; 5 | 6 | CREATE PROCEDURE `delete_thread_%%BOARD%%` (tnum INT) 7 | BEGIN 8 | DELETE FROM `%%BOARD%%_threads` WHERE thread_num = tnum; 9 | END; 10 | 11 | DROP PROCEDURE IF EXISTS `delete_image_%%BOARD%%`; 12 | 13 | CREATE PROCEDURE `delete_image_%%BOARD%%` (n_media_id INT) 14 | BEGIN 15 | UPDATE `%%BOARD%%_images` SET total = (total - 1) WHERE media_id = n_media_id; 16 | END; 17 | 18 | 19 | DROP TRIGGER IF EXISTS `after_del_%%BOARD%%`; 20 | 21 | CREATE TRIGGER `after_del_%%BOARD%%` AFTER DELETE ON `%%BOARD%%` 22 | FOR EACH ROW 23 | BEGIN 24 | CALL update_thread_%%BOARD%%(OLD.thread_num, OLD.subnum, OLD.timestamp, OLD.media_hash, OLD.email); 25 | IF OLD.op = 1 THEN 26 | CALL delete_thread_%%BOARD%%(OLD.num); 27 | END IF; 28 | IF OLD.media_hash IS NOT NULL THEN 29 | CALL delete_image_%%BOARD%%(OLD.media_id); 30 | END IF; 31 | END; -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors.
We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <https://unlicense.org> 25 | -------------------------------------------------------------------------------- /ASAGI.md: -------------------------------------------------------------------------------- 1 | # Asagi Modifications 2 | 3 | This document notes the changes made to a basic Asagi install. See [https://archive.wakarimasen.co/_/articles/about/](https://archive.wakarimasen.co/_/articles/about/). 4 | 5 | ## Schema Modifications 6 | 7 | ### New Columns 8 | 9 | The original Asagi implementation had a `timestamp` column that stored seconds since the UNIX Epoch, but computed the value in the America/New_York timezone. Torako continues this, but that timestamp has the problem that posts can lose their proper ordering around DST clock changes. By default on table creation, and whenever it detects a `unix_timestamp` column (of type `TIMESTAMP`), Torako will also write the time of the post in seconds since the UNIX Epoch in UTC. 10 | 11 | ### Dropped Indexes 12 | 13 | It's safe to drop some indexes; having a lot of indexes can cause memory and disk usage to blow up, and slow down inserts. They are a waste if they are never used. FoolFuuka doesn't seem to use these indexes (or in some other cases the indexes are worthless because other WHERE conditions make the result set relatively small, or the search is better served by the search index). The sizes below are for a table with more than 100M rows. 14 | 15 | | Table | Index Name | Size (MiB) | 16 | | -- | -- | -- | 17 | | %%BOARD%% | subnum_index | 1462.00 | 18 | | %%BOARD%% | op_index | 1015.00 | 19 | | %%BOARD%% | media_hash_index | 2770.69 | 20 | | %%BOARD%% | media_orig_index | 2379.94 | 21 | | %%BOARD%% | name_trip_index | 2512.94 | 22 | | %%BOARD%% | trip_index | 1309.91 | 23 | | %%BOARD%% | email_index | 1208.00 | 24 | | %%BOARD%% | timestamp_index | 2821.95 | -------------------------------------------------------------------------------- /src/feed/feed.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | 3 | use futures::future::Future; 4 | use futures::ready; 5 | use futures::sink::Sink; 6 | use futures::task::{Context, Poll}; 7 | 8 | /// Future for the [`feed`](super::SinkExt::feed) method.
9 | #[derive(Debug)] 10 | #[must_use = "futures do nothing unless you `.await` or poll them"] 11 | pub struct Feed<'a, Si: ?Sized, Item> { 12 | sink: &'a mut Si, 13 | item: Option<Item>, 14 | } 15 | 16 | // Pinning is never projected to children 17 | impl<Si: Unpin + ?Sized, Item> Unpin for Feed<'_, Si, Item> {} 18 | 19 | impl<'a, Si: Sink<Item> + Unpin + ?Sized, Item> Feed<'a, Si, Item> { 20 | pub(super) fn new(sink: &'a mut Si, item: Item) -> Self { 21 | Feed { 22 | sink, 23 | item: Some(item), 24 | } 25 | } 26 | 27 | #[allow(dead_code)] 28 | pub(super) fn sink_pin_mut(&mut self) -> Pin<&mut Si> { 29 | Pin::new(self.sink) 30 | } 31 | 32 | #[allow(dead_code)] 33 | pub(super) fn is_item_pending(&self) -> bool { 34 | self.item.is_some() 35 | } 36 | } 37 | 38 | impl<Si: Sink<Item> + Unpin + ?Sized, Item> Future for Feed<'_, Si, Item> { 39 | type Output = Result<(), Si::Error>; 40 | 41 | fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> { 42 | let this = &mut *self; 43 | let mut sink = Pin::new(&mut this.sink); 44 | ready!(sink.as_mut().poll_ready(cx))?; 45 | let item = this.item.take().expect("polled Feed after completion"); 46 | sink.as_mut().start_send(item)?; 47 | Poll::Ready(Ok(())) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/feed/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * This implementation of feed is extracted from: 3 | * https://github.com/rust-lang/futures-rs/pull/2155 4 | */ 5 | use futures::sink::Sink; 6 | use futures::stream::{Stream, TryStream}; 7 | 8 | mod feed; 9 | pub use self::feed::Feed; 10 | 11 | mod feed_all; 12 | pub use self::feed_all::FeedAll; 13 | 14 | impl<T: ?Sized, Item> FeedSinkExt<Item> for T where T: Sink<Item> {} 15 | 16 | pub trait FeedSinkExt<Item>: Sink<Item> { 17 | /// A future that completes after the given item has been received 18 | /// by the sink. 19 | /// 20 | /// Unlike `send`, the returned future does not flush the sink. 21 | /// It is the caller's responsibility to ensure all pending items 22 | /// are processed, which can be done via `flush` or `close`. 23 | fn feed(&mut self, item: Item) -> Feed<'_, Self, Item> 24 | where 25 | Self: Unpin, 26 | { 27 | Feed::new(self, item) 28 | } 29 | 30 | /// A future that completes after the given stream has been fully received 31 | /// by the sink. 32 | /// 33 | /// This future will drive the stream to keep producing items until it is 34 | /// exhausted, sending each item to the sink. It will complete once the 35 | /// stream is exhausted and the sink has received all items. 36 | /// Note that the sink is **not** closed. 37 | /// 38 | /// Unlike `send_all`, the returned future does not fully flush the sink 39 | /// before completion. 40 | /// It is the caller's responsibility to ensure all pending items 41 | /// are processed, which can be done via `flush` or `close`.
42 | fn feed_all<'a, St>(&'a mut self, stream: &'a mut St) -> FeedAll<'a, Self, St> 43 | where 44 | St: TryStream<Ok = Item, Error = Self::Error> + Stream + Unpin + ?Sized, 45 | Self: Unpin, 46 | { 47 | FeedAll::new(self, stream) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/storage/search_pg/posts.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS posts ( 2 | board VARCHAR(8) NOT NULL, 3 | thread_no BIGINT NOT NULL, 4 | post_no BIGINT NOT NULL, 5 | subject TSVECTOR, 6 | username TSVECTOR, 7 | tripcode TSVECTOR, 8 | email TSVECTOR, 9 | unique_id TEXT, 10 | since4_pass INT, 11 | country TEXT, 12 | filename TSVECTOR, 13 | image_hash TEXT, 14 | image_width INT, 15 | image_height INT, 16 | ts TIMESTAMP, 17 | comment TSVECTOR, 18 | deleted bool, 19 | ghost bool, 20 | sticky bool, 21 | spoiler bool, 22 | op bool, 23 | capcode INT, 24 | PRIMARY KEY (board, post_no) 25 | ); 26 | 27 | CREATE INDEX IF NOT EXISTS posts_thread ON posts (thread_no); 28 | CREATE INDEX IF NOT EXISTS posts_subject ON posts USING GIN (subject); 29 | CREATE INDEX IF NOT EXISTS posts_username ON posts USING GIN (username); 30 | CREATE INDEX IF NOT EXISTS posts_tripcode ON posts USING GIN (tripcode); 31 | CREATE INDEX IF NOT EXISTS posts_email ON posts USING GIN (email); 32 | CREATE INDEX IF NOT EXISTS posts_unqiue_id ON posts (unique_id); 33 | CREATE INDEX IF NOT EXISTS posts_since4pass ON posts (since4_pass); 34 | CREATE INDEX IF NOT EXISTS posts_country ON posts (country); 35 | CREATE INDEX IF NOT EXISTS posts_filename ON posts USING GIN (filename); 36 | CREATE INDEX IF NOT EXISTS posts_im_hash ON posts (image_hash); 37 | CREATE INDEX IF NOT EXISTS posts_im_w ON posts (image_width); 38 | CREATE INDEX IF NOT EXISTS posts_im_h ON posts (image_height); 39 | CREATE INDEX IF NOT EXISTS posts_ts ON posts (ts DESC); 40 | CREATE INDEX IF NOT EXISTS posts_com ON posts USING GIN (comment); 41 | CREATE INDEX IF NOT EXISTS posts_deleted ON posts (deleted); 42 | CREATE INDEX IF NOT EXISTS posts_ghost ON posts (ghost); 43 | CREATE INDEX IF NOT EXISTS posts_sticky ON posts (sticky); 44 | CREATE INDEX IF NOT EXISTS posts_spoiler ON posts (spoiler); 45 | CREATE INDEX IF NOT EXISTS posts_op ON posts (op); 46 | CREATE INDEX IF NOT EXISTS posts_capcode ON posts (capcode); -------------------------------------------------------------------------------- /src/storage/asagi/thread.rs: -------------------------------------------------------------------------------- 1 | use crate::imageboard::Post; 2 | 3 | #[derive(Debug, Clone)] 4 | pub(super) struct Thread { 5 | pub(super) thread_num: u64, 6 | pub(super) time_op: Option<u64>, 7 | pub(super) time_last: u64, 8 | pub(super) time_bump: u64, 9 | pub(super) time_ghost: u64, 10 | pub(super) time_ghost_bump: u64, 11 | pub(super) time_last_modified: u64, 12 | pub(super) n_replies: u64, 13 | pub(super) n_images: u64, 14 | pub(super) sticky: Option<bool>, 15 | pub(super) locked: Option<bool>, 16 | } 17 | 18 | impl Thread { 19 | pub(super) fn new(thread_num: u64) -> Self { 20 | Self { 21 | thread_num, 22 | time_op: None, 23 | time_last: 0, 24 | time_bump: 0, 25 | time_ghost: 0, 26 | time_ghost_bump: 0, 27 | time_last_modified: 0, 28 | n_replies: 0, 29 | n_images: 0, 30 | sticky: None, 31 | locked: None, 32 | } 33 | } 34 | 35 | pub(super) fn update(&mut self, post: &Post) { 36 | let post_timestamp = post.nyc_timestamp() as u64; 37 | if post.is_op() { 38 | self.time_op = Some(post_timestamp); 39 | // self.sticky = Some(post.sticky); 40
| // self.locked = Some(post.closed); 41 | } 42 | self.time_last = self.time_last.max(post_timestamp); 43 | self.time_last_modified = self.time_last; 44 | if post.email.as_ref().map(|e| e != "sage").unwrap_or(true) { 45 | self.time_bump = self.time_bump.max(post_timestamp); 46 | } 47 | 48 | self.n_replies += 1; 49 | self.n_images += match post.filename { 50 | Some(_) => 1, 51 | None => 0, 52 | }; 53 | } 54 | } 55 | 56 | #[derive(Debug, Clone)] 57 | pub(super) struct Media { 58 | pub(super) media_id: u64, 59 | pub(super) media_hash: String, 60 | pub(super) media: Option<String>, 61 | pub(super) preview_op: Option<String>, 62 | pub(super) preview_reply: Option<String>, 63 | pub(super) total: u64, 64 | pub(super) banned: bool, 65 | } 66 | -------------------------------------------------------------------------------- /src/storage/asagi/stats.rs: -------------------------------------------------------------------------------- 1 | use crate::imageboard::Post; 2 | 3 | pub(super) struct Daily { 4 | pub(super) day: u64, 5 | pub(super) posts: u64, 6 | pub(super) images: u64, 7 | pub(super) sage: u64, 8 | pub(super) anons: u64, 9 | pub(super) trips: u64, 10 | pub(super) names: u64, 11 | } 12 | 13 | impl Daily { 14 | pub(super) fn new(day: u64) -> Self { 15 | Self { 16 | day, 17 | posts: 0, 18 | images: 0, 19 | sage: 0, 20 | anons: 0, 21 | trips: 0, 22 | names: 0, 23 | } 24 | } 25 | 26 | pub(super) fn update(&mut self, post: &Post) { 27 | self.posts += 1; 28 | if post.filename.is_some() { 29 | self.images += 1; 30 | } 31 | if post.email.as_ref().map(|e| e == "sage").unwrap_or(false) { 32 | self.sage += 1; 33 | } 34 | if post 35 | .name 36 | .as_ref() 37 | .map(|n| n == "Anonymous") 38 | .unwrap_or(false) 39 | && post.trip.is_none() 40 | { 41 | self.anons += 1; 42 | } 43 | if post.trip.is_some() { 44 | self.trips += 1; 45 | } 46 | if post 47 | .name 48 | .as_ref() 49 | .map(|n| n != "Anonymous" && !n.is_empty()) 50 | .unwrap_or(false) 51 | { 52 | self.names += 1; 53 | } 54 | } 55 | } 56 | 57 | pub(super) struct User { 58 | pub(super) name: String, 59 | pub(super) trip: String, 60 | pub(super) first_seen: u64, 61 | pub(super) post_count: i64, 62 | } 63 | 64 | impl User { 65 | pub(super) fn new(name: Option<String>, trip: Option<String>, first_seen: u64) -> Self { 66 | Self { 67 | name: name.unwrap_or(String::from("")), 68 | trip: trip.unwrap_or(String::from("")), 69 | first_seen, 70 | post_count: 0, 71 | } 72 | } 73 | 74 | pub(super) fn update(&mut self, post: &Post) { 75 | self.post_count += 1; 76 | self.first_seen = self.first_seen.min(post.nyc_timestamp()); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "torako" 3 | version = "0.11.3-alpha.0" 4 | authors = ["miyachan "] 5 | edition = "2018" 6 | repository = "https://github.com/miyachan/torako" 7 | publish = false 8 | 9 | [features] 10 | default = ["jemalloc"] 11 | jemalloc = ["jemallocator"] 12 | trust-dns = ["reqwest/trust-dns"] 13 | io-uring = ["rio"] 14 | pgs-reindex = ["sqlx", "num_cpus", "indicatif"] 15 | lnx-reindex = ["sqlx", "num_cpus", "indicatif"] 16 | 17 | [dependencies] 18 | clap = "2" 19 | log = "0.4" 20 | tokio = { version = "0.2.23", features = ["full"] } 21 | serde = { version = "1", features = ["derive", "rc"] } 22 | serde_json = "1" 23 | futures = "0.3" 24 | reqwest = { version = "0.10.7", default-features = false, features = ["json", "stream", "gzip", "socks", "rustls-tls"] } 25 | thiserror = "1" 26
| toml = "0.5" 27 | rand = "0.7" 28 | governor = "0.3" 29 | mysql_async = "0.24" 30 | smallstr = { version = "0.2", features = ["serde"] } 31 | chrono = "0.4" 32 | chrono-tz = "0.5" 33 | regex = "1" 34 | lazy_static = "1" 35 | htmlescape = "0.3" 36 | parking_lot = "0.11" 37 | backoff = "0.2" 38 | humantime-serde = "1" 39 | humantime = "2" 40 | url = { version = "2", features = ["serde"] } 41 | ctrlc = { version = "3", features = ["termination"] } 42 | erased-serde = "0.3" 43 | warp = { version = "0.2", default-features = false } 44 | pretty_env_logger = "0.4" 45 | mime_guess = "2" 46 | rusoto_s3 = "0.45" 47 | bytes = "0.5" 48 | rusoto_core = "0.45" 49 | void = "1" 50 | tokio-postgres = { version = "0.5", features = ["with-chrono-0_4"] } 51 | deadpool-postgres = { version = "0.5", default-features = false } 52 | percent-encoding = "2" 53 | indicatif = { version = "0.15", optional = true } 54 | sqlx = { version = "0.4.0-beta.1", optional = true, default-features = false, features = ["runtime-tokio", "mysql", "chrono", "macros"] } 55 | num_cpus = { version = "1", optional = true } 56 | probabilistic-collections = { version = "0.7", features = ["serde"] } 57 | bincode = "1" 58 | memchr = "2" 59 | sha2 = "0.9" 60 | hex = "0.4" 61 | seahash = "4" 62 | 63 | [target.'cfg(target_os = "linux")'.dependencies] 64 | rio = { version = "0.9", optional = true } 65 | 66 | [target.'cfg(not(target_env = "msvc"))'.dependencies] 67 | jemallocator = { version = "0.3", optional = true } 68 | 69 | [target.'cfg(not(target_family = "windows"))'.dependencies] 70 | users = "0.10" 71 | nix = "0.18" 72 | get_if_addrs = "0.5" 73 | -------------------------------------------------------------------------------- /src/util/interval_lock.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | use std::sync::Arc; 3 | use std::task::{Context, Poll}; 4 | 5 | use futures::prelude::*; 6 | use futures::task::AtomicWaker; 7 | use parking_lot::Mutex; 8 | 9 | #[derive(Default)] 10 | pub struct IntervalLock { 11 | inner: Arc<IntervalLockInner>, 12 | } 13 | 14 | #[derive(Default, Debug)] 15 | struct IntervalLockInner { 16 | leased: Mutex<Vec<(u64, u64)>>, 17 | wakers: Mutex<Vec<Arc<AtomicWaker>>>, 18 | } 19 | 20 | #[must_use] 21 | #[derive(Debug)] 22 | pub struct IntervalLockGuard { 23 | parent: Arc<IntervalLockInner>, 24 | range: (u64, u64), 25 | } 26 | 27 | struct AcquireRange { 28 | parent: Arc<IntervalLockInner>, 29 | range: (u64, u64), 30 | waker: Arc<AtomicWaker>, 31 | } 32 | 33 | impl IntervalLockInner { 34 | fn try_acquire(self: &Arc<Self>, range: (u64, u64)) -> Option<IntervalLockGuard> { 35 | let mut ranges = self.leased.lock(); 36 | let has_overlap = ranges 37 | .iter() 38 | .any(|borrowed| range.0 <= borrowed.1 && borrowed.0 <= range.1); 39 | if has_overlap { 40 | None 41 | } else { 42 | ranges.push(range); 43 | Some(IntervalLockGuard { 44 | parent: self.clone(), 45 | range, 46 | }) 47 | } 48 | } 49 | 50 | fn release(&self, range: (u64, u64)) { 51 | let mut ranges = self.leased.lock(); 52 | if let Some(pos) = ranges.iter().position(|x| *x == range) { 53 | ranges.remove(pos); 54 | } 55 | drop(ranges); 56 | self.wakers.lock().iter().for_each(|w| w.wake()); 57 | } 58 | } 59 | 60 | impl IntervalLock { 61 | pub async fn acquire(&self, range: (u64, u64)) -> IntervalLockGuard { 62 | match self.inner.try_acquire(range) { 63 | Some(g) => g, 64 | None => { 65 | let waker = Arc::new(AtomicWaker::new()); 66 | self.inner.wakers.lock().push(waker.clone()); 67 | AcquireRange { 68 | parent: self.inner.clone(), 69 | range, 70 | waker, 71 | } 72 | .await 73 | } 74 | } 75 | } 76 | } 77 | 78 | impl Future for
AcquireRange { 79 | type Output = IntervalLockGuard; 80 | 81 | fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> { 82 | match self.parent.try_acquire(self.range) { 83 | Some(g) => { 84 | let mut wakers = self.parent.wakers.lock(); 85 | if let Some(pos) = wakers.iter().position(|x| Arc::ptr_eq(&x, &self.waker)) { 86 | wakers.remove(pos); 87 | } 88 | Poll::Ready(g) 89 | } 90 | None => { 91 | self.waker.register(cx.waker()); 92 | Poll::Pending 93 | } 94 | } 95 | } 96 | } 97 | 98 | impl Drop for IntervalLockGuard { 99 | fn drop(&mut self) { 100 | self.parent.release(self.range) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/util/lnx/post.rs: -------------------------------------------------------------------------------- 1 | use serde::{Serialize, Serializer}; 2 | 3 | pub fn as_u32_be(array: &[u8; 4]) -> u32 { 4 | ((array[0] as u32) << 24) + 5 | ((array[1] as u32) << 16) + 6 | ((array[2] as u32) << 8) + 7 | ((array[3] as u32) << 0) 8 | } 9 | 10 | fn field_ser<T: Serialize, S>(x: T, s: S) -> Result<S::Ok, S::Error> 11 | where 12 | S: Serializer, 13 | { 14 | [&x].serialize(s) 15 | } 16 | 17 | #[derive(Serialize, Clone, Debug)] 18 | pub struct Post<'a> { 19 | #[serde(serialize_with = "field_ser")] 20 | pub board: &'a str, 21 | #[serde(serialize_with = "field_ser")] 22 | pub thread_no: u64, 23 | #[serde(serialize_with = "field_ser")] 24 | pub post_no: u64, 25 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 26 | pub subject: Option<&'a str>, 27 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 28 | pub username: Option<&'a str>, 29 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 30 | pub tripcode: Option<&'a str>, 31 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 32 | pub email: Option<&'a str>, 33 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 34 | pub unique_id: Option<&'a str>, 35 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 36 | pub since4_pass: Option<u64>, 37 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 38 | pub country: Option<&'a str>, 39 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 40 | pub filename: Option<&'a str>, 41 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 42 | pub image_hash: Option<&'a str>, 43 | #[serde(serialize_with = "field_ser")] 44 | pub image_width: u64, 45 | #[serde(serialize_with = "field_ser")] 46 | pub image_height: u64, 47 | #[serde(serialize_with = "field_ser")] 48 | pub ts: u64, 49 | #[serde(serialize_with = "field_ser")] 50 | pub tsr: u64, 51 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 52 | pub comment: Option<&'a str>, 53 | #[serde(serialize_with = "field_ser")] 54 | pub deleted: u64, 55 | #[serde(serialize_with = "field_ser")] 56 | pub ghost: u64, 57 | #[serde(serialize_with = "field_ser")] 58 | pub sticky: u64, 59 | #[serde(serialize_with = "field_ser")] 60 | pub spoiler: u64, 61 | #[serde(serialize_with = "field_ser")] 62 | pub op: u64, 63 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 64 | pub capcode: Option<u64>, 65 | #[serde(serialize_with = "field_ser")] 66 | pub version: u64, 67 | #[serde(serialize_with = "field_ser")] 68 | pub tuid: u64, 69 | } 70 | 71 | #[derive(Debug, Serialize)] 72 | pub struct DeletePostDoc { 73 | tuid: u64 74 | } 75 | 76 |
#[derive(Debug, Serialize)] 77 | pub struct DeletePost(Vec<DeletePostDoc>); 78 | 79 | impl DeletePost { 80 | pub fn new(ids: Vec<u64>) -> Self { 81 | Self(ids.into_iter().map(|id| DeletePostDoc{ tuid: id }).collect()) 82 | } 83 | } -------------------------------------------------------------------------------- /src/feed/feed_all.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::pin::Pin; 3 | 4 | use futures::future::Future; 5 | use futures::ready; 6 | use futures::sink::Sink; 7 | use futures::stream::TryStreamExt; 8 | use futures::stream::{Stream, TryStream}; 9 | use futures::task::{Context, Poll}; 10 | 11 | /// Future for the [`feed_all`](super::SinkExt::feed_all) method. 12 | #[allow(explicit_outlives_requirements)] // https://github.com/rust-lang/rust/issues/60993 13 | #[must_use = "futures do nothing unless you `.await` or poll them"] 14 | pub struct FeedAll<'a, Si, St> 15 | where 16 | Si: ?Sized, 17 | St: ?Sized + TryStream, 18 | { 19 | sink: &'a mut Si, 20 | stream: &'a mut St, 21 | buffered: Option<St::Ok>, 22 | } 23 | 24 | impl<Si, St> fmt::Debug for FeedAll<'_, Si, St> 25 | where 26 | Si: fmt::Debug + ?Sized, 27 | St: fmt::Debug + ?Sized + TryStream, 28 | St::Ok: fmt::Debug, 29 | { 30 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 31 | f.debug_struct("FeedAll") 32 | .field("sink", &self.sink) 33 | .field("stream", &self.stream) 34 | .field("buffered", &self.buffered) 35 | .finish() 36 | } 37 | } 38 | 39 | // Pinning is never projected to any fields 40 | impl<Si, St> Unpin for FeedAll<'_, Si, St> 41 | where 42 | Si: Unpin + ?Sized, 43 | St: TryStream + Unpin + ?Sized, 44 | { 45 | } 46 | 47 | impl<'a, Si, St, Ok, Error> FeedAll<'a, Si, St> 48 | where 49 | Si: Sink<Ok, Error = Error> + Unpin + ?Sized, 50 | St: TryStream<Ok = Ok, Error = Error> + Stream + Unpin + ?Sized, 51 | { 52 | pub(super) fn new(sink: &'a mut Si, stream: &'a mut St) -> FeedAll<'a, Si, St> { 53 | FeedAll { 54 | sink, 55 | stream, 56 | buffered: None, 57 | } 58 | } 59 | 60 | #[allow(dead_code)] 61 | pub(super) fn sink_pin_mut(&mut self) -> Pin<&mut Si> { 62 | Pin::new(self.sink) 63 | } 64 | 65 | fn try_start_send( 66 | &mut self, 67 | cx: &mut Context<'_>, 68 | item: St::Ok, 69 | ) -> Poll<Result<(), Si::Error>> { 70 | debug_assert!(self.buffered.is_none()); 71 | match Pin::new(&mut self.sink).poll_ready(cx)? { 72 | Poll::Ready(()) => Poll::Ready(Pin::new(&mut self.sink).start_send(item)), 73 | Poll::Pending => { 74 | self.buffered = Some(item); 75 | Poll::Pending 76 | } 77 | } 78 | } 79 | } 80 | 81 | impl<Si, St, Ok, Error> Future for FeedAll<'_, Si, St> 82 | where 83 | Si: Sink<Ok, Error = Error> + Unpin + ?Sized, 84 | St: Stream<Item = Result<Ok, Error>> + Unpin + ?Sized, 85 | { 86 | type Output = Result<(), Error>; 87 | 88 | fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> { 89 | let this = &mut *self; 90 | // If we've got an item buffered already, we need to write it to the 91 | // sink before we can do anything else 92 | if let Some(item) = this.buffered.take() { 93 | ready!(this.try_start_send(cx, item))? 94 | } 95 | 96 | loop { 97 | match this.stream.try_poll_next_unpin(cx)?
{ 98 | Poll::Ready(Some(item)) => ready!(this.try_start_send(cx, item))?, 99 | Poll::Ready(None) => return Poll::Ready(Ok(())), 100 | Poll::Pending => { 101 | ready!(Pin::new(&mut this.sink).poll_flush(cx))?; 102 | return Poll::Pending; 103 | } 104 | } 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/storage/asagi/db_metrics.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::sync::Arc; 3 | use std::time::{Duration, Instant}; 4 | 5 | use futures::prelude::*; 6 | use mysql_async::prelude::*; 7 | use rustc_hash::FxHashMap; 8 | use serde::Serialize; 9 | 10 | pub struct MaybeMetric<T> { 11 | metric: Option<T>, 12 | modified: Instant, 13 | lock: Arc<futures::lock::Mutex<()>>, 14 | } 15 | 16 | impl<T> Default for MaybeMetric<T> { 17 | fn default() -> Self { 18 | MaybeMetric { 19 | metric: None, 20 | modified: Instant::now() - Duration::from_secs(300), 21 | lock: Arc::new(futures::lock::Mutex::new(())), 22 | } 23 | } 24 | } 25 | 26 | thread_local! { 27 | static POST_STATS: RefCell<MaybeMetric<Arc<FxHashMap<&'static str, u64>>>> = RefCell::new(MaybeMetric::default()); 28 | } 29 | 30 | #[derive(Debug, Serialize, Clone)] 31 | pub struct DatabaseMetrics { 32 | posts: Arc<FxHashMap<&'static str, u64>>, 33 | } 34 | 35 | pub(super) async fn database_metrics( 36 | asagi: Arc, 37 | boards: Vec<&'static str>, 38 | ) -> Result<DatabaseMetrics, super::Error> { 39 | loop { 40 | let (ps, lock) = POST_STATS.with(|ps| { 41 | let stats = ps.borrow(); 42 | let last_update = stats.modified; 43 | match &stats.metric { 44 | Some(stats) if last_update.elapsed() < Duration::from_secs(60) => { 45 | (Some(stats.clone()), None) 46 | } 47 | _ => (None, Some(stats.lock.clone())), 48 | } 49 | }); 50 | if let Some(ps) = ps { 51 | return Ok(DatabaseMetrics { posts: ps }); 52 | } 53 | let lock = lock.unwrap(); 54 | let guard = lock.try_lock(); 55 | if guard.is_none() { 56 | drop(lock.lock().await); 57 | continue; 58 | }; 59 | 60 | // https://github.com/rust-lang/rust/issues/64552#issuecomment-669728225 61 | let stats: std::pin::Pin<Box<dyn Stream<Item = Result<(&'static str, u64), super::Error>> + Send>> = Box::pin( 62 | futures::stream::iter(boards.iter()) 63 | .map(|board| { 64 | let conn = asagi.direct_db_pool.get_conn(); 65 | async move { 66 | let mut conn = conn.await?; 67 | let sz: Option<u64> = conn 68 | .query_first(format!("SELECT COUNT(*) FROM `{}`", board)) 69 | .await?; 70 | 71 | Ok::<_, super::Error>((*board, sz.unwrap())) 72 | } 73 | }) 74 | .buffer_unordered(usize::MAX), 75 | ); 76 | 77 | let fut = stats.try_collect::<FxHashMap<_, _>>(); 78 | let stats = match tokio::time::timeout(Duration::from_secs(15), fut).await { 79 | Ok(f) => { 80 | let mut stats = f?; 81 | let total: u64 = stats.iter().map(|x| x.1).copied().sum(); 82 | stats.insert("_total", total); 83 | stats 84 | } 85 | Err(_) => { 86 | let mut stats = FxHashMap::default(); 87 | stats.insert("_timedout", 1); 88 | stats 89 | } 90 | }; 91 | 92 | POST_STATS.with(move |ps| { 93 | let mut l = ps.borrow_mut(); 94 | l.metric = Some(Arc::new(stats)); 95 | l.modified = Instant::now(); 96 | }); 97 | drop(guard); 98 | continue; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/storage/search_lnx/posts.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "%%NAME%%", 3 | 4 | "writer_buffer": 144000000, 5 | "writer_threads": 6, 6 | "reader_threads": 6, 7 | 8 | "max_concurrency": 12, 9 | "search_fields": [ 10 | "comment" 11 | ], 12 | 13 | "storage_type": "filesystem", 14 | "set_conjunction_by_default": true, 15 | "use_fast_fuzzy": false, 16 |
"strip_stop_words": false, 17 | 18 | "fields": { 19 | "board": { 20 | "type": "text", 21 | "stored": true 22 | }, 23 | "thread_no": { 24 | "type": "u64", 25 | "stored": true, 26 | "indexed": true, 27 | "fast": "single" 28 | }, 29 | "post_no": { 30 | "type": "u64", 31 | "stored": true, 32 | "indexed": true, 33 | "fast": "single" 34 | }, 35 | "subject": { 36 | "type": "text", 37 | "stored": false 38 | }, 39 | "username": { 40 | "type": "text", 41 | "stored": false 42 | }, 43 | "tripcode": { 44 | "type": "text", 45 | "stored": false 46 | }, 47 | "email": { 48 | "type": "text", 49 | "stored": false 50 | }, 51 | "unique_id": { 52 | "type": "text", 53 | "stored": false 54 | }, 55 | "since4_pass": { 56 | "type": "u64", 57 | "stored": false, 58 | "indexed": true, 59 | "fast": "single" 60 | }, 61 | "country": { 62 | "type": "text", 63 | "stored": false 64 | }, 65 | "filename": { 66 | "type": "text", 67 | "stored": false 68 | }, 69 | "image_hash": { 70 | "type": "text", 71 | "stored": false 72 | }, 73 | "image_width": { 74 | "type": "u64", 75 | "stored": false, 76 | "indexed": true, 77 | "fast": "single" 78 | }, 79 | "image_height": { 80 | "type": "u64", 81 | "stored": false, 82 | "indexed": true, 83 | "fast": "single" 84 | }, 85 | "ts": { 86 | "type": "u64", 87 | "stored": false, 88 | "indexed": true, 89 | "fast": "single" 90 | }, 91 | "tsr": { 92 | "type": "u64", 93 | "stored": false, 94 | "indexed": true, 95 | "fast": "single" 96 | }, 97 | "comment": { 98 | "type": "text", 99 | "stored": false 100 | }, 101 | "deleted": { 102 | "type": "u64", 103 | "stored": false, 104 | "indexed": true, 105 | "fast": "single" 106 | }, 107 | "ghost": { 108 | "type": "u64", 109 | "stored": false, 110 | "indexed": true, 111 | "fast": "single" 112 | }, 113 | "sticky": { 114 | "type": "u64", 115 | "stored": false, 116 | "indexed": true, 117 | "fast": "single" 118 | }, 119 | "spoiler": { 120 | "type": "u64", 121 | "stored": false, 122 | "indexed": true, 123 | "fast": "single" 124 | }, 125 | "op": { 126 | "type": "u64", 127 | "stored": false, 128 | "indexed": true, 129 | "fast": "single" 130 | }, 131 | "capcode": { 132 | "type": "u64", 133 | "stored": false, 134 | "indexed": true, 135 | "fast": "single" 136 | }, 137 | "tuid": { 138 | "type": "u64", 139 | "stored": false, 140 | "indexed": true, 141 | "fast": "single" 142 | }, 143 | "version": { 144 | "type": "u64", 145 | "stored": false, 146 | "indexed": true, 147 | "fast": "single" 148 | } 149 | }, 150 | "boost_fields": {} 151 | } -------------------------------------------------------------------------------- /src/storage/asagi/boards.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `%%BOARD%%` ( 2 | `doc_id` int unsigned NOT NULL auto_increment, 3 | `media_id` int unsigned NOT NULL DEFAULT '0', 4 | `poster_ip` decimal(39,0) unsigned NOT NULL DEFAULT '0', 5 | `num` int unsigned NOT NULL, 6 | `subnum` int unsigned NOT NULL, 7 | `thread_num` int unsigned NOT NULL DEFAULT '0', 8 | `op` bool NOT NULL DEFAULT '0', 9 | `timestamp` int unsigned NOT NULL, 10 | `timestamp_expired` int unsigned NOT NULL, 11 | `preview_orig` varchar(20), 12 | `preview_w` smallint unsigned NOT NULL DEFAULT '0', 13 | `preview_h` smallint unsigned NOT NULL DEFAULT '0', 14 | `media_filename` text, 15 | `media_w` smallint unsigned NOT NULL DEFAULT '0', 16 | `media_h` smallint unsigned NOT NULL DEFAULT '0', 17 | `media_size` int unsigned NOT NULL DEFAULT '0', 18 | `media_hash` varchar(25), 19 | `media_orig` varchar(191), 20 | `spoiler` 
bool NOT NULL DEFAULT '0', 21 | `deleted` bool NOT NULL DEFAULT '0', 22 | `capcode` varchar(1) NOT NULL DEFAULT 'N', 23 | `email` varchar(100), 24 | `name` varchar(100), 25 | `trip` varchar(25), 26 | `title` varchar(100), 27 | `comment` text, 28 | `delpass` tinytext, 29 | `sticky` bool NOT NULL DEFAULT '0', 30 | `locked` bool NOT NULL DEFAULT '0', 31 | `poster_hash` varchar(8), 32 | `poster_country` varchar(2), 33 | `exif` text, 34 | `unix_timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, 35 | 36 | PRIMARY KEY (`doc_id`), 37 | UNIQUE num_subnum_index (`num`, `subnum`), 38 | INDEX thread_num_subnum_index (`thread_num`, `num`, `subnum`), 39 | INDEX subnum_index (`subnum`), 40 | INDEX op_index (`op`), 41 | INDEX media_id_index (`media_id`), 42 | INDEX media_hash_index (`media_hash`), 43 | INDEX media_orig_index (`media_orig`), 44 | INDEX name_trip_index (`name`, `trip`), 45 | INDEX trip_index (`trip`), 46 | INDEX email_index (`email`), 47 | INDEX poster_ip_index (`poster_ip`), 48 | INDEX timestamp_index (`timestamp`) 49 | ) ENGINE=%%ENGINE%% CHARSET=%%CHARSET%%; 50 | 51 | CREATE TABLE IF NOT EXISTS `%%BOARD%%_deleted` LIKE `%%BOARD%%`; 52 | 53 | CREATE TABLE IF NOT EXISTS `%%BOARD%%_threads` ( 54 | `thread_num` int unsigned NOT NULL, 55 | `time_op` int unsigned NOT NULL, 56 | `time_last` int unsigned NOT NULL, 57 | `time_bump` int unsigned NOT NULL, 58 | `time_ghost` int unsigned DEFAULT NULL, 59 | `time_ghost_bump` int unsigned DEFAULT NULL, 60 | `time_last_modified` int unsigned NOT NULL, 61 | `nreplies` int unsigned NOT NULL DEFAULT '0', 62 | `nimages` int unsigned NOT NULL DEFAULT '0', 63 | `sticky` bool NOT NULL DEFAULT '0', 64 | `locked` bool NOT NULL DEFAULT '0', 65 | 66 | PRIMARY KEY (`thread_num`), 67 | INDEX time_op_index (`time_op`), 68 | INDEX time_bump_index (`time_bump`), 69 | INDEX time_ghost_bump_index (`time_ghost_bump`), 70 | INDEX time_last_modified_index (`time_last_modified`), 71 | INDEX sticky_index (`sticky`), 72 | INDEX locked_index (`locked`) 73 | ) ENGINE=%%ENGINE%% CHARSET=%%CHARSET%%; 74 | 75 | CREATE TABLE IF NOT EXISTS `%%BOARD%%_users` ( 76 | `user_id` int unsigned NOT NULL auto_increment, 77 | `name` varchar(100) NOT NULL DEFAULT '', 78 | `trip` varchar(25) NOT NULL DEFAULT '', 79 | `firstseen` int(11) NOT NULL, 80 | `postcount` int(11) NOT NULL, 81 | 82 | PRIMARY KEY (`user_id`), 83 | UNIQUE name_trip_index (`name`, `trip`), 84 | INDEX firstseen_index (`firstseen`), 85 | INDEX postcount_index (`postcount`) 86 | ) ENGINE=%%ENGINE%% DEFAULT CHARSET=%%CHARSET%%; 87 | 88 | CREATE TABLE IF NOT EXISTS `%%BOARD%%_images` ( 89 | `media_id` int unsigned NOT NULL auto_increment, 90 | `media_hash` varchar(25) NOT NULL, 91 | `media` varchar(191), 92 | `preview_op` varchar(20), 93 | `preview_reply` varchar(20), 94 | `total` int(10) unsigned NOT NULL DEFAULT '0', 95 | `banned` smallint unsigned NOT NULL DEFAULT '0', 96 | `media_sha256` binary(32), 97 | `preview_op_sha256` binary(32), 98 | `preview_reply_sha256` binary(32), 99 | 100 | PRIMARY KEY (`media_id`), 101 | UNIQUE media_hash_index (`media_hash`), 102 | INDEX total_index (`total`), 103 | INDEX banned_index (`banned`) 104 | ) ENGINE=%%ENGINE%% DEFAULT CHARSET=%%CHARSET%%; 105 | 106 | CREATE TABLE IF NOT EXISTS `%%BOARD%%_daily` ( 107 | `day` int(10) unsigned NOT NULL, 108 | `posts` int(10) unsigned NOT NULL, 109 | `images` int(10) unsigned NOT NULL, 110 | `sage` int(10) unsigned NOT NULL, 111 | `anons` int(10) unsigned NOT NULL, 112 | `trips` int(10) unsigned NOT NULL, 113 | `names` int(10) unsigned NOT NULL, 
114 | 115 | PRIMARY KEY (`day`) 116 | ) ENGINE=%%ENGINE%% DEFAULT CHARSET=%%CHARSET%%; -------------------------------------------------------------------------------- /src/storage/search_pg/builder.rs: -------------------------------------------------------------------------------- 1 | use std::str::FromStr; 2 | use std::sync::{ 3 | atomic::{AtomicBool, AtomicUsize}, 4 | Arc, 5 | }; 6 | 7 | use futures::prelude::*; 8 | use futures::task::AtomicWaker; 9 | use log::info; 10 | 11 | use super::{Error, Search, SearchInner}; 12 | 13 | pub struct SearchBuilder { 14 | postgres_url: Option<url::Url>, 15 | inflight_posts: usize, 16 | fail_on_save_error: bool, 17 | retries_on_save_error: usize, 18 | } 19 | 20 | impl Default for SearchBuilder { 21 | fn default() -> Self { 22 | SearchBuilder { 23 | postgres_url: None, 24 | inflight_posts: usize::MAX, 25 | fail_on_save_error: true, 26 | retries_on_save_error: 0, 27 | } 28 | } 29 | } 30 | 31 | impl SearchBuilder { 32 | pub fn with_database(mut self, database_url: url::Url) -> Self { 33 | self.postgres_url = Some(database_url); 34 | self 35 | } 36 | 37 | pub fn max_inflight_posts(mut self, posts: usize) -> Self { 38 | self.inflight_posts = posts; 39 | self 40 | } 41 | 42 | pub fn fail_on_save_error(mut self, yes: bool) -> Self { 43 | self.fail_on_save_error = yes; 44 | self 45 | } 46 | 47 | pub fn retries_on_save_error(mut self, retries: usize) -> Self { 48 | self.retries_on_save_error = retries; 49 | self 50 | } 51 | 52 | pub async fn build(self) -> Result<Search, Error> { 53 | info!("Initializing Asagi Postgres Search Backend..."); 54 | info!("Connecting to Postgres..."); 55 | let mut pg_url = self.postgres_url.clone().unwrap(); 56 | 57 | let pool_size = pg_url 58 | .query_pairs() 59 | .find(|x| x.0 == "pool_size") 60 | .map(|x| x.1.parse::<usize>()); 61 | let pool_size = match pool_size { 62 | Some(p) => match p { 63 | Ok(s) => s, 64 | Err(_) => return Err(Error::InvalidPoolSize), 65 | }, 66 | None => 16, 67 | }; 68 | 69 | pg_url.query_pairs_mut().clear().extend_pairs( 70 | self.postgres_url 71 | .as_ref() 72 | .unwrap() 73 | .query_pairs() 74 | .filter(|x| x.0 != "pool_size"), 75 | ); 76 | 77 | let config = match tokio_postgres::Config::from_str(&pg_url.to_string()) { 78 | Ok(c) => c, 79 | Err(err) => return Err(Error::InvalidDatabase(err)), 80 | }; 81 | let manager = deadpool_postgres::Manager::new(config, tokio_postgres::NoTls); 82 | let pool = deadpool_postgres::Pool::new(manager, pool_size); 83 | 84 | let client = pool.get().await?; 85 | 86 | info!("Creating tables (if needed)..."); 87 | client.batch_execute(include_str!("posts.sql")).await?; 88 | 89 | drop(client); 90 | 91 | let (process_tx, process_rx) = tokio::sync::mpsc::unbounded_channel(); 92 | 93 | let search = SearchInner { 94 | db_pool: pool, 95 | max_inflight_posts: self.inflight_posts, 96 | fail_on_save_error: self.fail_on_save_error, 97 | retries_on_save_error: self.retries_on_save_error, 98 | 99 | failed: AtomicBool::new(false), 100 | inflight_posts: AtomicUsize::new(0), 101 | waker: Arc::new(AtomicWaker::new()), 102 | flush_waker: Arc::new(AtomicWaker::new()), 103 | close_waker: Arc::new(AtomicWaker::new()), 104 | metrics: Arc::new(super::SearchMetrics::default()), 105 | process_tx, 106 | }; 107 | 108 | let search = Arc::new(search); 109 | let search2 = search.clone(); 110 | 111 | tokio::spawn( 112 | process_rx 113 | .take_while(|x| future::ready(x.is_some())) 114 | .zip(stream::repeat(search2)) 115 | .map(|(x, search2)| match x { 116 | Some(p) => search2.send_posts(p), 117 | None => unreachable!(), 118 | }) 119 | .buffer_unordered(usize::MAX) 120 | .for_each(|_| future::ready(())), 121 | ); 122 | 123 | Ok(Search { inner: search }) 124 | } 125 | } 126 | 127 | impl From<&crate::config::AsagiSearch> for SearchBuilder { 128 | fn from(config: &crate::config::AsagiSearch) -> Self { 129 | let mut builder = SearchBuilder::default(); 130 | builder = builder.with_database(config.database_url.clone()); 131 | if let Some(inflight_posts) = config.inflight_posts { 132 | builder = builder.max_inflight_posts(inflight_posts.into()); 133 | } 134 | if let Some(fail_on_save_error) = config.fail_on_save_error { 135 | builder = builder.fail_on_save_error(fail_on_save_error); 136 | } 137 | if let Some(retries_on_save_error) = config.retries_on_save_error { 138 | builder = builder.retries_on_save_error(retries_on_save_error); 139 | } 140 | 141 | builder 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/storage/search_lnx/post.rs: -------------------------------------------------------------------------------- 1 | use std::time::{SystemTime, UNIX_EPOCH}; 2 | 3 | use serde::{Serialize, Serializer}; 4 | 5 | fn as_u32_be(array: &[u8; 4]) -> u32 { 6 | ((array[0] as u32) << 24) + 7 | ((array[1] as u32) << 16) + 8 | ((array[2] as u32) << 8) + 9 | ((array[3] as u32) << 0) 10 | } 11 | 12 | fn field_ser<T: Serialize, S>(x: T, s: S) -> Result<S::Ok, S::Error> 13 | where 14 | S: Serializer, 15 | { 16 | [&x].serialize(s) 17 | } 18 | 19 | #[derive(Serialize, Clone, Debug)] 20 | pub struct Post<'a> { 21 | #[serde(serialize_with = "field_ser")] 22 | board: &'static str, 23 | #[serde(serialize_with = "field_ser")] 24 | thread_no: u64, 25 | #[serde(serialize_with = "field_ser")] 26 | post_no: u64, 27 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 28 | subject: Option<&'a str>, 29 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 30 | username: Option<&'a str>, 31 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 32 | tripcode: Option<&'a str>, 33 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 34 | email: Option<&'a str>, 35 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 36 | unique_id: Option<&'a str>, 37 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 38 | since4_pass: Option<u64>, 39 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 40 | country: Option<String>, 41 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 42 | filename: Option<String>, 43 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 44 | image_hash: Option<&'a str>, 45 | #[serde(serialize_with = "field_ser")] 46 | image_width: u64, 47 | #[serde(serialize_with = "field_ser")] 48 | image_height: u64, 49 | #[serde(serialize_with = "field_ser")] 50 | ts: u64, 51 | #[serde(serialize_with = "field_ser")] 52 | tsr: u64, 53 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 54 | comment: Option<String>, 55 | #[serde(serialize_with = "field_ser")] 56 | deleted: u64, 57 | #[serde(serialize_with = "field_ser")] 58 | ghost: u64, 59 | #[serde(serialize_with = "field_ser")] 60 | sticky: u64, 61 | #[serde(serialize_with = "field_ser")] 62 | spoiler: u64, 63 | #[serde(serialize_with = "field_ser")] 64 | op: u64, 65 | #[serde(skip_serializing_if = "Option::is_none", serialize_with = "field_ser")] 66 | capcode: Option<u64>, 67 | #[serde(serialize_with = "field_ser")] 68 |
version: u64, 69 | #[serde(serialize_with = "field_ser")] 70 | pub tuid: u64, 71 | } 72 | 73 | impl<'a> From<&'a crate::imageboard::Post> for Post<'a> { 74 | fn from(post: &'a crate::imageboard::Post) -> Self { 75 | let upper = { 76 | let bytes = post.board.as_bytes(); 77 | let bytes = [bytes.get(0).copied().unwrap_or(0), bytes.get(1).copied().unwrap_or(0), bytes.get(2).copied().unwrap_or(0), bytes.get(3).copied().unwrap_or(0)]; 78 | as_u32_be(&bytes) 79 | }; 80 | let lower = post.no as u32; 81 | let tuid = (upper as u64) << 32 | (lower as u64); 82 | let version = match SystemTime::now().duration_since(UNIX_EPOCH) { 83 | Ok(n) => n.as_millis() as u64, 84 | Err(_) => panic!("SystemTime before UNIX EPOCH!"), 85 | }; 86 | Post { 87 | board: post.board, 88 | thread_no: post.thread_no(), 89 | post_no: post.no, 90 | subject: post.sub.as_ref().map(|x| &**x), 91 | username: post.name.as_ref().map(|x| &**x), 92 | tripcode: post.trip.as_ref().map(|x| &**x), 93 | email: post.email.as_ref().map(|x| &**x), 94 | unique_id: post.id.as_ref().map(|x| &**x), 95 | since4_pass: post.since4pass.map(|s| s as u64), 96 | country: post.poster_country(), 97 | filename: post.media_filename(), 98 | image_hash: post.md5.as_ref().map(|x| &**x), 99 | image_width: post.w as _, 100 | image_height: post.h as _, 101 | ts: post.time as _, 102 | tsr: u64::MAX - (post.time as u64), 103 | comment: post.comment(), 104 | deleted: if post.deleted { 1 } else { 0 }, 105 | ghost: 0, 106 | sticky: if post.sticky { 1 } else { 0 }, 107 | spoiler: if post.spoiler { 1 } else { 0 }, 108 | op: if post.is_op() { 1 } else { 0 }, 109 | capcode: post.short_capcode().chars().next().map(|c| c as u64), 110 | tuid, 111 | version, 112 | } 113 | } 114 | } 115 | 116 | #[derive(Debug, Serialize)] 117 | pub struct DeletePostDoc { 118 | tuid: u64 119 | } 120 | 121 | #[derive(Debug, Serialize)] 122 | pub struct DeletePost(Vec<DeletePostDoc>); 123 | 124 | impl DeletePost { 125 | pub fn new(ids: Vec<u64>) -> Self { 126 | Self(ids.into_iter().map(|id| DeletePostDoc{ tuid: id }).collect()) 127 | } 128 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Torako 2 | 3 | > Torako: A 4chan imageboard scraper 4 | 5 | Torako is an imageboard scraper designed to support multiple backends, including an [Asagi](https://github.com/eksopl/asagi/) style backend that can act as a drop-in replacement for it. 6 | 7 | Torako is written in Rust with async features using the Tokio reactor. The async design allows scraping multiple boards with minimal overhead while remaining performant. 8 | 9 | Report any issues or feature requests using GitHub Issues. 10 | 11 | ## Getting Started 12 | 13 | Builds are generated for Windows, Linux and macOS and are available on the [GitHub releases page](https://github.com/miyachan/torako/releases). Once downloaded, grab the sample configuration, modify it, and place it next to the binary as `Torako.toml` (you can also use the `-c` flag to specify a custom configuration path). 14 | 15 | ```sh 16 | $ ./torako -c ./Torako.toml 17 | ``` 18 | 19 | > *Note*: Torako may create a large number of connections during its warmup phase and this may cause your system to hit open file descriptor limits. Make sure to increase them. 20 | 21 | ## Configuration 22 | 23 | The `Torako.sample.toml` sample file can be used as a base to start running Torako.
The sample configuration file also documents all the various options that can be used to configure and customize Torako. In order to start Torako you must provide at least one board to scrape and configure a storage backend. 24 | 25 | ## Building 26 | 27 | Building Torako only requires Rust v1.46.0+. You can install Rust with [rustup](https://rustup.rs/). Simply clone the repo and run: 28 | 29 | ```sh 30 | $ cargo build --release 31 | ``` 32 | > *Note*: On certain platforms you may need to ensure you have a C/C++ compiler, `pkg-config` and libssl-dev installed to build the openssl bindings. 33 | 34 | ## Asagi Compatibility 35 | 36 | Torako is designed to be a drop-in replacement for Asagi, so any deviation from Asagi's outputs should be considered a bug. However, Torako does not communicate with MySQL the same way Asagi does, and Torako tries to be more performant. Torako's default configuration should be compatible with an existing deployment with no changes. 37 | 38 | ### MySQL Triggers 39 | 40 | Traditionally, Asagi uses MySQL triggers to keep tables like `board_images` and `board_daily` in sync. Triggers can work well, but when mass-inserting data they may not be the most optimal solution. Instead, Torako has an option to compute the required database changes internally for a set of posts and make one batch update to every table. 41 | 42 | Enabling this mode requires setting `use_triggers = false` in the configuration. 43 | 44 | > **WARNING**: Enabling this option will drop all triggers on your database (except the delete triggers) when Torako starts. You can recreate them by restarting Torako with `use_triggers = true`. 45 | 46 | ### Stats 47 | 48 | By default, the stats tables are not computed, unless you already have the proper triggers in place. If Torako is running with `use_triggers = false`, then `compute_stats = true` can also be set to generate the stats for the `board_daily` and `board_users` tables. 49 | 50 | ## Architecture 51 | 52 | At a high level, Torako organizes data flow from `BoardStream`s, which are implemented as a `Stream<Item = Vec<Post>>`, to storage sinks, which are `Sink`s of the same type. Using these base types, it's natural to introduce backpressure in the system that can temporarily pause archiving if the backlog is too great or if memory usage is too high. 53 | 54 | ### BoardStream 55 | 56 | `BoardStream` is a `futures::stream::Stream` that will output all new posts. A post may get retransmitted if the content of the post is edited or if the post is deleted (resent posts will be sent with the `is_retransmission` flag). 57 | 58 | ### Storage Backend 59 | 60 | #### Asagi 61 | 62 | The Asagi backend is designed to deliver the same outputs as the [Asagi](https://github.com/eksopl/asagi/) imageboard scraper. Using the configuration it is possible to tune the concurrency parameters to optimize for throughput and memory usage. In general, allowing more requests in flight will increase memory usage, as Torako will keep those posts buffered in memory before they are finally flushed to the database. The storage backend is also designed to keep MySQL reads at a minimum and optimizations are in place to reduce superfluous reads. 63 | 64 | ## API 65 | 66 | Torako includes an API endpoint (`http://127.0.0.1:2377` by default) for interacting 67 | with and inspecting Torako. 68 | 69 | ### API Documentation 70 | 71 | #### `GET /` 72 | > Content-Type: application/json 73 | 74 | Returns a JSON object with metrics information for every board and all storage backends.
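A quick way to confirm Torako is up and serving metrics is to query this endpoint with any HTTP client. A minimal sketch, assuming the default listen address above and that `curl` (and, optionally, `jq` for pretty-printing) are installed; the exact fields in the response depend on which boards and storage backends are enabled:

```sh
# Query Torako's metrics endpoint (default listen address; adjust if overridden in Torako.toml)
$ curl -s http://127.0.0.1:2377/ | jq .
```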
75 | 76 | ## Performance 77 | 78 | Setup: 79 | 80 | * torako: GCP e2-standard-4 (4 vCPUs, 16 GB memory) 81 | * MySQL 5.7 (empty): db-n1-standard-2 (2 vCPUs, 7.5 GB memory) 82 | * `use_triggers = false` 83 | * `compute_stats = true` 84 | * `concurrent_downloads = 1024` 85 | * `inflight_posts = unlimited` 86 | * `media_backpressure = false` 87 | * Boards: All except /f/, media downloads disabled for /wsg/ and /gif/ 88 | 89 | When Torako first starts, it will attempt to download all posts and any missing images. Downloading all posts is relatively fast, and most of the CPU usage then goes to downloading media until it is fully caught up. It took approximately 25 minutes to download all the content from every board on a gigabit connection. 90 | 91 | Resource information: 92 | * **CPU**: Torako will use nearly all cores if there is a large backlog of media to download 93 | * **Memory**: Memory usage is tunable and depends on your settings for `concurrent_downloads` and `inflight_posts`, and `media_backpressure`. In this test Torako spikes up to 3 GiB of memory, slowly settled to 170MiB of usage as the backlog cleared 94 | * **Network**: The instance's bandwidth was completely maxed out at an average of 128MiB/s 95 | -------------------------------------------------------------------------------- /src/storage/asagi/triggers.sql: -------------------------------------------------------------------------------- 1 | DROP PROCEDURE IF EXISTS `update_thread_%%BOARD%%`; 2 | 3 | CREATE PROCEDURE `update_thread_%%BOARD%%` (tnum INT, ghost_num INT, p_timestamp INT, 4 | p_media_hash VARCHAR(25), p_email VARCHAR(100)) 5 | BEGIN 6 | DECLARE d_time_last INT; 7 | DECLARE d_time_bump INT; 8 | DECLARE d_time_ghost INT; 9 | DECLARE d_time_ghost_bump INT; 10 | DECLARE d_time_last_modified INT; 11 | DECLARE d_image INT; 12 | 13 | SET d_time_last = 0; 14 | SET d_time_bump = 0; 15 | SET d_time_ghost = 0; 16 | SET d_time_ghost_bump = 0; 17 | SET d_image = p_media_hash IS NOT NULL; 18 | 19 | IF (ghost_num = 0) THEN 20 | SET d_time_last_modified = p_timestamp; 21 | SET d_time_last = p_timestamp; 22 | IF (p_email <> 'sage' OR p_email IS NULL) THEN 23 | SET d_time_bump = p_timestamp; 24 | END IF; 25 | ELSE 26 | SET d_time_last_modified = p_timestamp; 27 | SET d_time_ghost = p_timestamp; 28 | IF (p_email <> 'sage' OR p_email IS NULL) THEN 29 | SET d_time_ghost_bump = p_timestamp; 30 | END IF; 31 | END IF; 32 | 33 | UPDATE 34 | `%%BOARD%%_threads` op 35 | SET 36 | op.time_last = ( 37 | COALESCE( 38 | GREATEST(op.time_op, d_time_last), 39 | op.time_op 40 | ) 41 | ), 42 | op.time_bump = ( 43 | COALESCE( 44 | GREATEST(op.time_bump, d_time_bump), 45 | op.time_op 46 | ) 47 | ), 48 | op.time_ghost = ( 49 | IF ( 50 | GREATEST( 51 | IFNULL(op.time_ghost, 0), 52 | d_time_ghost 53 | ) <> 0, 54 | GREATEST( 55 | IFNULL(op.time_ghost, 0), 56 | d_time_ghost 57 | ), 58 | NULL 59 | ) 60 | ), 61 | op.time_ghost_bump = ( 62 | IF( 63 | GREATEST( 64 | IFNULL(op.time_ghost_bump, 0), 65 | d_time_ghost_bump 66 | ) <> 0, 67 | GREATEST( 68 | IFNULL(op.time_ghost_bump, 0), 69 | d_time_ghost_bump 70 | ), 71 | NULL 72 | ) 73 | ), 74 | op.time_last_modified = ( 75 | COALESCE( 76 | GREATEST(op.time_last_modified, d_time_last_modified), 77 | op.time_op 78 | ) 79 | ), 80 | op.nreplies = ( 81 | op.nreplies + 1 82 | ), 83 | op.nimages = ( 84 | op.nimages + d_image 85 | ) 86 | WHERE op.thread_num = tnum; 87 | END; 88 | 89 | DROP PROCEDURE IF EXISTS `update_thread_timestamp_%%BOARD%%`; 90 | 91 | CREATE PROCEDURE `update_thread_timestamp_%%BOARD%%` (tnum INT, 
timestamp INT) 92 | BEGIN 93 | UPDATE 94 | `%%BOARD%%_threads` op 95 | SET 96 | op.time_last_modified = ( 97 | GREATEST(op.time_last_modified, timestamp) 98 | ) 99 | WHERE op.thread_num = tnum; 100 | END; 101 | 102 | DROP PROCEDURE IF EXISTS `create_thread_%%BOARD%%`; 103 | 104 | CREATE PROCEDURE `create_thread_%%BOARD%%` (num INT, timestamp INT) 105 | BEGIN 106 | INSERT IGNORE INTO `%%BOARD%%_threads` VALUES (num, timestamp, timestamp, 107 | timestamp, NULL, NULL, timestamp, 0, 0, 0, 0); 108 | END; 109 | 110 | DROP PROCEDURE IF EXISTS `delete_thread_%%BOARD%%`; 111 | 112 | CREATE PROCEDURE `delete_thread_%%BOARD%%` (tnum INT) 113 | BEGIN 114 | DELETE FROM `%%BOARD%%_threads` WHERE thread_num = tnum; 115 | END; 116 | 117 | DROP PROCEDURE IF EXISTS `insert_image_%%BOARD%%`; 118 | 119 | CREATE PROCEDURE `insert_image_%%BOARD%%` (n_media_hash VARCHAR(25), 120 | n_media VARCHAR(20), n_preview VARCHAR(20), n_op INT) 121 | BEGIN 122 | IF n_op = 1 THEN 123 | INSERT INTO `%%BOARD%%_images` (media_hash, media, preview_op, total) 124 | VALUES (n_media_hash, n_media, n_preview, 1) 125 | ON DUPLICATE KEY UPDATE 126 | media_id = LAST_INSERT_ID(media_id), 127 | total = (total + 1), 128 | preview_op = COALESCE(preview_op, VALUES(preview_op)), 129 | media = COALESCE(media, VALUES(media)); 130 | ELSE 131 | INSERT INTO `%%BOARD%%_images` (media_hash, media, preview_reply, total) 132 | VALUES (n_media_hash, n_media, n_preview, 1) 133 | ON DUPLICATE KEY UPDATE 134 | media_id = LAST_INSERT_ID(media_id), 135 | total = (total + 1), 136 | preview_reply = COALESCE(preview_reply, VALUES(preview_reply)), 137 | media = COALESCE(media, VALUES(media)); 138 | END IF; 139 | END; 140 | 141 | DROP PROCEDURE IF EXISTS `delete_image_%%BOARD%%`; 142 | 143 | CREATE PROCEDURE `delete_image_%%BOARD%%` (n_media_id INT) 144 | BEGIN 145 | UPDATE `%%BOARD%%_images` SET total = (total - 1) WHERE media_id = n_media_id; 146 | END; 147 | 148 | DROP TRIGGER IF EXISTS `before_ins_%%BOARD%%`; 149 | 150 | CREATE TRIGGER `before_ins_%%BOARD%%` BEFORE INSERT ON `%%BOARD%%` 151 | FOR EACH ROW 152 | BEGIN 153 | IF NEW.media_hash IS NOT NULL THEN 154 | CALL insert_image_%%BOARD%%(NEW.media_hash, NEW.media_orig, NEW.preview_orig, NEW.op); 155 | SET NEW.media_id = LAST_INSERT_ID(); 156 | END IF; 157 | END; 158 | 159 | DROP TRIGGER IF EXISTS `after_ins_%%BOARD%%`; 160 | 161 | CREATE TRIGGER `after_ins_%%BOARD%%` AFTER INSERT ON `%%BOARD%%` 162 | FOR EACH ROW 163 | BEGIN 164 | IF NEW.op = 1 THEN 165 | CALL create_thread_%%BOARD%%(NEW.num, NEW.timestamp); 166 | END IF; 167 | CALL update_thread_%%BOARD%%(NEW.thread_num, NEW.subnum, NEW.timestamp, NEW.media_hash, NEW.email); 168 | END; 169 | 170 | DROP TRIGGER IF EXISTS `after_del_%%BOARD%%`; 171 | 172 | CREATE TRIGGER `after_del_%%BOARD%%` AFTER DELETE ON `%%BOARD%%` 173 | FOR EACH ROW 174 | BEGIN 175 | CALL update_thread_%%BOARD%%(OLD.thread_num, OLD.subnum, OLD.timestamp, OLD.media_hash, OLD.email); 176 | IF OLD.op = 1 THEN 177 | CALL delete_thread_%%BOARD%%(OLD.num); 178 | END IF; 179 | IF OLD.media_hash IS NOT NULL THEN 180 | CALL delete_image_%%BOARD%%(OLD.media_id); 181 | END IF; 182 | END; 183 | 184 | DROP TRIGGER IF EXISTS `after_upd_%%BOARD%%`; 185 | 186 | CREATE TRIGGER `after_upd_%%BOARD%%` AFTER UPDATE ON `%%BOARD%%` 187 | FOR EACH ROW 188 | BEGIN 189 | IF NEW.timestamp_expired <> 0 THEN 190 | CALL update_thread_timestamp_%%BOARD%%(NEW.thread_num, NEW.timestamp_expired); 191 | END IF; 192 | END; -------------------------------------------------------------------------------- 
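Both `triggers.sql` and `triggers_v2.sql` are templates rather than ready-to-run SQL: the literal token `%%BOARD%%` stands in for the board name, so the same procedures and triggers can be created once per archived board (e.g. `po`, `po_threads`, `po_images`). A minimal, self-contained sketch of that substitution step is shown below; the function name `render_board_sql` and the inlined one-statement template are illustrative assumptions and are not taken from Torako's source.

```rust
/// Render a board-specific copy of an Asagi SQL template by replacing
/// every `%%BOARD%%` placeholder with the concrete board name.
/// (Hypothetical helper, shown only to illustrate the templating convention.)
fn render_board_sql(template: &str, board: &str) -> String {
    template.replace("%%BOARD%%", board)
}

fn main() {
    // The real templates are much longer; a single statement is inlined
    // here purely for illustration.
    let template = "DROP PROCEDURE IF EXISTS `delete_thread_%%BOARD%%`;";
    let rendered = render_board_sql(template, "po");
    assert_eq!(rendered, "DROP PROCEDURE IF EXISTS `delete_thread_po`;");
    println!("{}", rendered);
}
```

Note that the rendered text still contains multi-statement `BEGIN ... END` procedure bodies, so it has to be sent to MySQL statement by statement (or with an appropriate client-side delimiter) rather than as one query; that execution step is intentionally left out of this sketch.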
/.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | tags: 4 | - "v*" 5 | 6 | name: Release 7 | 8 | env: 9 | RELEASE_DIR: artifacts 10 | GITHUB_REF: "${{ github.ref }}" 11 | WINDOWS_TARGET: x86_64-pc-windows-msvc 12 | WINDOWS64_NAME: windows-amd64 13 | MACOS_TARGET: x86_64-apple-darwin 14 | MACOS64_NAME: darwin-amd64 15 | LINUX_TARGET: x86_64-unknown-linux-gnu 16 | LINUX64_NAME: linux-amd64 17 | 18 | jobs: 19 | build: 20 | name: Build artifacts 21 | runs-on: ${{ matrix.os }} 22 | strategy: 23 | matrix: 24 | build: [linux, macos, windows] 25 | include: 26 | - build: linux 27 | os: ubuntu-latest 28 | rust: stable 29 | - build: macos 30 | os: macos-latest 31 | rust: stable 32 | - build: windows 33 | os: windows-latest 34 | rust: stable 35 | 36 | steps: 37 | - uses: actions/checkout@v2 38 | 39 | - name: Cache Cargo registry 40 | uses: actions/cache@v1 41 | if: matrix.rust 42 | with: 43 | path: ~/.cargo/registry 44 | key: ${{ matrix.build }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} 45 | restore-keys: | 46 | ${{ matrix.build }}-cargo-registry- 47 | 48 | - name: Cache Cargo index 49 | uses: actions/cache@v1 50 | if: matrix.rust 51 | with: 52 | path: ~/.cargo/git 53 | key: ${{ matrix.build }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} 54 | restore-keys: | 55 | ${{ matrix.build }}-cargo-index- 56 | 57 | - name: Cache Cargo build 58 | uses: actions/cache@v1 59 | if: matrix.rust 60 | with: 61 | path: target 62 | key: ${{ matrix.build }}-target-${{ hashFiles('**/Cargo.lock') }} 63 | restore-keys: | 64 | ${{ matrix.build }}-target- 65 | 66 | - name: Query version number 67 | id: get_version 68 | shell: bash 69 | run: | 70 | echo "using version tag ${GITHUB_REF:10}" 71 | echo ::set-output name=version::"${GITHUB_REF:10}" 72 | 73 | - name: Install Rust 74 | if: matrix.rust 75 | run: | 76 | rustup update ${{ matrix.rust }} --no-self-update 77 | rustup default ${{ matrix.rust }} 78 | 79 | - name: Build (Linux) 80 | if: matrix.build == 'linux' 81 | run: | 82 | rustup target add ${{ env.LINUX_TARGET }} 83 | cargo build --release --features pgs-reindex,lnx-reindex --target ${{ env.LINUX_TARGET }} 84 | 85 | - name: Build (MacOS) 86 | if: matrix.build == 'macos' 87 | run: cargo build --release --features pgs-reindex,lnx-reindex 88 | 89 | - name: Build (Windows) 90 | if: matrix.build == 'windows' 91 | run: cargo build --release --features pgs-reindex,lnx-reindex 92 | 93 | - name: Create artifact directory 94 | run: | 95 | mkdir ${{ env.RELEASE_DIR }} 96 | mkdir dist 97 | 98 | - name: Create Build (Linux) 99 | if: matrix.build == 'linux' 100 | run: | 101 | mv ./target/${{ env.LINUX_TARGET }}/release/torako ./${{ env.RELEASE_DIR }}/torako-${{ env.LINUX64_NAME }} 102 | 103 | - name: Create Build (Windows) 104 | if: matrix.build == 'windows' 105 | shell: bash 106 | run: | 107 | mv ./target/release/torako.exe ./${{ env.RELEASE_DIR }}/torako-${{ env.WINDOWS64_NAME }}.exe 108 | 109 | - name: Create Build (MacOS) 110 | if: matrix.build == 'macos' 111 | run: | 112 | mv ./target/release/torako ./${{ env.RELEASE_DIR }}/torako-${{ env.MACOS64_NAME }} 113 | 114 | - name: Upload binaries 115 | uses: actions/upload-artifact@v1 116 | with: 117 | name: ${{ matrix.build }} 118 | path: ./${{ env.RELEASE_DIR }} 119 | 120 | release: 121 | name: GitHub Release 122 | needs: build 123 | runs-on: ubuntu-latest 124 | steps: 125 | - name: Query version number 126 | id: get_version 127 | shell: bash 128 | run: | 129 | echo "using version tag 
${GITHUB_REF:10}" 130 | echo ::set-output name=version::"${GITHUB_REF:10}" 131 | 132 | - name: Create Release 133 | id: create_release 134 | uses: actions/create-release@v1 135 | env: 136 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 137 | with: 138 | tag_name: ${{ steps.get_version.outputs.VERSION }} 139 | release_name: ${{ steps.get_version.outputs.VERSION }} 140 | draft: true 141 | 142 | - name: Download Linux bundle 143 | uses: actions/download-artifact@v1 144 | with: 145 | name: linux 146 | 147 | - name: Download Windows bundle 148 | uses: actions/download-artifact@v1 149 | with: 150 | name: windows 151 | 152 | - name: Download MacOS bundle 153 | uses: actions/download-artifact@v1 154 | with: 155 | name: macos 156 | 157 | - name: Release Linux bundle 158 | uses: actions/upload-release-asset@v1 159 | env: 160 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 161 | with: 162 | upload_url: ${{ steps.create_release.outputs.upload_url }} 163 | asset_path: ./linux/torako-${{ env.LINUX64_NAME }} 164 | asset_content_type: application/octet-stream 165 | asset_name: torako-${{ env.LINUX64_NAME }} 166 | 167 | - name: Release Windows bundle 168 | uses: actions/upload-release-asset@v1 169 | env: 170 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 171 | with: 172 | upload_url: ${{ steps.create_release.outputs.upload_url }} 173 | asset_path: ./windows/torako-${{ env.WINDOWS64_NAME }}.exe 174 | asset_content_type: application/octet-stream 175 | asset_name: torako-${{ env.WINDOWS64_NAME }}.exe 176 | 177 | - name: Release MacOS bundle 178 | uses: actions/upload-release-asset@v1 179 | env: 180 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 181 | with: 182 | upload_url: ${{ steps.create_release.outputs.upload_url }} 183 | asset_path: ./macos/torako-${{ env.MACOS64_NAME }} 184 | asset_content_type: application/octet-stream 185 | asset_name: torako-${{ env.MACOS64_NAME }} -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | use std::num::{NonZeroU32, NonZeroUsize}; 3 | use std::path::PathBuf; 4 | use std::time::Duration; 5 | 6 | use serde::Deserialize; 7 | 8 | use crate::SeaHashMap; 9 | 10 | #[derive(Debug, Deserialize, Clone, Default)] 11 | #[serde(default)] 12 | pub struct Config { 13 | pub api_addr: Option, 14 | pub api_addr_interface: Option, 15 | pub rate_limit: Option, 16 | pub thread_concurrency: Option, 17 | #[serde(with = "humantime_serde")] 18 | pub request_timeout: Option, 19 | #[serde(default)] 20 | pub request_proxy: Vec, 21 | #[serde(default)] 22 | pub request_only_proxy: bool, 23 | pub boards: Board, 24 | pub backend: Backend, 25 | } 26 | 27 | #[derive(Debug, Deserialize, Clone, Default)] 28 | #[serde(default)] 29 | pub struct Board { 30 | pub tls: Option, 31 | pub host: Option, 32 | #[serde(with = "humantime_serde")] 33 | pub refresh_rate: Option, 34 | pub deleted_page_threshold: Option, 35 | pub download_thumbs: Option, 36 | pub download_media: Option, 37 | pub url_media_filename: Option, 38 | 39 | #[serde(flatten)] 40 | pub boards: SeaHashMap, 41 | } 42 | 43 | #[derive(Debug, Deserialize, Clone, Default)] 44 | #[serde(default)] 45 | pub struct Backend { 46 | pub asagi: Option, 47 | pub asagi_pg_search: Option, 48 | pub asagi_lnx_search: Option, 49 | pub null: Option, 50 | } 51 | 52 | #[derive(Debug, Deserialize, Clone)] 53 | pub struct Asagi { 54 | #[serde(default)] 55 | pub disabled: bool, 56 | #[serde(default)] 57 | pub thumbs: Option, 58 | 
#[serde(default)] 59 | pub media: Option, 60 | #[serde(default)] 61 | pub media_url: Option, 62 | #[serde(default)] 63 | pub thumb_url: Option, 64 | #[serde(default, alias = "persist_error_is_fatal")] 65 | pub fail_on_save_error: Option, 66 | #[serde(default)] 67 | pub retries_on_save_error: Option, 68 | #[serde(default)] 69 | pub inflight_posts: Option, 70 | #[serde(default)] 71 | pub concurrent_downloads: Option, 72 | #[serde(default)] 73 | pub media_backpressure: Option, 74 | #[serde(default)] 75 | pub media_storage: Option, 76 | #[serde(default)] 77 | pub thumb_storage: Option, 78 | pub database: AsagiDatabase, 79 | #[serde(default)] 80 | pub old_dir_structure: Option, 81 | #[serde(default)] 82 | pub sha_dir_structure: Option, 83 | #[serde(default)] 84 | pub boards: SeaHashMap, 85 | #[serde(default)] 86 | pub tmp_dir: Option, 87 | 88 | // Options kept for backwards compatibility 89 | #[serde(default)] 90 | pub media_path: Option, 91 | #[serde(default)] 92 | pub web_unix_group: Option, 93 | } 94 | 95 | #[derive(Debug, Deserialize, Clone)] 96 | pub struct AsagiBoard { 97 | pub media_storage: Option, 98 | pub thumb_storage: Option, 99 | } 100 | 101 | #[derive(Debug, Deserialize, Clone)] 102 | pub struct AsagiDatabase { 103 | pub url: url::Url, 104 | #[serde(default)] 105 | pub charset: Option, 106 | #[serde(default)] 107 | pub use_triggers: Option, 108 | #[serde(default)] 109 | pub compute_stats: Option, 110 | #[serde(default)] 111 | pub truncate_fields: Option, 112 | #[serde(default)] 113 | pub sql_set_utc: Option, 114 | #[serde(default)] 115 | pub mysql_engine: Option, 116 | } 117 | 118 | #[derive(Debug, Deserialize, Clone)] 119 | pub struct AsagiStorage { 120 | #[serde(default)] 121 | pub filesystem: Option, 122 | #[serde(default)] 123 | pub s3: Option, 124 | #[serde(default)] 125 | pub b2: Option, 126 | } 127 | 128 | #[derive(Debug, Deserialize, Clone)] 129 | pub struct AsagiFilesystemStorage { 130 | #[serde(default)] 131 | pub disabled: bool, 132 | pub media_path: PathBuf, 133 | #[serde(default)] 134 | pub tmp_dir: Option, 135 | #[serde(default)] 136 | pub web_unix_group: Option, 137 | } 138 | 139 | #[derive(Debug, Deserialize, Clone)] 140 | pub struct AsagiS3Storage { 141 | #[serde(default)] 142 | pub disabled: bool, 143 | pub access_key_id: String, 144 | pub secret_access_key: String, 145 | pub region: String, 146 | pub endpoint: Option, 147 | pub bucket: String, 148 | pub acl: Option, 149 | #[serde(default)] 150 | pub check_exists: Option, 151 | } 152 | 153 | #[derive(Debug, Deserialize, Clone)] 154 | pub struct AsagiB2Storage { 155 | #[serde(default)] 156 | pub disabled: bool, 157 | pub application_key_id: String, 158 | pub application_key: String, 159 | pub bucket_id: String, 160 | #[serde(default)] 161 | pub check_exists: Option, 162 | #[serde(default)] 163 | pub bloom: Option, 164 | } 165 | 166 | #[derive(Debug, Deserialize, Clone, Default)] 167 | #[serde(default)] 168 | pub struct AsagiB2StorageBloom { 169 | pub disabled: Option, 170 | pub file_key: Option, 171 | pub initial_bit_count: Option, 172 | pub false_positive_rate: Option, 173 | #[serde(with = "humantime_serde")] 174 | pub upload_frequency: Option, 175 | } 176 | 177 | #[derive(Debug, Deserialize, Clone)] 178 | pub struct AsagiSearch { 179 | #[serde(default)] 180 | pub disabled: bool, 181 | pub database_url: url::Url, 182 | #[serde(default)] 183 | pub inflight_posts: Option, 184 | #[serde(default)] 185 | pub fail_on_save_error: Option, 186 | #[serde(default)] 187 | pub retries_on_save_error: Option, 188 | } 189 | 190 
| #[derive(Debug, Deserialize, Clone)] 191 | pub struct AsagiLnxSearch { 192 | #[serde(default)] 193 | pub disabled: bool, 194 | pub database_url: url::Url, 195 | pub index: String, 196 | #[serde(default)] 197 | pub inflight_posts: Option, 198 | #[serde(default)] 199 | pub concurrent_requests: Option, 200 | #[serde(default)] 201 | pub fail_on_save_error: Option, 202 | #[serde(default)] 203 | pub retries_on_save_error: Option, 204 | #[serde(default)] 205 | pub authentication_key: Option, 206 | #[serde(with = "humantime_serde", default)] 207 | pub commit_sync_interval: Option, 208 | #[serde(with = "humantime_serde", default)] 209 | pub request_timeout: Option, 210 | } 211 | 212 | #[derive(Debug, Deserialize, Clone)] 213 | pub struct NullBackend { 214 | #[serde(default)] 215 | pub disabled: bool, 216 | } 217 | -------------------------------------------------------------------------------- /src/api.rs: -------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | use std::sync::{atomic::Ordering, Arc}; 3 | use std::time::{Duration, Instant, SystemTime}; 4 | 5 | use clap::crate_version; 6 | use futures::prelude::*; 7 | use log::{error, info, warn}; 8 | use serde::Serialize; 9 | use warp::{http::StatusCode, Filter}; 10 | 11 | use crate::SeaHashMap; 12 | 13 | #[derive(Serialize)] 14 | struct BoardMetrics { 15 | posts: u64, 16 | deleted: u64, 17 | warmed_up: bool, 18 | last_modified: i64, 19 | cloudflare_blocked: u64, 20 | } 21 | 22 | impl Default for BoardMetrics { 23 | fn default() -> Self { 24 | Self { 25 | posts: 0, 26 | deleted: 0, 27 | warmed_up: true, 28 | last_modified: 0, 29 | cloudflare_blocked: 0, 30 | } 31 | } 32 | } 33 | 34 | impl std::ops::Add for BoardMetrics { 35 | type Output = Self; 36 | 37 | fn add(self, other: Self) -> Self { 38 | Self { 39 | posts: self.posts + other.posts, 40 | deleted: self.deleted + other.deleted, 41 | last_modified: self.last_modified.max(other.last_modified), 42 | warmed_up: self.warmed_up && other.warmed_up, 43 | cloudflare_blocked: self.cloudflare_blocked + other.cloudflare_blocked, 44 | } 45 | } 46 | } 47 | 48 | impl From<&crate::imageboard::Metrics> for BoardMetrics { 49 | fn from(metrics: &crate::imageboard::Metrics) -> Self { 50 | Self { 51 | posts: metrics.posts.load(Ordering::Relaxed), 52 | deleted: metrics.deleted.load(Ordering::Relaxed), 53 | last_modified: metrics.last_modified.load(Ordering::Relaxed), 54 | warmed_up: metrics.warmed_up.load(Ordering::Relaxed), 55 | cloudflare_blocked: metrics.cloudflare_blocked.load(Ordering::Relaxed), 56 | } 57 | } 58 | } 59 | 60 | #[derive(Serialize)] 61 | struct Info { 62 | name: String, 63 | #[serde(with = "humantime_serde")] 64 | uptime: Duration, 65 | #[serde(with = "humantime_serde")] 66 | started_at: Option, 67 | version: String, 68 | boards: SeaHashMap<&'static str, BoardMetrics>, 69 | all_boards: BoardMetrics, 70 | storage: SeaHashMap<&'static str, Box>, 71 | ok: bool, 72 | } 73 | 74 | #[derive(Serialize)] 75 | struct ErrorMessage { 76 | code: u16, 77 | message: String, 78 | } 79 | 80 | fn with_boards( 81 | db: Arc>>, 82 | ) -> impl Filter< 83 | Extract = (Arc>>,), 84 | Error = std::convert::Infallible, 85 | > + Clone { 86 | warp::any().map(move || db.clone()) 87 | } 88 | 89 | fn with_storage( 90 | db: Arc>>, 91 | ) -> impl Filter< 92 | Extract = (Arc>>,), 93 | Error = std::convert::Infallible, 94 | > + Clone { 95 | warp::any().map(move || db.clone()) 96 | } 97 | 98 | fn info( 99 | start_time: Instant, 100 | system_start: SystemTime, 101 | board: 
Arc>>, 102 | storage: Arc>>, 103 | ) -> impl Filter + Clone { 104 | warp::path::end() 105 | .and(warp::get()) 106 | .and(with_boards(board)) 107 | .and(with_storage(storage)) 108 | .and_then( 109 | move |board: Arc>>, 110 | storage: Arc>>| async move { 111 | let info = Info { 112 | name: "torako".into(), 113 | uptime: start_time.elapsed(), 114 | started_at: Some(system_start), 115 | version: crate_version!().into(), 116 | boards: board.iter().map(|b| (b.board, b.as_ref().into())).collect(), 117 | all_boards: board 118 | .iter() 119 | .fold(BoardMetrics::default(), |acc, x| acc + x.as_ref().into()), 120 | storage: stream::iter(storage.iter()) 121 | .then(|s| async move { (s.name(), s.metrics().await) }) 122 | .collect() 123 | .await, 124 | ok: true, 125 | }; 126 | Ok::<_, core::convert::Infallible>(warp::reply::json(&info)) 127 | }, 128 | ) 129 | } 130 | 131 | async fn handle_rejection( 132 | err: warp::Rejection, 133 | ) -> Result { 134 | let code; 135 | let message; 136 | 137 | if err.is_not_found() { 138 | code = StatusCode::NOT_FOUND; 139 | message = "NOT_FOUND".to_string(); 140 | } else if let Some(_) = err.find::() { 141 | message = "BAD_REQUEST".to_string(); 142 | code = StatusCode::BAD_REQUEST; 143 | } else if let Some(_) = err.find::() { 144 | code = StatusCode::METHOD_NOT_ALLOWED; 145 | message = "METHOD_NOT_ALLOWED".to_string(); 146 | } else { 147 | error!("unhandled rejection: {:?}", err); 148 | code = StatusCode::INTERNAL_SERVER_ERROR; 149 | message = "UNHANDLED_REJECTION".to_string(); 150 | } 151 | 152 | let json = warp::reply::json(&ErrorMessage { 153 | code: code.as_u16(), 154 | message: message.into(), 155 | }); 156 | 157 | Ok(warp::reply::with_status(json, code)) 158 | } 159 | 160 | pub fn serve( 161 | addr: SocketAddr, 162 | addr_interface: Option, 163 | board_metrics: Vec>, 164 | storage_metrics: Vec>, 165 | ) -> impl Future { 166 | let routes = info( 167 | Instant::now(), 168 | SystemTime::now(), 169 | Arc::new(board_metrics), 170 | Arc::new(storage_metrics), 171 | ); 172 | 173 | let addr = match addr_interface { 174 | #[cfg(target_family = "windows")] 175 | Some(_) => { 176 | warn!("Binding to interfaces is not possible on Windows"); 177 | addr 178 | } 179 | #[cfg(not(target_family = "windows"))] 180 | Some(interface) => match get_if_addrs::get_if_addrs() { 181 | Ok(interfaces) => { 182 | let port = addr.port(); 183 | if let Some(interface) = interfaces.iter().find(|&x| x.name == interface) { 184 | let addr = interface.ip(); 185 | (addr, port).into() 186 | } else { 187 | warn!("Failed to find network interface: {}", interface); 188 | warn!("API server will fallback to binding to address."); 189 | addr 190 | } 191 | } 192 | Err(err) => { 193 | warn!("Failed to query system interfaces: {}", err); 194 | warn!("API server will fallback to binding to address."); 195 | addr 196 | } 197 | }, 198 | None => addr, 199 | }; 200 | 201 | info!("Starting API server on: {}", addr); 202 | warp::serve(routes.recover(handle_rejection)).run(addr) 203 | } 204 | -------------------------------------------------------------------------------- /src/storage/asagi/storage/s3.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::path::Path; 3 | use std::pin::Pin; 4 | use std::sync::Arc; 5 | use std::task::{Context, Poll}; 6 | 7 | use bytes::Bytes; 8 | use futures::prelude::*; 9 | use log::error; 10 | use rusoto_core::{credential, HttpClient, Region, RusotoError}; 11 | use rusoto_s3::{S3Client, S3 as S3Trait}; 12 | use 
tokio::io::AsyncWrite; 13 | use tokio::sync::mpsc::UnboundedSender; 14 | 15 | use super::Error; 16 | 17 | struct File { 18 | sender: Option>>, 19 | upload: Pin< 20 | Box< 21 | dyn Future< 22 | Output = Result< 23 | Result>, 24 | tokio::task::JoinError, 25 | >, 26 | > + Send, 27 | >, 28 | >, 29 | } 30 | 31 | pub struct S3 { 32 | s3_client: Arc, 33 | acl: Option, 34 | bucket: String, 35 | check_exists: bool, 36 | } 37 | 38 | impl S3 { 39 | pub async fn new< 40 | T: AsRef, 41 | U: AsRef, 42 | V: AsRef, 43 | W: AsRef, 44 | X: AsRef, 45 | Y: AsRef, 46 | >( 47 | access_key: T, 48 | secret_access_key: U, 49 | bucket: V, 50 | region: W, 51 | endpoint: Option, 52 | acl: Option, 53 | check_exists: bool, 54 | ) -> Result { 55 | let provider = credential::StaticProvider::new_minimal( 56 | access_key.as_ref().into(), 57 | secret_access_key.as_ref().into(), 58 | ); 59 | let region = match endpoint { 60 | Some(endpoint) => Region::Custom { 61 | name: region.as_ref().into(), 62 | endpoint: endpoint.as_ref().trim_end_matches('/').to_string(), 63 | }, 64 | None => match region.as_ref().parse::() { 65 | Ok(r) => r, 66 | Err(_) => return Err(Error::S3("Invalid Region".into())), 67 | }, 68 | }; 69 | 70 | let s3 = Self { 71 | s3_client: Arc::new(S3Client::new_with( 72 | HttpClient::new().unwrap(), 73 | provider, 74 | region, 75 | )), 76 | bucket: String::from(bucket.as_ref()), 77 | acl: acl.map(|x| String::from(x.as_ref())), 78 | check_exists, 79 | }; 80 | 81 | Ok(s3) 82 | } 83 | } 84 | 85 | impl S3 { 86 | pub async fn exists>(&self, filepath: T) -> Result { 87 | if !self.check_exists { 88 | return Ok(false); 89 | } 90 | let head_object_req = rusoto_s3::HeadObjectRequest { 91 | bucket: self.bucket.clone(), 92 | key: filepath.as_ref().to_string_lossy().to_string(), 93 | ..Default::default() 94 | }; 95 | match self.s3_client.head_object(head_object_req).await { 96 | Ok(_) => Ok(true), 97 | Err(RusotoError::Service(_)) => Ok(false), 98 | Err(err) => { 99 | if let RusotoError::Unknown(ref resp) = err { 100 | // Rusoto doesn't seem to play nice with Backblaze here... 101 | if resp.status == reqwest::StatusCode::NOT_FOUND { 102 | return Ok(false); 103 | } 104 | error!( 105 | "Received an unknown error from S3. Check endpoint? 
Status: {} Headers: {:?} Body: {}", 106 | resp.status, resp.headers, resp.body.len() 107 | ); 108 | } 109 | Err(Error::Rusoto(Box::new(err))) 110 | } 111 | } 112 | } 113 | 114 | pub async fn open>( 115 | &self, 116 | filepath: T, 117 | size: usize, 118 | ) -> Result { 119 | let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); 120 | let put_object = rusoto_s3::PutObjectRequest { 121 | acl: self.acl.clone(), 122 | bucket: self.bucket.clone(), 123 | key: filepath.as_ref().to_string_lossy().to_string(), 124 | body: Some(rusoto_s3::StreamingBody::new(rx)), 125 | cache_control: Some("public, max-age=31104000".into()), 126 | content_length: Some(size as i64), 127 | content_type: Some( 128 | mime_guess::from_path(filepath.as_ref()) 129 | .first_or_octet_stream() 130 | .to_string(), 131 | ), 132 | ..Default::default() 133 | }; 134 | let client = self.s3_client.clone(); 135 | 136 | Ok(File { 137 | sender: Some(tx), 138 | upload: tokio::spawn(async move { client.put_object(put_object).await }).boxed(), 139 | }) 140 | } 141 | } 142 | 143 | impl AsyncWrite for File { 144 | fn poll_write( 145 | mut self: Pin<&mut Self>, 146 | cx: &mut Context<'_>, 147 | buf: &[u8], 148 | ) -> Poll> { 149 | let len = buf.len(); 150 | let x = self 151 | .sender 152 | .as_ref() 153 | .unwrap() 154 | .send(Ok(Bytes::copy_from_slice(buf))); 155 | match x { 156 | Ok(_) => Poll::Ready(Ok(len)), 157 | Err(_) => { 158 | //reqwest was dropped somehow 159 | match Pin::new(&mut self.upload).poll(cx) { 160 | Poll::Pending => return Poll::Pending, 161 | Poll::Ready(join_handle) => match join_handle { 162 | Ok(r) => match r { 163 | Ok(_) => Poll::Ready(Err(io::Error::new( 164 | io::ErrorKind::Other, 165 | Error::RequestEOF, 166 | ))), 167 | Err(err) => Poll::Ready(Err(io::Error::new( 168 | io::ErrorKind::Other, 169 | Error::Rusoto(Box::new(err)), 170 | ))), 171 | }, 172 | // maybe panic? 173 | Err(err) => Poll::Ready(Err(io::Error::new(io::ErrorKind::Other, err))), 174 | }, 175 | } 176 | } 177 | } 178 | } 179 | 180 | fn poll_flush(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { 181 | Poll::Ready(Ok(())) 182 | } 183 | 184 | fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { 185 | if let Some(c) = self.sender.take() { 186 | drop(c) 187 | } 188 | match Pin::new(&mut self.upload).poll(cx) { 189 | Poll::Pending => return Poll::Pending, 190 | Poll::Ready(join_handle) => match join_handle { 191 | Ok(r) => match r { 192 | Ok(_) => Poll::Ready(Ok(())), 193 | Err(err) => Poll::Ready(Err(io::Error::new( 194 | io::ErrorKind::Other, 195 | Error::Rusoto(Box::new(err)), 196 | ))), 197 | }, 198 | // Maybe panic? 
199 | Err(err) => Poll::Ready(Err(io::Error::new(io::ErrorKind::Other, err))), 200 | }, 201 | } 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /src/storage/search_lnx/builder.rs: -------------------------------------------------------------------------------- 1 | use std::{num::NonZeroUsize, sync::{ 2 | atomic::{AtomicBool, AtomicUsize, Ordering}, 3 | Arc, 4 | }}; 5 | use std::time::Duration; 6 | 7 | use futures::prelude::*; 8 | use futures::task::AtomicWaker; 9 | use log::info; 10 | 11 | use super::{Error, Search, SearchInner}; 12 | 13 | pub struct SearchBuilder { 14 | db_url: Option, 15 | index: String, 16 | inflight_posts: usize, 17 | fail_on_save_error: bool, 18 | retries_on_save_error: usize, 19 | authentication_key: String, 20 | commit_sync_interval: Duration, 21 | concurrent_requests: usize, 22 | request_timeout: Duration, 23 | } 24 | 25 | impl Default for SearchBuilder { 26 | fn default() -> Self { 27 | SearchBuilder { 28 | db_url: None, 29 | index: Default::default(), 30 | inflight_posts: usize::MAX, 31 | concurrent_requests: usize::MAX, 32 | fail_on_save_error: true, 33 | retries_on_save_error: 0, 34 | authentication_key: Default::default(), 35 | commit_sync_interval: Duration::from_secs(5), 36 | request_timeout: Duration::from_secs(120), 37 | } 38 | } 39 | } 40 | 41 | impl SearchBuilder { 42 | pub fn with_database(mut self, database_url: url::Url) -> Self { 43 | self.db_url = Some(database_url); 44 | self 45 | } 46 | 47 | pub fn with_index>(mut self, index: T) -> Self { 48 | self.index = String::from(index.as_ref()); 49 | self 50 | } 51 | 52 | pub fn commit_sync_interval(mut self, interval: Duration) -> Self { 53 | self.commit_sync_interval = interval; 54 | self 55 | } 56 | 57 | pub fn concurrent_requests(mut self, requests: NonZeroUsize) -> Self { 58 | self.concurrent_requests = requests.get(); 59 | self 60 | } 61 | 62 | pub fn authentication_key>(mut self, index: T) -> Self { 63 | self.authentication_key = String::from(index.as_ref()); 64 | self 65 | } 66 | 67 | pub fn max_inflight_posts(mut self, posts: usize) -> Self { 68 | self.inflight_posts = posts; 69 | self 70 | } 71 | 72 | pub fn fail_on_save_error(mut self, yes: bool) -> Self { 73 | self.fail_on_save_error = yes; 74 | self 75 | } 76 | 77 | pub fn request_timeout(mut self, timeout: Duration) -> Self { 78 | self.request_timeout = timeout; 79 | self 80 | } 81 | 82 | pub fn retries_on_save_error(mut self, retries: usize) -> Self { 83 | self.retries_on_save_error = retries; 84 | self 85 | } 86 | 87 | pub async fn build(self) -> Result { 88 | info!("Initializing Asagi Lnx Search Backend..."); 89 | if self.index.is_empty() { 90 | return Err(Error::InvalidIndex); 91 | } 92 | let mut headers = reqwest::header::HeaderMap::new(); 93 | if !self.authentication_key.is_empty() { 94 | headers.insert( 95 | "Authorization", 96 | reqwest::header::HeaderValue::from_str(&format!( 97 | "Bearer {}", 98 | self.authentication_key.as_str() 99 | )) 100 | .unwrap(), 101 | ); 102 | } 103 | let client = reqwest::Client::builder() 104 | .default_headers(headers) 105 | .timeout(self.request_timeout) 106 | .build() 107 | .unwrap(); 108 | 109 | // info!("Creating index (if needed)..."); 110 | // TODO 111 | 112 | let mut upload_url = self.db_url.clone().unwrap(); 113 | upload_url.set_path(&format!("/indexes/{}/documents", self.index)); 114 | upload_url.set_query(Some("wait=true")); 115 | 116 | let mut commit_url = self.db_url.clone().unwrap(); 117 | 
commit_url.set_path(&format!("/indexes/{}/commit", self.index)); 118 | 119 | let (process_tx, process_rx) = tokio::sync::mpsc::unbounded_channel(); 120 | let commit_lock = Arc::new(tokio::sync::RwLock::new(())); 121 | let dirty = Arc::new(AtomicBool::new(false)); 122 | let dirty_weak = Arc::downgrade(&dirty); 123 | let commit_sync_interval = self.commit_sync_interval; 124 | 125 | let search = SearchInner { 126 | client: client.clone(), 127 | upload_url, 128 | max_inflight_posts: self.inflight_posts, 129 | fail_on_save_error: self.fail_on_save_error, 130 | retries_on_save_error: self.retries_on_save_error, 131 | requests: tokio::sync::Semaphore::new(self.concurrent_requests), 132 | 133 | failed: AtomicBool::new(false), 134 | inflight_posts: AtomicUsize::new(0), 135 | waker: Arc::new(AtomicWaker::new()), 136 | flush_waker: Arc::new(AtomicWaker::new()), 137 | close_waker: Arc::new(AtomicWaker::new()), 138 | metrics: Arc::new(super::SearchMetrics::default()), 139 | process_tx, 140 | commit_lock: commit_lock.clone(), 141 | dirty, 142 | }; 143 | 144 | let search = Arc::new(search); 145 | let search2 = search.clone(); 146 | 147 | tokio::spawn( 148 | process_rx 149 | .take_while(|x| future::ready(x.is_some())) 150 | .zip(stream::repeat(search2)) 151 | .map(|(x, search2)| match x { 152 | Some(p) => search2.send_posts(p), 153 | None => unreachable!(), 154 | }) 155 | .buffer_unordered(usize::MAX) 156 | .for_each(|_| future::ready(())), 157 | ); 158 | tokio::spawn(async move { 159 | loop { 160 | tokio::time::delay_for(commit_sync_interval).await; 161 | if let Some(dirty) = dirty_weak.upgrade() { 162 | if dirty.load(Ordering::Relaxed) { 163 | let started = std::time::Instant::now(); 164 | log::trace!("[lnx] Starting commit"); 165 | let t = commit_lock.write().await; 166 | let r = client 167 | .post(commit_url.clone()) 168 | .send() 169 | .and_then(|resp| futures::future::ready(resp.error_for_status())) 170 | .await; 171 | drop(t); 172 | if let Err(err) = r { 173 | log::warn!("Failed to commit lnx search index: {}", err); 174 | } else { 175 | log::trace!("[lnx] Commit completed, took {}ms", started.elapsed().as_secs_f32() / 1000.); 176 | dirty.store(false, Ordering::Relaxed) 177 | } 178 | } 179 | } else { 180 | break; 181 | } 182 | } 183 | }); 184 | 185 | Ok(Search { inner: search }) 186 | } 187 | } 188 | 189 | impl From<&crate::config::AsagiLnxSearch> for SearchBuilder { 190 | fn from(config: &crate::config::AsagiLnxSearch) -> Self { 191 | let mut builder = SearchBuilder::default(); 192 | builder = builder.with_database(config.database_url.clone()); 193 | builder = builder.with_index(config.index.as_str()); 194 | if let Some(inflight_posts) = config.inflight_posts { 195 | builder = builder.max_inflight_posts(inflight_posts.into()); 196 | } 197 | if let Some(fail_on_save_error) = config.fail_on_save_error { 198 | builder = builder.fail_on_save_error(fail_on_save_error); 199 | } 200 | if let Some(retries_on_save_error) = config.retries_on_save_error { 201 | builder = builder.retries_on_save_error(retries_on_save_error); 202 | } 203 | if let Some(authentication_key) = config.authentication_key.as_ref() { 204 | builder = builder.authentication_key(authentication_key); 205 | } 206 | if let Some(commit_sync_interval) = config.commit_sync_interval.as_ref() { 207 | builder = builder.commit_sync_interval(*commit_sync_interval); 208 | } 209 | if let Some(concurrent_requests) = config.concurrent_requests.as_ref() { 210 | builder = builder.concurrent_requests(*concurrent_requests); 211 | } 212 | if let 
Some(request_timeout) = config.request_timeout.as_ref() { 213 | builder = builder.request_timeout(*request_timeout); 214 | } 215 | 216 | builder 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/storage/asagi/storage/filesystem.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::OsStr; 2 | use std::io; 3 | use std::path::{Path, PathBuf}; 4 | use std::pin::Pin; 5 | use std::task::{Context, Poll}; 6 | 7 | #[cfg(not(target_family = "windows"))] 8 | use std::os::unix::fs::PermissionsExt; 9 | 10 | use futures::prelude::*; 11 | use rand::distributions::Alphanumeric; 12 | use rand::Rng; 13 | use tokio::io::AsyncWrite; 14 | 15 | use super::Error; 16 | 17 | enum FileState { 18 | Open, 19 | Rename(Pin> + Send>>), 20 | #[allow(dead_code)] 21 | Permission(Pin> + Send>>), 22 | } 23 | 24 | impl Default for FileState { 25 | fn default() -> Self { 26 | FileState::Open 27 | } 28 | } 29 | 30 | struct File { 31 | filename: PathBuf, 32 | tempname: PathBuf, 33 | inner: tokio::fs::File, 34 | state: FileState, 35 | 36 | #[cfg(target_family = "windows")] 37 | #[allow(dead_code)] 38 | media_group: (), 39 | #[cfg(not(target_family = "windows"))] 40 | media_group: Option, 41 | } 42 | 43 | pub struct FileSystem { 44 | media_path: PathBuf, 45 | tmp_dir: PathBuf, 46 | 47 | #[cfg(target_family = "windows")] 48 | #[allow(dead_code)] 49 | media_group: (), 50 | #[cfg(not(target_family = "windows"))] 51 | media_group: Option, 52 | 53 | #[cfg(all(feature = "io-uring", target_os = "linux"))] 54 | io_ring: rio::Rio, 55 | 56 | #[cfg(not(all(feature = "io-uring", target_os = "linux")))] 57 | #[allow(dead_code)] 58 | io_ring: (), 59 | } 60 | 61 | impl FileSystem { 62 | #[cfg(all(feature = "io-uring", target_os = "linux"))] 63 | fn io_ring() -> rio::Rio { 64 | rio::new().expect("create uring") 65 | } 66 | 67 | #[cfg(not(all(feature = "io-uring", target_os = "linux")))] 68 | fn io_ring() -> () { 69 | () 70 | } 71 | 72 | #[cfg(target_family = "windows")] 73 | fn group>(_group: Option) -> Result<(), Error> { 74 | Ok(()) 75 | } 76 | 77 | #[cfg(not(target_family = "windows"))] 78 | fn group>(group: Option) -> Result, Error> { 79 | group 80 | .as_ref() 81 | .map(|g| users::get_group_by_name(g).ok_or(Error::InvalidUserGroup)) 82 | .transpose() 83 | } 84 | 85 | pub async fn new, U: AsRef, V: AsRef>( 86 | media_path: T, 87 | tmp_dir: U, 88 | media_group: Option, 89 | ) -> Result { 90 | let tmp_dir = tmp_dir.as_ref(); 91 | 92 | match tokio::fs::metadata(&tmp_dir).await { 93 | Ok(m) => { 94 | if !m.is_dir() { 95 | return Err(Error::InvalidTempDir(tmp_dir.to_owned())); 96 | } 97 | } 98 | Err(_) => tokio::fs::create_dir_all(&tmp_dir).await?, 99 | }; 100 | 101 | Ok(Self { 102 | io_ring: Self::io_ring(), 103 | media_path: PathBuf::from(media_path.as_ref()), 104 | tmp_dir: PathBuf::from(tmp_dir), 105 | media_group: Self::group(media_group)?, 106 | }) 107 | } 108 | } 109 | 110 | impl FileSystem { 111 | pub async fn exists>(&self, filepath: T) -> Result { 112 | Ok(tokio::fs::metadata(filepath).await.is_ok()) 113 | } 114 | 115 | pub async fn open>( 116 | &self, 117 | filepath: T, 118 | _size: usize, 119 | ) -> Result { 120 | let filename = self.media_path.join(filepath.as_ref()); 121 | let subdir = filename.parent().unwrap(); 122 | 123 | if tokio::fs::metadata(&subdir).await.is_err() { 124 | let parent = subdir.parent().unwrap(); 125 | let has_parent = parent.exists(); 126 | if let Err(err) = tokio::fs::create_dir_all(&subdir).await { 127 | 
return Err(Error::from(err)); 128 | } 129 | #[cfg(target_family = "windows")] 130 | { 131 | // get rid of warning on windows build 132 | let _has_parent = has_parent; 133 | } 134 | #[cfg(not(target_family = "windows"))] 135 | if let Some(group) = &self.media_group { 136 | if !has_parent { 137 | tokio::fs::set_permissions(parent, std::fs::Permissions::from_mode(0o755)) 138 | .await?; 139 | let _ = nix::unistd::chown( 140 | parent, 141 | None, 142 | Some(nix::unistd::Gid::from_raw(group.gid())), 143 | ); 144 | } 145 | tokio::fs::set_permissions(&subdir, std::fs::Permissions::from_mode(0o755)).await?; 146 | let _ = 147 | nix::unistd::chown(subdir, None, Some(nix::unistd::Gid::from_raw(group.gid()))); 148 | } 149 | } 150 | 151 | let tempname = self.tmp_dir.join( 152 | rand::thread_rng() 153 | .sample_iter(&Alphanumeric) 154 | .take(16) 155 | .collect::(), 156 | ); 157 | 158 | let file = tokio::fs::OpenOptions::new() 159 | .write(true) 160 | .create(true) 161 | .truncate(true) 162 | .open(&tempname) 163 | .await?; 164 | Ok(File { 165 | filename, 166 | tempname, 167 | inner: file, 168 | state: FileState::default(), 169 | media_group: self.media_group.clone(), 170 | }) 171 | } 172 | } 173 | 174 | #[cfg(not(all(feature = "io-uring", target_os = "linux")))] 175 | impl AsyncWrite for File { 176 | fn poll_write( 177 | mut self: Pin<&mut Self>, 178 | cx: &mut Context<'_>, 179 | buf: &[u8], 180 | ) -> Poll> { 181 | Pin::new(&mut self.inner).poll_write(cx, buf) 182 | } 183 | 184 | fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { 185 | Pin::new(&mut self.inner).poll_flush(cx) 186 | } 187 | 188 | fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { 189 | loop { 190 | match &mut self.state { 191 | FileState::Open => match Pin::new(&mut self.inner).poll_shutdown(cx) { 192 | Poll::Pending => return Poll::Pending, 193 | Poll::Ready(r) => match r { 194 | Ok(_) => { 195 | self.state = FileState::Rename( 196 | tokio::fs::rename(self.tempname.clone(), self.filename.clone()) 197 | .boxed(), 198 | ); 199 | continue; 200 | } 201 | Err(err) => return Poll::Ready(Err(err)), 202 | }, 203 | }, 204 | FileState::Rename(fut) => match fut.as_mut().poll(cx) { 205 | Poll::Pending => return Poll::Pending, 206 | Poll::Ready(r) => match r { 207 | Ok(_) => { 208 | #[cfg(not(target_family = "windows"))] 209 | if let Some(_) = &self.media_group { 210 | self.state = FileState::Permission( 211 | tokio::fs::set_permissions( 212 | self.filename.clone(), 213 | std::fs::Permissions::from_mode(0o644), 214 | ) 215 | .boxed(), 216 | ); 217 | continue; 218 | } 219 | return Poll::Ready(Ok(())); 220 | } 221 | Err(err) => return Poll::Ready(Err(err)), 222 | }, 223 | }, 224 | FileState::Permission(fut) => match fut.as_mut().poll(cx) { 225 | Poll::Pending => return Poll::Pending, 226 | Poll::Ready(r) => match r { 227 | Ok(_) => { 228 | #[cfg(not(target_family = "windows"))] 229 | if let Some(group) = &self.media_group { 230 | let _ = nix::unistd::chown( 231 | &self.filename, 232 | None, 233 | Some(nix::unistd::Gid::from_raw(group.gid())), 234 | ); 235 | } 236 | 237 | return Poll::Ready(Ok(())); 238 | } 239 | Err(err) => return Poll::Ready(Err(err)), 240 | }, 241 | }, 242 | } 243 | } 244 | } 245 | } 246 | 247 | #[cfg(all(feature = "io-uring", target_os = "linux"))] 248 | impl AsyncWrite for File { 249 | fn poll_write( 250 | mut self: Pin<&mut Self>, 251 | cx: &mut Context<'_>, 252 | buf: &[u8], 253 | ) -> Poll> { 254 | unimplemented!() 255 | } 256 | 257 | fn poll_flush(mut self: Pin<&mut Self>, cx: 
&mut Context<'_>) -> Poll> { 258 | unimplemented!() 259 | } 260 | 261 | fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { 262 | unimplemented!() 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /src/storage/search_lnx/mod.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | use std::sync::{ 3 | atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, 4 | Arc, 5 | }; 6 | use std::task::{Context, Poll}; 7 | use std::time::{Duration, Instant}; 8 | 9 | use backoff::backoff::Backoff; 10 | use futures::prelude::*; 11 | use futures::task::AtomicWaker; 12 | use log::{debug, error, warn}; 13 | use serde::Serialize; 14 | use thiserror::Error; 15 | use tokio::sync::{RwLock, Semaphore}; 16 | 17 | mod builder; 18 | mod post; 19 | 20 | use crate::imageboard; 21 | pub use builder::SearchBuilder; 22 | 23 | #[derive(Debug, Error)] 24 | pub enum Error { 25 | #[error("invalid index name")] 26 | InvalidIndex, 27 | #[error("A fatal error occured when trying to archive posts")] 28 | ArchiveError, 29 | #[error("database error: {}", .0)] 30 | DB(reqwest::Error), 31 | #[error("io error: {}", .0)] 32 | IO(#[from] std::io::Error), 33 | } 34 | 35 | struct SearchInner { 36 | client: reqwest::Client, 37 | upload_url: url::Url, 38 | max_inflight_posts: usize, 39 | fail_on_save_error: bool, 40 | retries_on_save_error: usize, 41 | commit_lock: Arc>, 42 | dirty: Arc, 43 | 44 | failed: AtomicBool, 45 | inflight_posts: AtomicUsize, 46 | waker: Arc, 47 | flush_waker: Arc, 48 | close_waker: Arc, 49 | metrics: Arc, 50 | requests: Semaphore, 51 | process_tx: tokio::sync::mpsc::UnboundedSender>>, 52 | } 53 | 54 | #[derive(Debug, Serialize)] 55 | pub struct Metrics { 56 | pub posts: u64, 57 | pub avg_insert_time_ms: f64, 58 | pub save_errors: u64, 59 | pub inflight: u64, 60 | } 61 | 62 | #[derive(Default, Debug)] 63 | struct SearchMetrics { 64 | posts: AtomicU64, 65 | queries: AtomicU64, 66 | query_time_ns: AtomicU64, 67 | save_errors: AtomicU64, 68 | } 69 | 70 | impl SearchMetrics { 71 | pub fn incr_posts(&self, count: u64) { 72 | self.posts.fetch_add(count, Ordering::Relaxed); 73 | } 74 | 75 | pub fn incr_query_time(&self, dur: Duration) { 76 | self.queries.fetch_add(1, Ordering::Relaxed); 77 | self.query_time_ns 78 | .fetch_add(dur.as_nanos() as u64, Ordering::Relaxed); 79 | } 80 | 81 | pub fn incr_save_error(&self, count: u64) { 82 | self.save_errors.fetch_add(count, Ordering::Relaxed); 83 | } 84 | } 85 | 86 | #[derive(Clone)] 87 | pub struct SearchMetricsProvider { 88 | inner: Arc, 89 | } 90 | 91 | impl super::MetricsProvider for SearchMetricsProvider { 92 | fn name(&self) -> &'static str { 93 | "lnx_search" 94 | } 95 | 96 | fn metrics( 97 | &self, 98 | ) -> Pin> + Send>> 99 | { 100 | let queries = self.inner.metrics.queries.load(Ordering::Acquire) as f64; 101 | let tt = self.inner.metrics.query_time_ns.load(Ordering::Acquire) as f64; 102 | let m = Metrics { 103 | posts: self.inner.metrics.posts.load(Ordering::Acquire), 104 | inflight: self.inner.inflight_posts.load(Ordering::Acquire) as _, 105 | avg_insert_time_ms: queries / tt * 1_000_000., 106 | save_errors: self.inner.metrics.save_errors.load(Ordering::Acquire), 107 | }; 108 | let m: Box = Box::new(m); 109 | futures::future::ready(m).boxed() 110 | } 111 | } 112 | 113 | #[must_use = "futures do nothing unless you `.await` or poll them"] 114 | pub struct Search { 115 | inner: Arc, 116 | } 117 | 118 | impl Search { 119 | 
#[allow(dead_code)] 120 | pub fn builder() -> SearchBuilder { 121 | SearchBuilder::default() 122 | } 123 | 124 | pub fn metrics_provider(&self) -> impl super::MetricsProvider { 125 | SearchMetricsProvider { 126 | inner: self.inner.clone(), 127 | } 128 | } 129 | } 130 | 131 | impl SearchInner { 132 | async fn save_posts(&self, item: Vec) -> Result<(), Error> { 133 | let posts = item.iter() 134 | .filter(|p| !p.deleted) // TODO 135 | .map(|p| p.into()).collect::>(); 136 | let delete_posts = { 137 | let field = posts.iter().map(|x| x.tuid).collect::>(); 138 | post::DeletePost::new(field) 139 | }; 140 | let mut err = None; 141 | let mut backoff = backoff::ExponentialBackoff::default(); 142 | backoff.max_elapsed_time = None; 143 | let start = Instant::now(); 144 | let rows = posts.len(); 145 | for _ in 0..=self.retries_on_save_error { 146 | let permit = self.requests.acquire().await; 147 | let t = self.commit_lock.read().await; 148 | let r = self 149 | .client 150 | .delete(self.upload_url.clone()) 151 | .json(&delete_posts) 152 | .send() 153 | .and_then(|resp| futures::future::ready(resp.error_for_status())) 154 | .and_then(|_| { 155 | self.client 156 | .post(self.upload_url.clone()) 157 | .json(&posts) 158 | .send() 159 | .and_then(|resp| futures::future::ready(resp.error_for_status())) 160 | }) 161 | .await; 162 | drop(permit); 163 | drop(t); 164 | if let Err(e) = r { 165 | log::warn!("Failed to insert data into lnx: {}", e); 166 | err = Some(Err(Error::DB(e))); 167 | if let Some(b) = backoff.next_backoff() { 168 | tokio::time::delay_for(b).await; 169 | } 170 | continue; 171 | } 172 | 173 | self.metrics.incr_posts(rows as u64); 174 | self.metrics.incr_query_time(start.elapsed()); 175 | self.notify_post(rows); 176 | self.dirty.store(true, Ordering::Relaxed); 177 | return Ok(()); 178 | } 179 | self.notify_post(rows); 180 | return err.unwrap(); 181 | } 182 | 183 | async fn send_posts(self: Arc, item: Vec) { 184 | let board = item[0].board; 185 | let thread_no = item[0].thread_no(); 186 | let post_no = item[0].no; 187 | let sz = item.len(); 188 | match self.save_posts(item).await { 189 | Ok(_) => debug!( 190 | "Flushed {} posts to lnx. [First]: {}/{}/{}", 191 | sz, board, thread_no, post_no 192 | ), 193 | Err(err) => { 194 | error!( 195 | "Failed to save data for {} posts [First]: {}/{}/{}: {}", 196 | sz, board, thread_no, post_no, err 197 | ); 198 | if !self.fail_on_save_error { 199 | warn!("Some posts were unable to be archived, however the error isn't being treated as fatal. 
Some posts may be lost.") 200 | } 201 | self.metrics.incr_save_error(1); 202 | self.failed.store(true, Ordering::SeqCst); 203 | } 204 | } 205 | } 206 | 207 | fn notify_post(&self, no_posts: usize) { 208 | let old = self.inflight_posts.fetch_sub(no_posts, Ordering::AcqRel); 209 | let curr = old - no_posts; 210 | if curr < self.max_inflight_posts { 211 | self.waker.wake(); 212 | } 213 | if curr == 0 { 214 | self.flush_waker.wake(); 215 | self.close_waker.wake(); 216 | } 217 | } 218 | 219 | fn is_ready(&self) -> bool { 220 | let posts = self.inflight_posts.load(Ordering::Acquire); 221 | posts < self.max_inflight_posts 222 | } 223 | 224 | fn is_empty(&self) -> bool { 225 | let posts = self.inflight_posts.load(Ordering::Acquire); 226 | posts == 0 227 | } 228 | 229 | fn has_failed(&self) -> bool { 230 | return self.fail_on_save_error && self.failed.load(Ordering::Relaxed); 231 | } 232 | } 233 | 234 | impl Sink> for Search { 235 | type Error = Error; 236 | 237 | fn poll_ready(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 238 | if self.inner.has_failed() { 239 | return Poll::Ready(Err(Error::ArchiveError)); 240 | } 241 | self.inner.waker.register(cx.waker()); 242 | match self.inner.is_ready() { 243 | true => Poll::Ready(Ok(())), 244 | false => Poll::Pending, 245 | } 246 | } 247 | 248 | fn start_send(self: Pin<&mut Self>, item: Vec) -> Result<(), Self::Error> { 249 | if item.len() > 0 { 250 | self.inner 251 | .inflight_posts 252 | .fetch_add(item.len(), Ordering::AcqRel); 253 | self.inner.process_tx.send(Some(item)).unwrap(); 254 | } 255 | Ok(()) 256 | } 257 | 258 | fn poll_flush(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 259 | if self.inner.has_failed() { 260 | return Poll::Ready(Err(Error::ArchiveError)); 261 | } 262 | self.inner.flush_waker.register(cx.waker()); 263 | match self.inner.is_empty() { 264 | true => Poll::Ready(Ok(())), 265 | false => Poll::Pending, 266 | } 267 | } 268 | 269 | fn poll_close(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 270 | let _ = self.inner.process_tx.send(None); 271 | if self.inner.has_failed() { 272 | return Poll::Ready(Err(Error::ArchiveError)); 273 | } 274 | self.inner.close_waker.register(cx.waker()); 275 | match self.inner.is_empty() { 276 | true => Poll::Ready(Ok(())), 277 | false => Poll::Pending, 278 | } 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /src/storage/search_pg/mod.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use std::pin::Pin; 3 | use std::sync::{ 4 | atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, 5 | Arc, 6 | }; 7 | use std::task::{Context, Poll}; 8 | use std::time::{Duration, Instant}; 9 | 10 | use backoff::backoff::Backoff; 11 | use futures::prelude::*; 12 | use futures::task::AtomicWaker; 13 | use log::{debug, error, warn}; 14 | use memchr::memchr; 15 | use serde::Serialize; 16 | use thiserror::Error; 17 | use tokio_postgres::types::ToSql; 18 | 19 | mod arena; 20 | mod builder; 21 | mod placeholders; 22 | 23 | use crate::imageboard; 24 | pub use builder::SearchBuilder; 25 | pub use placeholders::PLACEHOLDERS; 26 | 27 | #[derive(Debug, Error)] 28 | pub enum Error { 29 | #[error("invalid database pool size")] 30 | InvalidPoolSize, 31 | #[error("invalid database URL provided: {}", .0)] 32 | InvalidDatabase(tokio_postgres::Error), 33 | #[error("A fatal error occured when trying to archive posts")] 34 | ArchiveError, 35 | #[error("database connection error: {}", .0)] 36 | Pool(#[from] 
deadpool_postgres::PoolError), 37 | #[error("database error: {}", .0)] 38 | DB(#[from] tokio_postgres::Error), 39 | #[error("io error: {}", .0)] 40 | IO(#[from] std::io::Error), 41 | } 42 | 43 | struct SearchInner { 44 | db_pool: deadpool_postgres::Pool, 45 | max_inflight_posts: usize, 46 | fail_on_save_error: bool, 47 | retries_on_save_error: usize, 48 | 49 | failed: AtomicBool, 50 | inflight_posts: AtomicUsize, 51 | waker: Arc, 52 | flush_waker: Arc, 53 | close_waker: Arc, 54 | metrics: Arc, 55 | process_tx: tokio::sync::mpsc::UnboundedSender>>, 56 | } 57 | 58 | #[derive(Debug, Serialize)] 59 | pub struct Metrics { 60 | pub posts: u64, 61 | pub avg_insert_time_ms: f64, 62 | pub save_errors: u64, 63 | } 64 | 65 | #[derive(Default, Debug)] 66 | struct SearchMetrics { 67 | posts: AtomicU64, 68 | queries: AtomicU64, 69 | query_time_ns: AtomicU64, 70 | save_errors: AtomicU64, 71 | } 72 | 73 | impl SearchMetrics { 74 | pub fn incr_posts(&self, count: u64) { 75 | self.posts.fetch_add(count, Ordering::Relaxed); 76 | } 77 | 78 | pub fn incr_query_time(&self, dur: Duration) { 79 | self.queries.fetch_add(1, Ordering::Relaxed); 80 | self.query_time_ns 81 | .fetch_add(dur.as_nanos() as u64, Ordering::Relaxed); 82 | } 83 | 84 | pub fn incr_save_error(&self, count: u64) { 85 | self.save_errors.fetch_add(count, Ordering::Relaxed); 86 | } 87 | } 88 | 89 | #[derive(Clone)] 90 | pub struct SearchMetricsProvider { 91 | inner: Arc, 92 | } 93 | 94 | impl super::MetricsProvider for SearchMetricsProvider { 95 | fn name(&self) -> &'static str { 96 | "pg_search" 97 | } 98 | 99 | fn metrics( 100 | &self, 101 | ) -> Pin> + Send>> 102 | { 103 | let queries = self.inner.metrics.queries.load(Ordering::Acquire) as f64; 104 | let tt = self.inner.metrics.query_time_ns.load(Ordering::Acquire) as f64; 105 | let m = Metrics { 106 | posts: self.inner.metrics.posts.load(Ordering::Acquire), 107 | avg_insert_time_ms: queries / tt * 1_000_000., 108 | save_errors: self.inner.metrics.save_errors.load(Ordering::Acquire), 109 | }; 110 | let m: Box = Box::new(m); 111 | futures::future::ready(m).boxed() 112 | } 113 | } 114 | 115 | #[must_use = "futures do nothing unless you `.await` or poll them"] 116 | pub struct Search { 117 | inner: Arc, 118 | } 119 | 120 | impl Search { 121 | #[allow(dead_code)] 122 | pub fn builder() -> SearchBuilder { 123 | SearchBuilder::default() 124 | } 125 | 126 | pub fn metrics_provider(&self) -> impl super::MetricsProvider { 127 | SearchMetricsProvider { 128 | inner: self.inner.clone(), 129 | } 130 | } 131 | } 132 | 133 | impl SearchInner { 134 | async fn save_posts(&self, mut item: Vec) -> Result<(), Error> { 135 | let client = self.db_pool.get().await?; 136 | while item.len() > 0 { 137 | let start = Instant::now(); 138 | // Postgres only supports a maximum of 2^15 params 139 | let (remain, posts) = if item.len() > 1280 { 140 | let remain = item.split_off(1280); 141 | (remain, item) 142 | } else { 143 | (vec![], item) 144 | }; 145 | item = remain; 146 | let rows = posts.len(); 147 | let query = "INSERT INTO 148 | posts 149 | (board, thread_no, post_no, subject, username, tripcode, 150 | email, unique_id, since4_pass, country, filename, 151 | image_hash, image_width, image_height, ts, comment, deleted, 152 | ghost, sticky, spoiler, op, capcode) VALUES "; 153 | let stmt = std::iter::once(Cow::Borrowed(query)) 154 | .chain((0..rows).map(|i| { 155 | let z = i * 22; 156 | Cow::Owned( 157 | [ 158 | if i == 0 { "(" } else { "\n,(" }, 159 | PLACEHOLDERS[z], // board 160 | ",", 161 | PLACEHOLDERS[z + 1], // 
thread_no 162 | ",", 163 | PLACEHOLDERS[z + 2], // post_no 164 | ",to_tsvector(", 165 | PLACEHOLDERS[z + 3], // subject 166 | "),to_tsvector(", 167 | PLACEHOLDERS[z + 4], // username 168 | "),to_tsvector(", 169 | PLACEHOLDERS[z + 5], // tripcode 170 | "),to_tsvector(", 171 | PLACEHOLDERS[z + 6], // email 172 | "),", 173 | PLACEHOLDERS[z + 7], // unique_id 174 | ",", 175 | PLACEHOLDERS[z + 8], // since4_pass 176 | ",", 177 | PLACEHOLDERS[z + 9], // country 178 | ",to_tsvector(REPLACE(", 179 | PLACEHOLDERS[z + 10], // filename 180 | ",'.',' ')),", 181 | PLACEHOLDERS[z + 11], // image_hash 182 | ",", 183 | PLACEHOLDERS[z + 12], // image_width 184 | ",", 185 | PLACEHOLDERS[z + 13], // image_height 186 | ",TO_TIMESTAMP(CAST(", 187 | PLACEHOLDERS[z + 14], // ts 188 | "::INT8 AS FLOAT8)),to_tsvector(", 189 | PLACEHOLDERS[z + 15], // comment 190 | "),", 191 | PLACEHOLDERS[z + 16], // deleted 192 | ",", 193 | PLACEHOLDERS[z + 17], // ghost 194 | ",", 195 | PLACEHOLDERS[z + 18], // sticky 196 | ",", 197 | PLACEHOLDERS[z + 19], // spoiler 198 | ",", 199 | PLACEHOLDERS[z + 20], // op 200 | ",CAST(", 201 | PLACEHOLDERS[z + 21], // capcode 202 | "::INT8 AS INT4))", 203 | ] 204 | .join(""), 205 | ) 206 | })) 207 | .chain(std::iter::once(Cow::Borrowed( 208 | " ON CONFLICT (board, post_no) DO UPDATE SET 209 | deleted = EXCLUDED.deleted, 210 | sticky = EXCLUDED.sticky, 211 | comment = COALESCE(EXCLUDED.comment, posts.comment); 212 | ", 213 | ))) 214 | .collect::(); 215 | 216 | let i64_rena = arena::Arena::new(posts.len() * 4); 217 | let str_rena = arena::Arena::new(posts.len() * 4); 218 | 219 | let params = (0..posts.len()) 220 | .into_iter() 221 | .map(|i| { 222 | let values: Box<[&(dyn ToSql + Sync)]> = Box::new([ 223 | str_rena.alloc(Some(posts[i].board.to_string())), 224 | i64_rena.alloc(Some(posts[i].thread_no() as i64)), 225 | i64_rena.alloc(Some(posts[i].no as i64)), 226 | &posts[i].sub, 227 | &posts[i].name, 228 | &posts[i].trip, 229 | &posts[i].email, 230 | &posts[i].id, 231 | &posts[i].since4pass, 232 | str_rena.alloc(posts[i].poster_country()), 233 | str_rena.alloc(posts[i].media_filename()), 234 | &posts[i].md5, 235 | &posts[i].w, 236 | &posts[i].h, 237 | i64_rena.alloc(Some(posts[i].time as i64)), 238 | str_rena.alloc(posts[i].comment().map(|x| str_sanitize(x))), 239 | &posts[i].deleted, 240 | &false, 241 | &posts[i].sticky, 242 | &posts[i].spoiler, 243 | if posts[i].is_op() { &true } else { &false }, 244 | i64_rena.alloc(posts[i].short_capcode().chars().next().map(|c| c as i64)), 245 | ]); 246 | values.into_vec() 247 | }) 248 | .flatten() 249 | .collect::>(); 250 | 251 | let mut attempts = 0; 252 | let mut backoff = backoff::ExponentialBackoff::default(); 253 | backoff.max_elapsed_time = None; 254 | loop { 255 | let r = client.execute(stmt.as_str(), ¶ms).await; 256 | match r { 257 | Ok(_) => break, 258 | Err(err) => { 259 | if attempts >= self.retries_on_save_error { 260 | return Err(Error::from(err)); 261 | } 262 | attempts += 1; 263 | if let Some(b) = backoff.next_backoff() { 264 | tokio::time::delay_for(b).await; 265 | } 266 | continue; 267 | } 268 | } 269 | } 270 | self.metrics.incr_posts(rows as u64); 271 | self.metrics.incr_query_time(start.elapsed()); 272 | self.notify_post(rows); 273 | 274 | // Since values contains references to data in the 'renas, 275 | // the values must be dropped before we drop the 'renas 276 | drop(params); 277 | drop(i64_rena); 278 | drop(str_rena); 279 | } 280 | 281 | Ok(()) 282 | } 283 | 284 | async fn send_posts(self: Arc, item: Vec) { 285 | let board = 
item[0].board; 286 | let thread_no = item[0].thread_no(); 287 | let post_no = item[0].no; 288 | let sz = item.len(); 289 | match self.save_posts(item).await { 290 | Ok(_) => debug!( 291 | "Flushed {} posts to postgres. [First]: {}/{}/{}", 292 | sz, board, thread_no, post_no 293 | ), 294 | Err(err) => { 295 | error!( 296 | "Failed to save data for {} posts [First]: {}/{}/{}: {}", 297 | sz, board, thread_no, post_no, err 298 | ); 299 | if !self.fail_on_save_error { 300 | warn!("Some posts were unable to be archived, however the error isn't being treated as fatal. Some posts may be lost.") 301 | } 302 | self.metrics.incr_save_error(1); 303 | self.failed.store(true, Ordering::SeqCst); 304 | } 305 | } 306 | } 307 | 308 | fn notify_post(&self, no_posts: usize) { 309 | let old = self.inflight_posts.fetch_sub(no_posts, Ordering::AcqRel); 310 | let curr = old - no_posts; 311 | if curr < self.max_inflight_posts { 312 | self.waker.wake(); 313 | } 314 | if curr == 0 { 315 | self.flush_waker.wake(); 316 | self.close_waker.wake(); 317 | } 318 | } 319 | 320 | fn is_ready(&self) -> bool { 321 | let posts = self.inflight_posts.load(Ordering::Acquire); 322 | posts < self.max_inflight_posts 323 | } 324 | 325 | fn is_empty(&self) -> bool { 326 | let posts = self.inflight_posts.load(Ordering::Acquire); 327 | posts == 0 328 | } 329 | 330 | fn has_failed(&self) -> bool { 331 | return self.fail_on_save_error && self.failed.load(Ordering::Relaxed); 332 | } 333 | } 334 | 335 | impl Sink> for Search { 336 | type Error = Error; 337 | 338 | fn poll_ready(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 339 | if self.inner.has_failed() { 340 | return Poll::Ready(Err(Error::ArchiveError)); 341 | } 342 | self.inner.waker.register(cx.waker()); 343 | match self.inner.is_ready() { 344 | true => Poll::Ready(Ok(())), 345 | false => Poll::Pending, 346 | } 347 | } 348 | 349 | fn start_send(self: Pin<&mut Self>, item: Vec) -> Result<(), Self::Error> { 350 | if item.len() > 0 { 351 | self.inner 352 | .inflight_posts 353 | .fetch_add(item.len(), Ordering::AcqRel); 354 | self.inner.process_tx.send(Some(item)).unwrap(); 355 | } 356 | Ok(()) 357 | } 358 | 359 | fn poll_flush(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 360 | if self.inner.has_failed() { 361 | return Poll::Ready(Err(Error::ArchiveError)); 362 | } 363 | self.inner.flush_waker.register(cx.waker()); 364 | match self.inner.is_empty() { 365 | true => Poll::Ready(Ok(())), 366 | false => Poll::Pending, 367 | } 368 | } 369 | 370 | fn poll_close(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 371 | let _ = self.inner.process_tx.send(None); 372 | if self.inner.has_failed() { 373 | return Poll::Ready(Err(Error::ArchiveError)); 374 | } 375 | self.inner.close_waker.register(cx.waker()); 376 | match self.inner.is_empty() { 377 | true => Poll::Ready(Ok(())), 378 | false => Poll::Pending, 379 | } 380 | } 381 | } 382 | 383 | fn str_sanitize(input: String) -> String { 384 | match memchr(0, input.as_bytes()) { 385 | Some(_) => input.replace(char::from(0), ""), 386 | None => input, 387 | } 388 | } 389 | -------------------------------------------------------------------------------- /Torako.sample.toml: -------------------------------------------------------------------------------- 1 | ## HTTP API Server Address for Torako. If this is omitted or 0, 2 | ## the API server will be disabled. 3 | api_addr = "127.0.0.1:2377" 4 | 5 | ## Instead of listening on an IP address, torako can be 6 | ## configured to listen on an interface. 
(UNIX only) 7 | #api_addr_interface = "eth0" 8 | 9 | ## Global Imageboard API Rate Limit in Requests/Second. 10 | ## This rate limit only affects downloading the JSON threads/posts 11 | ## and will ensure API requests are throttled appropriately. 12 | ## If this option is not specified, there will be no rate limiting 13 | #rate_limit = 10 14 | 15 | ## Global board thread concurrency. This is a limit on the number of 16 | ## threads that will be retrieved and buffered at once. If this limit 17 | ## is exceeded then no more threads will be downloaded until the current 18 | ## threads are cleared, and those threads that weren't able to be downloaded 19 | ## because of this limit will be retried after the next board refresh. 20 | ## 21 | ## Not recommended unless memory usage is a real issue. Prefer using 22 | ## `rate_limit` (to throttle API calls), `inflight_posts` (to throttle MySQL 23 | ## load), or `concurrent_downloads` (to throttle network bandwidth). 24 | ## (Default: unlimited) 25 | #thread_concurrency = 2 26 | 27 | ## Global HTTP Timeout. Any request that takes longer than this 28 | ## is cancelled. Note that if Torako has very slow internet for some reason, 29 | ## this timeout will cancel any download that takes longer than 30 minutes. 30 | ## A large timeout is useful for cleaning up connections that may have stalled 31 | ## indefinitely. 32 | ## Leaving this unset will disable the timeout. 33 | request_timeout = "30m" 34 | 35 | ## Proxies. Provide a list of proxies that Torako should use for 36 | ## HTTP requests. (Default: None) 37 | #request_proxy = ["http://foo.prox:29292", "https://sfoo.prox:444", "socks5://sock.prox:4444"] 38 | 39 | ## Only use proxies. By default Torako will round robin through the available 40 | ## proxies provided *and* not using a proxy at all. Enable this to ensure 41 | ## that every request goes through a proxy. 42 | #request_only_proxy = false 43 | 44 | [boards] 45 | ## Enable TLS for API calls (Default: true) 46 | #tls = true 47 | 48 | ## Imageboard API hostname (Default: "a.4cdn.org") 49 | #host = "a.4cdn.org" 50 | 51 | ## Default refresh rate for boards (Default: 10s) 52 | #refresh_rate = "10s" 53 | 54 | ## Default deleted page threshold. If threads 55 | ## are removed before this threshold is reached 56 | ## then the thread is marked as (moderator) deleted 57 | ## (Default: 8) 58 | #deleted_page_threshold = 8 59 | 60 | ## Download thumbnails (Default: true) 61 | #download_thumbs = true 62 | 63 | ## Download media/images (Default: true) 64 | #download_media = true 65 | 66 | ## List the boards that should be archived 67 | [boards.po] 68 | ## Any of the above options can be provided 69 | ## here for board specific options 70 | [boards.vip] 71 | #[boards.f] 72 | ## The /f/ board uses the filename rather than the post time 73 | ## Warning: Enabling this will enable media downloads, however 74 | ## the standard Asagi schema did not take the longer filenames 75 | ## into account. You must update the `media` length on the 76 | ## `f_images` table to 191 and the `media_orig` length in the 77 | ## `f` table if you are starting from an older DB. 78 | #url_media_filename = true 79 | 80 | [backend] 81 | [backend.asagi] 82 | ## Download thumbnails. Disabling this will prevent the 83 | ## asagi storage backend from downloading thumbnails (Default: true) 84 | #thumbs = true 85 | 86 | ## Download media. Disabling this will prevent the 87 | ## asagi storage backend from downloading media (Default: true) 88 | #media = true 89 | 90 | ## Media content host URL.
(Default: "https://i.4cdn.org/") 91 | #media_url = "https://i.4cdn.org/" 92 | 93 | ## Thumb content host URL (Default: "https://i.4cdn.org/") 94 | #thumb_url = "https://i.4cdn.org/" 95 | 96 | ## Old Directory Structure. This is a legacy option in Asagi. (Default: false) 97 | #old_dir_structure = false 98 | 99 | ## Sha256 Directory Structure. Images will be stored in accordance to the 100 | ## sha256 hash of the image. (Default: false) 101 | #sha_dir_structure = false 102 | 103 | ## If we fail to save a thread or post to MySQL for whatever 104 | ## reason, we should consider this as a fatal error and 105 | ## stop Torako. The operator (or system) should restart Torako. 106 | ## Due to Torako's design is we fail to save a post/thread, then 107 | ## that thread could be lost if this is set to false. 108 | ## (Default: true) 109 | #fail_on_save_error = true 110 | 111 | ## This option denotes how many times saving the posts should be retried 112 | ## in case of some database error. If `fail_on_save_error` is true, 113 | ## and the number of retries is exceeded, then torako will crash. If 114 | ## `fail_on_save_error` is false, then the posts are dropped to the floor 115 | ## and not saved at all. If this is set to a high number and inflight_posts 116 | ## is set to a high number, errors that are persistent (ex. the database 117 | ## machine blew up) will cause Torako to buffer more and more posts in memory. 118 | #retries_on_save_error = 0 119 | 120 | ## Inflight Posts. This number denotes how many posts can be buffered in memory 121 | ## at a time before Asagi rejects accepting more posts. This acts as a form 122 | ## of backpressure and can be used to optimize memory usage or database activity 123 | ## Note that this is an *advisory* limit, and can be temporarily exceeded. 124 | ## (Ex. if the limit is 10, and we receieve a thread with 11 posts, we will 125 | ## buffer all 11 posts in memory, but no more posts will be archived until the 126 | ## 11 posts are persisted to the database) 127 | ## Default: unlimited 128 | #inflight_posts = 65536 129 | 130 | ## Concurrent Downloads. This number controls the amount of simultaneous media 131 | ## downloads that can happen at once. For the Asagi storage backend a single 132 | ## download might be both the thumbnail and media, so a concurrent_downloads 133 | ## value of "1", might start 2 downloads (one for the thumb and the media). 134 | ## Unlike `inflight_posts`, this value is not advisory and the number of 135 | ## concurrent downloads will ont exceed this number. 136 | ## Tune this number according to your memory and bandwidth preferences 137 | ## (Default: 128) 138 | #concurrent_downloads = 1024 139 | 140 | ## Setting this to true means that the media download queue will backpressure 141 | ## the system, and once all concurrent_download slots are full, no more new 142 | ## posts will be scraped until there is capacity to download more images. 143 | ## Turning this on will decrease memory pressure at the cost of pausing 144 | ## archiving if the download queue is full. The download queue can "fill up" 145 | ## for example when starting an archive of a fresh board and all the images 146 | ## must be downloaded. 147 | #media_backpressure = false 148 | 149 | # TMP dir. (Default: System Temp Path + /torako) 150 | #tmp_dir = "/tmp/torako" 151 | 152 | [backend.asagi.media_storage.filesystem] 153 | ## Media Path. This is where downloaded content is stored. If this is not 154 | ## provided then media downloading will be disabled. 
155 | media_path = "/var/lib/torako" 156 | 157 | ## Web UNIX Group. On Unix systems, if this is set, all downloads and download 158 | ## folders will have their group ownership set to this. (Default: None) 159 | #web_unix_group = "www-data" 160 | 161 | #[backend.asagi.media_storage.s3] 162 | ## S3 Access Key ID. 163 | #access_key_id = "0026149a" 164 | 165 | ## S3 Secret Access Key 166 | #secret_access_key = "secret" 167 | 168 | ## S3 Region 169 | #region = "us-east-2" 170 | 171 | ## S3 Endpoint. This can be used to support S3-compatible endpoints. 172 | #endpoint = "foobar" 173 | 174 | ## S3 Bucket. 175 | #bucket = "foo" 176 | 177 | ## S3 Object ACL. Leaving this blank will not set an ACL. 178 | #acl = "public-read" 179 | 180 | ## Set this to false if Torako should skip checking if the image already 181 | ## exists, and always redownload. (Default: true) 182 | #check_exists = false 183 | 184 | #[backend.asagi.media_storage.b2] 185 | ## Backblaze Native API Storage Engine 186 | 187 | ## Backblaze Application Key ID. This key will need read & write privileges 188 | ## to the bucket of your choosing. 189 | #application_key_id = "0026149a" 190 | 191 | ## Backblaze Application Key. 192 | #application_key = "secret" 193 | 194 | ## Backblaze Bucket ID. 195 | #bucket_id = "44444433333" 196 | 197 | ## Set this to false if Torako should skip checking if the image already 198 | ## exists, and always redownload. (Default: true) 199 | #check_exists = false 200 | 201 | #[backend.asagi.media_storage.b2.bloom] 202 | ## If we opt to check_exists, then every time we check 203 | ## for a file, we will incur a Class B charge with B2 at a rate of 204 | ## $0.004 per 10,000. We can minimize this by using a bloom filter. 205 | ## If the bloom filter returns negative then we know we don't have 206 | ## the image, and we will download it. 207 | 208 | ## How often the bloom filter is persisted to B2 209 | # upload_frequency = "5m" 210 | 211 | ## The filename to save the bloom filter under. 212 | # file_key = "torako.bloom" 213 | 214 | ## initial_bit_count = 100000000 215 | 216 | ##false_positive_rate = 0.05 217 | 218 | ## All the options for `backend.asagi.media_storage` can also be set for 219 | ## `backend.asagi.thumb_storage`, to store thumbnails in a different manner. 220 | ## By default thumbnails will be stored according to `media_storage`. 221 | #[backend.asagi.thumb_storage.filesystem] 222 | ## Media Path. This is where downloaded content is stored. If this is not 223 | ## provided then media downloading will be disabled. 224 | #media_path = "/var/lib/torako" 225 | 226 | # TMP dir. (Default: System Temp Path + /torako) 227 | #tmp_dir = "/tmp/torako" 228 | 229 | ## Web UNIX Group. On Unix systems, if this is set, all downloads and download 230 | ## folders will have their group ownership set to this. (Default: None) 231 | #web_unix_group = "www-data" 232 | 233 | [backend.asagi.database] 234 | ## Database URL. Usually in the format of 235 | ## mysql://username:password@host/db_name. 236 | ## Only MySQL is supported today. 237 | url = "mysql://torako@localhost/torako" 238 | 239 | ## Database Charset (Default: "utf8mb4") 240 | charset = "utf8mb4" 241 | 242 | ## In Asagi's original design, 2nd-level tables such as board_threads and other 243 | ## stats were handled via MySQL triggers. This may be problematic for high load 244 | ## databases. If `use_triggers` is set to false, the updates to child tables 245 | ## will be computed by Torako and flushed to MySQL in a single transaction.
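## For example, a trigger-less setup would pair `use_triggers = false` here with `compute_stats = true` below, so that Torako maintains the stats tables itself instead of relying on MySQL triggers.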
246 | ## WARNING: If you set `use_triggers` to false, Torako WILL DROP YOUR insert & 247 | ## update triggers. Running Torako again with `use_triggers` as true will 248 | ## recreate those triggers. 249 | ## (Default: true) 250 | use_triggers = true 251 | 252 | ## Compute stats in the board_daily and board_users tables. For Torako to handle 253 | ## this, you must set `use_triggers` to false as these stats are computed 254 | ## by Torako and flushed to MySQL. 255 | ## (Default: false) 256 | compute_stats = false 257 | 258 | ## MySQL Database Engine. Use this database engine for all tables that 259 | ## Torako creates. 260 | ## (Default: InnoDB) 261 | #mysql_engine = "InnoDB" 262 | 263 | ## The original Asagi column definitions for the `email`, `name`, and `title` 264 | ## fields are limited to 100 characters. By default Torako will truncate 265 | ## them. 266 | #truncate_fields = false 267 | 268 | ## You may experience issues with the TIMESTAMP field in the boards column 269 | ## if your local MySQL install's timezone is not set to UTC. It is good 270 | ## practice to ensure your database install's timezone is UTC, but if it's 271 | ## not, certain timestamps may cause Torako to crash if MySQL isn't set to UTC. 272 | ## 273 | ## Torako will always try to use UTC where possible (other than in situations 274 | ## where backwards compat requires something different), however for databases 275 | ## like MySQL, UTC mode must be set explicitly. This can be safely disabled 276 | ## if your database is already in UTC. 277 | ## (Default: true) 278 | #sql_set_utc = true 279 | 280 | [backend.asagi_pg_search] 281 | disabled = true 282 | 283 | ## Database URL. In the format of 284 | ## postgresql://username:password@host/db_name. 285 | database_url = "postgres://localhost/" 286 | 287 | ## See the option description under backend.asagi 288 | ## Default: true 289 | #fail_on_save_error = true 290 | 291 | ## Inflight Posts. See the option description under backend.asagi 292 | ## Default: unlimited 293 | #inflight_posts = 65536 294 | 295 | ## See the option description under backend.asagi 296 | ## Default: 0 297 | #retries_on_save_error = 0 298 | 299 | [backend.asagi_lnx_search] 300 | disabled = true 301 | 302 | ## Database URL. 303 | database_url = "http://localhost:8000/" 304 | 305 | ## The name of the index that should be used. 306 | index = "posts" 307 | 308 | ## See the option description under backend.asagi 309 | ## Default: true 310 | #fail_on_save_error = true 311 | 312 | ## Inflight Posts. See the option description under backend.asagi 313 | ## Default: unlimited 314 | #inflight_posts = 65536 315 | 316 | ## Maximum Concurrent Requests. This will throttle requests if there are too many 317 | ## concurrent requests. 318 | ## Default: unlimited 319 | #concurrent_requests = 65536 320 | 321 | ## See the option description under backend.asagi 322 | ## Default: 0 323 | #retries_on_save_error = 0 324 | 325 | ## Authentication Key 326 | #authentication_key = "" 327 | 328 | ## Commit Sync Interval 329 | ## Commit posts to the index in this interval. Posts must be committed to show up 330 | ## in searches. 331 | ## Default: 5s 332 | #commit_sync_interval = "5s" 333 | 334 | ## Request Timeout 335 | ## Request timeout to the database 336 | #request_timeout = "120s" 337 | 338 | # --- 339 | 340 | ## You can set custom storage backends for specific 341 | ## boards by using this format. This mirrors the options available in 342 | ## `backend.asagi.media_storage`.
Note that specifying a board-specific storage 343 | ## engine will completely override global options (ex. if globally you store 344 | ## files on disk, but on /gif/ you opt to use S3, then /gif/ will not store 345 | ## files on disk). 346 | ## 347 | ## The storage configuration hierarchy is as follows: 348 | ## Images: [backend.asagi.boards.{board}.media_storage] > [backend.asagi.media_storage] 349 | ## Thumbs: [backend.asagi.boards.{board}.thumb_storage] > [backend.asagi.thumb_storage] > [backend.asagi.boards.{board}.media_storage] > [backend.asagi.media_storage] 350 | #[backend.asagi.boards.gif.media_storage.s3] 351 | ## S3 Access Key ID. 352 | #access_key_id = "0026149a" 353 | 354 | ## S3 Secret Access Key 355 | #secret_access_key = "secret" 356 | 357 | ## S3 Region 358 | #region = "us-east-2" 359 | 360 | ## S3 Endpoint. This can be used to support S3-compatible endpoints. 361 | #endpoint = "foobar" 362 | 363 | ## S3 Bucket. 364 | #bucket = "foo" 365 | 366 | ## S3 Object ACL. Leaving this blank will not set an ACL. 367 | #acl = "public-read" 368 | 369 | ## Set this to false if Torako should skip checking if the image already 370 | ## exists, and always redownload. (Default: true) 371 | #check_exists = false 372 | 373 | ## Likewise board-specific thumbnail storage can be specified. 374 | #[backend.asagi.boards.gif.thumb_storage.s3] 375 | ## S3 Access Key ID. 376 | #access_key_id = "0026149a" 377 | 378 | ## S3 Secret Access Key 379 | #secret_access_key = "secret" 380 | 381 | ## S3 Region 382 | #region = "us-east-2" 383 | 384 | ## S3 Endpoint. This can be used to support S3-compatible endpoints. 385 | #endpoint = "foobar" 386 | 387 | ## S3 Bucket. 388 | #bucket = "foo" 389 | 390 | ## S3 Object ACL. Leaving this blank will not set an ACL. 391 | #acl = "public-read" 392 | 393 | ## Set this to false if Torako should skip checking if the image already 394 | ## exists, and always redownload.
(Default: true) 395 | #check_exists = false -------------------------------------------------------------------------------- /src/util/lnx/reindex.rs: -------------------------------------------------------------------------------- 1 | use std::iter::IntoIterator; 2 | use std::sync::{ 3 | atomic::{AtomicBool, Ordering}, 4 | Arc, 5 | }; 6 | 7 | use clap::ArgMatches; 8 | use futures::prelude::*; 9 | use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; 10 | use log::{error, info}; 11 | use thiserror::Error; 12 | 13 | #[derive(Debug, Error)] 14 | pub enum Error { 15 | #[error("pg database error: {}", .0)] 16 | DB(reqwest::Error), 17 | #[error("io error: {}", .0)] 18 | IO(#[from] std::io::Error), 19 | #[error("mysql error: {}", .0)] 20 | MySQL(#[from] sqlx::Error), 21 | } 22 | 23 | #[derive(sqlx::FromRow, Debug)] 24 | struct BoardInfo { 25 | board: String, 26 | records: i64, 27 | } 28 | 29 | #[derive(sqlx::FromRow, Debug)] 30 | struct Post { 31 | board: String, 32 | thread_num: u32, 33 | num: u32, 34 | title: Option, 35 | name: Option, 36 | trip: Option, 37 | email: Option, 38 | poster_hash: Option, 39 | poster_country: Option, 40 | media_filename: Option, 41 | media_hash: Option, 42 | media_w: Option, 43 | media_h: Option, 44 | timestamp: i64, 45 | comment: Option, 46 | deleted: bool, 47 | sticky: bool, 48 | spoiler: bool, 49 | op: bool, 50 | capcode: Option, 51 | } 52 | 53 | impl<'a> Into> for &'a Post { 54 | fn into(self) -> super::post::Post<'a> { 55 | let upper = { 56 | let bytes = self.board.as_bytes(); 57 | let bytes = [bytes.get(0).copied().unwrap_or(0), bytes.get(1).copied().unwrap_or(0), bytes.get(2).copied().unwrap_or(0), bytes.get(3).copied().unwrap_or(0)]; 58 | super::post::as_u32_be(&bytes) 59 | }; 60 | let lower = self.num as u32; 61 | let tuid = (upper as u64) << 32 | (lower as u64); 62 | let version = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) { 63 | Ok(n) => n.as_millis() as u64, 64 | Err(_) => panic!("SystemTime before UNIX EPOCH!"), 65 | }; 66 | super::post::Post { 67 | board: self.board.as_str(), 68 | thread_no: self.thread_num as _, 69 | post_no: self.num as _, 70 | subject: self.title.as_ref().map(|x| &**x), 71 | username: self.name.as_ref().map(|x| &**x), 72 | tripcode: self.trip.as_ref().map(|x| &**x), 73 | email: self.email.as_ref().map(|x| &**x), 74 | unique_id: self.poster_hash.as_ref().map(|x| &**x), 75 | since4_pass: None, 76 | country: self.poster_country.as_ref().map(|x| &**x), 77 | filename: self.media_filename.as_ref().map(|x| &**x), 78 | image_hash: self.media_hash.as_ref().map(|x| &**x), 79 | image_width: self.media_w.unwrap_or(0) as _, 80 | image_height: self.media_h.unwrap_or(0) as _, 81 | ts: self.timestamp as _, 82 | tsr: u64::MAX - (self.timestamp as u64), 83 | comment: self.comment.as_ref().map(|x| &**x), 84 | deleted: if self.deleted { 1 } else { 0 }, 85 | ghost: 0, 86 | sticky: if self.sticky { 1 } else { 0 }, 87 | spoiler: if self.spoiler { 1 } else { 0 }, 88 | op: if self.op { 1 } else { 0 }, 89 | capcode: self.capcode.as_ref() 90 | .map(|c| { 91 | c.chars() 92 | .filter(char::is_ascii) 93 | .next() 94 | .map(|c| c as u64) 95 | }) 96 | .flatten(), 97 | tuid, 98 | version, 99 | } 100 | } 101 | } 102 | 103 | struct LnxReIndex { 104 | client: reqwest::Client, 105 | upload_url: url::Url, 106 | commit_url: url::Url, 107 | mysql: sqlx::MySqlPool, 108 | tables: Vec, 109 | write_streams: usize, 110 | } 111 | 112 | fn lookup_query>(board: T) -> String { 113 | format!( 114 | r"SELECT 115 | '{}' AS board, 116 | 
`thread_num`, 117 | `num`, 118 | `title`, 119 | `name`, 120 | `trip`, 121 | `email`, 122 | `poster_hash`, 123 | `poster_country`, 124 | `media_filename`, 125 | `media_hash`, 126 | `media_w`, 127 | `media_h`, 128 | COALESCE(UNIX_TIMESTAMP(`unix_timestamp`), `timestamp`) AS `timestamp`, 129 | `comment`, 130 | `deleted`, 131 | `sticky`, 132 | `spoiler`, 133 | `op`, 134 | `capcode` 135 | FROM 136 | `{}`", 137 | board.as_ref(), 138 | board.as_ref() 139 | ) 140 | } 141 | 142 | impl LnxReIndex { 143 | async fn new, V: AsRef, T: IntoIterator>( 144 | upload_url: url::Url, 145 | commit_url: url::Url, 146 | authentication_key: V, 147 | request_timeout: Option, 148 | source_url: url::Url, 149 | tables: T, 150 | write_streams: usize, 151 | ) -> Result { 152 | 153 | let mut headers = reqwest::header::HeaderMap::new(); 154 | if !authentication_key.as_ref().is_empty() { 155 | headers.insert( 156 | "Authorization", 157 | reqwest::header::HeaderValue::from_str(&format!( 158 | "Bearer {}", 159 | authentication_key.as_ref() 160 | )) 161 | .unwrap(), 162 | ); 163 | } 164 | let client = reqwest::Client::builder() 165 | .default_headers(headers) 166 | .timeout(request_timeout.unwrap_or(std::time::Duration::from_secs(u64::MAX))) 167 | .build() 168 | .unwrap(); 169 | 170 | info!( 171 | "Connecting to MySQL at {}...", 172 | source_url.host_str().unwrap() 173 | ); 174 | let mysql_pool = sqlx::MySqlPool::connect(&source_url.to_string()).await?; 175 | 176 | Ok(Self { 177 | client, 178 | upload_url, 179 | commit_url, 180 | mysql: mysql_pool, 181 | tables: tables 182 | .into_iter() 183 | .map(|x| String::from(x.as_ref())) 184 | .collect(), 185 | write_streams, 186 | }) 187 | } 188 | 189 | pub async fn build(self, commit_interval: Option) -> Result<(), Error> { 190 | let rwlock = Arc::new(tokio::sync::RwLock::new(())); 191 | let boards = self.tables; 192 | let mysql_pool = self.mysql; 193 | let write_streams = self.write_streams; 194 | 195 | if let Some(commit_interval) = commit_interval { 196 | let rwlock = rwlock.clone(); 197 | let client = self.client.clone(); 198 | let commit_url = self.commit_url.clone(); 199 | tokio::spawn(async move { 200 | loop { 201 | tokio::time::delay_for(commit_interval).await; 202 | let t = rwlock.write().await; 203 | let r = client 204 | .post(commit_url.clone()) 205 | .send() 206 | .and_then(|resp| futures::future::ready(resp.error_for_status())) 207 | .await; 208 | drop(t); 209 | if let Err(err) = r { 210 | log::warn!("Failed to commit lnx search index: {}", err); 211 | } 212 | } 213 | }); 214 | } 215 | let m = Arc::new(MultiProgress::new()); 216 | let sty = ProgressStyle::default_bar() 217 | .template("{spinner:.green} {msg} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos:>7}/{len:7} ({per_sec}, {eta})") 218 | .progress_chars("#>-"); 219 | let joined = AtomicBool::new(false); 220 | info!("Querying table information..."); 221 | let boards = stream::iter(boards) 222 | .then(|board| { 223 | mysql_pool.acquire().map(move |conn| match conn { 224 | Ok(c) => Ok((board, c)), 225 | Err(err) => Err(Error::from(err)), 226 | }) 227 | }) 228 | .and_then(|(board, mut conn)| async move { 229 | info!("\t{}...", board); 230 | let info = sqlx::query_as::<_, BoardInfo>(&format!( 231 | "SELECT '{}' AS board, COUNT(1) AS records FROM `{}`", 232 | board, board 233 | )) 234 | .fetch_one(&mut conn) 235 | .await?; 236 | Ok(info) 237 | }) 238 | .map_ok(|info| { 239 | let pb = m.add(ProgressBar::new(info.records as u64)); 240 | pb.set_style(sty.clone()); 241 | pb.set_message(info.board.as_str()); 242 | 
pb.set_position(0); 243 | (info, pb) 244 | }) 245 | .try_collect::>() 246 | .await?; 247 | 248 | info!("Starting transfer..."); 249 | let client = self.client.clone(); 250 | let upload_url = self.upload_url.clone(); 251 | let rows = stream::iter(boards) 252 | .then(|(info, pb)| { 253 | mysql_pool.acquire().map(move |conn| match conn { 254 | Ok(c) => Ok((info, c, pb)), 255 | Err(err) => Err(Error::from(err)), 256 | }) 257 | }) 258 | .and_then(|(info, mut conn, pb)| { 259 | if !joined.fetch_or(true, Ordering::AcqRel) { 260 | let m = m.clone(); 261 | tokio::spawn(tokio::task::spawn_blocking(move || { 262 | // do some compute-heavy work or call synchronous code 263 | m.join() 264 | })); 265 | } 266 | let lookup = lookup_query(&info.board); 267 | let client = client.clone(); 268 | let upload_url = upload_url.clone(); 269 | let rwlock = rwlock.clone(); 270 | async move { 271 | sqlx::query_as::<_, Post>(&lookup) 272 | .fetch(&mut conn) 273 | .chunks(1280) 274 | .map(|posts| { 275 | posts 276 | .into_iter() 277 | .collect::, _>>() 278 | .map_err(|err| Error::from(err)) 279 | }) 280 | .map_ok(|posts| { 281 | rwlock.read().map(move |permit| (permit, posts)).then(|(permit, posts)| { 282 | let rows = posts.len(); 283 | let delete_posts = { 284 | let posts = posts.iter().map(|p| p.into()).collect::>(); 285 | let field = posts.iter().map(|x| x.tuid).collect::>(); 286 | super::post::DeletePost::new(field) 287 | }; 288 | client 289 | .delete(upload_url.clone()) 290 | .json(&delete_posts) 291 | .send() 292 | .and_then(|resp| futures::future::ready(resp.error_for_status())) 293 | .map_ok(move |_| posts) 294 | .and_then(|posts| { 295 | let posts = posts.iter().map(|p| p.into()).collect::>(); 296 | client 297 | .post(upload_url.clone()) 298 | .json(&posts) 299 | .send() 300 | .and_then(|resp| futures::future::ready(resp.error_for_status())) 301 | }) 302 | .map_ok(move |_| rows) 303 | .map_err(|err| Error::DB(err)) 304 | .inspect(move |_| drop(permit)) 305 | }) 306 | }) 307 | .try_buffer_unordered(write_streams) 308 | .try_fold(0, |acc, rows| { 309 | pb.inc(rows as u64); 310 | futures::future::ready(Ok(acc + rows)) 311 | }) 312 | .inspect(|r| { 313 | if r.is_ok() { 314 | pb.finish_with_message(&info.board); 315 | } else { 316 | pb.finish_at_current_pos(); 317 | } 318 | }) 319 | .await 320 | } 321 | }) 322 | .try_fold(0, |acc, rows| futures::future::ready(Ok(acc + rows))) 323 | .await?; 324 | 325 | info!("Finished. 
Modified {} rows.", rows); 326 | Ok(()) 327 | } 328 | } 329 | 330 | pub fn reindex<'a>(matches: &ArgMatches<'a>) -> i32 { 331 | info!("Running lnx-search re-indexer"); 332 | 333 | let lnx_url: url::Url = match matches.value_of("lnx").unwrap().parse() { 334 | Ok(c) => c, 335 | Err(err) => { 336 | error!("Failed to parse postgres uri: {}", err); 337 | return 1; 338 | } 339 | }; 340 | let lnx_index = match matches.value_of("index") { 341 | Some(c) => String::from(c), 342 | None => { 343 | error!("Invalid index."); 344 | return 1; 345 | } 346 | }; 347 | let lnx_key = match matches.value_of("authentication-key") { 348 | Some(c) => String::from(c), 349 | None => String::from("") 350 | }; 351 | let mysql_url: url::Url = match matches.value_of("mysql").unwrap().parse() { 352 | Ok(c) => c, 353 | Err(err) => { 354 | error!("Failed to parse mysql uri: {}", err); 355 | return 1; 356 | } 357 | }; 358 | let commit_interval: usize = match matches.value_of("commit-interval").unwrap().parse() { 359 | Ok(c) => c, 360 | Err(err) => { 361 | error!("Failed to parse commiut url uri: {}", err); 362 | return 1; 363 | } 364 | }; 365 | let commit_interval = match commit_interval { 366 | 0 => None, 367 | c => Some(std::time::Duration::from_secs(c as _ )) 368 | }; 369 | let request_timeout: usize = match matches.value_of("request-timeout").unwrap().parse() { 370 | Ok(c) => c, 371 | Err(err) => { 372 | error!("Failed to parse commiut url uri: {}", err); 373 | return 1; 374 | } 375 | }; 376 | let request_timeout = match request_timeout { 377 | 0 => None, 378 | c => Some(std::time::Duration::from_secs(c as _ )) 379 | }; 380 | 381 | let write_streams: usize = match matches.value_of("write-streams").unwrap().parse() { 382 | Ok(0) => { 383 | error!("Invalid number 0 for write-streams"); 384 | return 1; 385 | } 386 | Ok(c) => c, 387 | Err(err) => { 388 | error!("Failed to parse write-streams: {}", err); 389 | return 1; 390 | } 391 | }; 392 | 393 | let boards = matches 394 | .values_of("boards") 395 | .unwrap() 396 | .map(|x| String::from(x)) 397 | .collect::>(); 398 | 399 | info!("Importing Boards: \"{}\"", boards.join("\",\"")); 400 | 401 | let mut runtime = tokio::runtime::Builder::new() 402 | .threaded_scheduler() 403 | .enable_all() 404 | .thread_name("torako") 405 | .build() 406 | .unwrap(); 407 | 408 | let mut upload_url = lnx_url.clone(); 409 | upload_url.set_path(&format!("/indexes/{}/documents", &lnx_index)); 410 | upload_url.set_query(Some("wait=true")); 411 | 412 | let mut commit_url = lnx_url.clone(); 413 | commit_url.set_path(&format!("/indexes/{}/commit", &lnx_index)); 414 | 415 | let r = LnxReIndex::new(upload_url, commit_url, lnx_key, request_timeout, mysql_url, boards, write_streams); 416 | let r = runtime.block_on(r.and_then(|r| r.build(commit_interval))); 417 | 418 | match r { 419 | Ok(_) => 0, 420 | Err(err) => { 421 | error!("Reindexing failed: {}", err); 422 | 1 423 | } 424 | } 425 | } 426 | -------------------------------------------------------------------------------- /src/imageboard/model.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::hash::{Hash, Hasher}; 3 | use std::time::SystemTime; 4 | 5 | use chrono::{NaiveDateTime, TimeZone}; 6 | use chrono_tz::America::New_York; 7 | use lazy_static::lazy_static; 8 | use regex::{Regex, RegexBuilder}; 9 | use serde::de::{self, Deserializer, Unexpected}; 10 | use serde::{Deserialize, Serialize}; 11 | 12 | #[derive(Debug, Deserialize, Clone)] 13 | pub struct CatalogPage { 14 | 
pub page: usize, 15 | pub threads: Vec, 16 | } 17 | 18 | #[derive(Debug, Deserialize, Clone)] 19 | pub struct CatalogThread { 20 | #[serde(default, skip)] 21 | pub page: usize, 22 | #[serde(default, skip)] 23 | pub board: &'static str, 24 | pub no: u64, 25 | pub resto: u64, 26 | #[serde(default, deserialize_with = "bool_from_int")] 27 | pub sticky: bool, 28 | #[serde(default, deserialize_with = "bool_from_int")] 29 | pub closed: bool, 30 | pub now: String, 31 | pub time: i64, 32 | pub name: Option, 33 | pub trip: Option, 34 | pub id: Option, 35 | pub capcode: Option, 36 | pub country: Option, 37 | pub sub: Option, 38 | pub com: Option, 39 | pub tim: Option, 40 | pub filename: Option, 41 | pub ext: Option, 42 | pub fsize: Option, 43 | pub md5: Option, 44 | pub w: Option, 45 | pub h: Option, 46 | pub tn_w: Option, 47 | pub tn_h: Option, 48 | #[serde(default, deserialize_with = "bool_from_int")] 49 | pub file_deleted: bool, 50 | #[serde(default, deserialize_with = "bool_from_int")] 51 | pub spoiler: bool, 52 | #[serde(default)] 53 | pub custom_spoiler: u8, 54 | #[serde(default)] 55 | pub omitted_posts: u16, 56 | #[serde(default)] 57 | pub omitted_images: u16, 58 | #[serde(default)] 59 | pub replies: u16, 60 | #[serde(default)] 61 | pub images: u16, 62 | #[serde(default, deserialize_with = "bool_from_int")] 63 | pub bumplimit: bool, 64 | #[serde(default, deserialize_with = "bool_from_int")] 65 | pub imagelimit: bool, 66 | pub last_modified: u64, 67 | pub tag: Option, 68 | pub semantic_url: Option, 69 | pub since4pass: Option, 70 | #[serde(default)] 71 | pub unique_ips: u32, 72 | #[serde(default, deserialize_with = "bool_from_int")] 73 | pub m_img: bool, 74 | #[serde(default)] 75 | pub last_replies: Vec, 76 | } 77 | 78 | #[derive(Default, Debug, Deserialize, Clone)] 79 | pub struct Thread { 80 | pub posts: Vec, 81 | } 82 | 83 | #[derive(Default, Debug, Deserialize, Clone)] 84 | pub struct Post { 85 | #[serde(default, skip)] 86 | pub board: &'static str, 87 | pub no: u64, 88 | pub resto: u64, 89 | #[serde(default, deserialize_with = "bool_from_int")] 90 | pub sticky: bool, 91 | #[serde(default, deserialize_with = "bool_from_int")] 92 | pub closed: bool, 93 | pub now: String, 94 | pub time: u64, 95 | pub name: Option, 96 | pub email: Option, 97 | pub trip: Option, 98 | pub id: Option, 99 | pub capcode: Option, 100 | pub country: Option, 101 | pub country_name: Option, 102 | pub troll_country: Option, // ? 
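// (presumably the meme-flag country code sent when a poster uses a troll flag; unverified, hence the trailing `?`)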
103 | pub sub: Option, 104 | pub com: Option, 105 | pub tim: Option, 106 | pub filename: Option, 107 | pub ext: Option, 108 | #[serde(default)] 109 | pub fsize: u64, 110 | pub md5: Option, 111 | #[serde(default)] 112 | pub w: i32, 113 | #[serde(default)] 114 | pub h: i32, 115 | #[serde(default)] 116 | pub tn_w: u32, 117 | #[serde(default)] 118 | pub tn_h: u32, 119 | #[serde(default, deserialize_with = "bool_from_int")] 120 | pub file_deleted: bool, 121 | #[serde(default, deserialize_with = "bool_from_int")] 122 | pub spoiler: bool, 123 | #[serde(default)] 124 | pub custom_spoiler: u8, 125 | #[serde(default)] 126 | pub replies: u16, 127 | #[serde(default)] 128 | pub images: u16, 129 | #[serde(default, deserialize_with = "bool_from_int")] 130 | pub bumplimit: bool, 131 | #[serde(default, deserialize_with = "bool_from_int")] 132 | pub imagelimit: bool, 133 | pub tag: Option, 134 | pub semantic_url: Option, 135 | pub since4pass: Option, 136 | #[serde(default)] 137 | pub unique_ips: u32, 138 | #[serde(default, deserialize_with = "bool_from_int")] 139 | pub m_img: bool, 140 | #[serde(default, deserialize_with = "bool_from_int")] 141 | pub archived: bool, 142 | pub archived_on: Option, 143 | #[serde(default, skip)] 144 | pub deleted: bool, 145 | #[serde(default, skip)] 146 | pub deleted_at: Option, 147 | #[serde(default, skip)] 148 | pub is_retransmission: bool, 149 | #[serde(default, skip)] 150 | pub url_media_filename: bool, 151 | } 152 | 153 | impl Post { 154 | pub fn is_op(&self) -> bool { 155 | self.resto == 0 156 | } 157 | 158 | pub fn thread_no(&self) -> u64 { 159 | if self.resto == 0 { 160 | self.no 161 | } else { 162 | self.resto 163 | } 164 | } 165 | 166 | pub fn nyc_timestamp(&self) -> u64 { 167 | let ny_time = chrono::Utc 168 | .timestamp(self.time as i64, 0) 169 | .with_timezone(&New_York); 170 | ny_time.naive_local().timestamp().max(0) as u64 171 | } 172 | 173 | pub fn datetime(&self) -> Option { 174 | // deleted posts will have a time of 0 175 | match self.time { 176 | 0 => None, 177 | _ => Some(NaiveDateTime::from_timestamp(self.time as i64, 0)), 178 | } 179 | } 180 | 181 | pub fn preview_orig(&self) -> Option { 182 | self.tim.as_ref().map(|t| format!("{}s.jpg", t)) 183 | } 184 | 185 | pub fn media_filename(&self) -> Option { 186 | self.filename 187 | .as_ref() 188 | .map(|x| x.to_string() + self.ext.as_ref().unwrap()) 189 | } 190 | 191 | pub fn media_filename_decoded(&self) -> Option { 192 | self.filename.as_ref().map(|x| { 193 | htmlescape::decode_html(x).unwrap_or_else(|_| x.to_string()) 194 | + self.ext.as_ref().unwrap() 195 | }) 196 | } 197 | 198 | pub fn media_orig(&self) -> Option { 199 | if self.url_media_filename { 200 | return self.media_filename(); 201 | } 202 | self.tim 203 | .as_ref() 204 | .zip(self.ext.as_ref()) 205 | .map(|(tim, ext)| format!("{}{}", tim, ext)) 206 | } 207 | 208 | pub fn short_capcode(&self) -> String { 209 | self.capcode 210 | .as_ref() 211 | .map(|c| match c.to_ascii_lowercase().as_str() { 212 | "manager" => String::from("M"), 213 | _ => c.chars().next().unwrap().to_ascii_uppercase().to_string(), 214 | }) 215 | .unwrap_or(String::from("N")) 216 | } 217 | 218 | pub fn poster_name(&self) -> Option { 219 | self.name.as_ref().map(|x| Self::clean_simple(x)) 220 | } 221 | 222 | pub fn poster_nametrip(&self) -> Option { 223 | match (self.name.as_ref(), self.trip.as_ref()) { 224 | (Some(name), Some(trip)) => Some(format!("{}{}", name, trip)), 225 | (Some(name), _) => Some(name.clone()), 226 | (_, Some(trip)) => Some(trip.clone()), 227 | _ => None, 228 | 
} 229 | } 230 | 231 | pub fn title(&self) -> Option<String> { 232 | self.sub.as_ref().map(|x| Self::clean_simple(x)) 233 | } 234 | 235 | pub fn poster_hash(&self) -> Option<String> { 236 | self.id.as_ref().map(|x| { 237 | if x.as_str() == "Developer" { 238 | String::from("Dev") 239 | } else { 240 | x.to_string() 241 | } 242 | }) 243 | } 244 | 245 | pub fn poster_country(&self) -> Option<String> { 246 | match &self.country { 247 | Some(country) => match country.as_ref() { 248 | "XX" | "A1" => None, 249 | _ => Some(country.clone()), 250 | }, 251 | None => None, 252 | } 253 | } 254 | 255 | pub fn comment(&self) -> Option<String> { 256 | self.com.as_ref().map(|x| Post::clean_heavy(x.as_str())) 257 | } 258 | 259 | pub fn comment_hash(&self) -> u64 { 260 | match self.com.as_ref() { 261 | Some(t) => { 262 | let mut s = seahash::SeaHasher::default(); 263 | t.hash(&mut s); 264 | s.finish() 265 | } 266 | None => 0, 267 | } 268 | } 269 | 270 | pub fn has_thumb(&self) -> bool { 271 | self.tn_h > 0 && self.tn_w > 0 272 | } 273 | 274 | pub fn has_media(&self) -> bool { 275 | self.filename.is_some() 276 | } 277 | 278 | pub fn exif(&self) -> Option<String> { 279 | let exif = Exif::parse(&self); 280 | if exif.is_empty() { 281 | None 282 | } else { 283 | Some(serde_json::to_string(&exif).unwrap()) 284 | } 285 | } 286 | 287 | pub fn clean_simple<T: AsRef<str>>(text: T) -> String { 288 | lazy_static! { 289 | static ref RE_PIPELINE: [(Regex, &'static str); 6] = [ 290 | (Regex::new("&gt;").unwrap(), ">"), 291 | (Regex::new("&lt;").unwrap(), "<"), 292 | (Regex::new("&quot;").unwrap(), "\""), 293 | (Regex::new("&amp;").unwrap(), "&"), 294 | (Regex::new("\\s*$").unwrap(), ""), 295 | (Regex::new("^\\s*$").unwrap(), ""), 296 | ]; 297 | } 298 | match htmlescape::decode_html(text.as_ref()) { 299 | Ok(text) => { 300 | let mut ret = text; 301 | for (patt, repl) in RE_PIPELINE[4..].iter() { 302 | match patt.replace_all(&ret, *repl) { 303 | std::borrow::Cow::Owned(s) => ret = s, 304 | std::borrow::Cow::Borrowed(_) => (), 305 | }; 306 | } 307 | ret 308 | } 309 | Err(err) => { 310 | log::warn!( 311 | "error decoding html entities (will do regex escaping): '{:?}':\n'''\n{}\n'''", 312 | err, 313 | text.as_ref() 314 | ); 315 | let mut ret = text.as_ref().to_owned(); 316 | for (patt, repl) in RE_PIPELINE.iter() { 317 | match patt.replace_all(&ret, *repl) { 318 | std::borrow::Cow::Owned(s) => ret = s, 319 | std::borrow::Cow::Borrowed(_) => (), 320 | }; 321 | } 322 | ret 323 | } 324 | } 325 | } 326 | 327 | pub fn clean_heavy<T: AsRef<str>>(text: T) -> String { 328 | lazy_static! { 329 | static ref RE_PIPELINE: [(Regex, &'static str); 23] = [ 330 | // Admin-Mod-Dev quotelinks 331 | (Regex::new("(?:Administrator|Moderator|Developer) Repl(?:y|ies):.*?
").unwrap(), ""), 332 | // Non-public tags 333 | (Regex::new("\\[(/?(banned|moot|spoiler|code))]").unwrap(), "[$1:lit]"), 334 | // Comment too long, also EXIF tag toggle 335 | (Regex::new(".*?").unwrap(), ""), 336 | // EXIF data 337 | (Regex::new("]*>.*?
").unwrap(), ""), 338 | // DRAW data 339 | (Regex::new("

Oekaki Post.*?").unwrap(), ""), 340 | // Banned/Warned text 341 | (Regex::new("<(?:b|strong) style=\"color:\\s*red;\">(.*?)").unwrap(), "[banned]$1[/banned]"), 342 | // moot text 343 | (Regex::new("
(.*?)
").unwrap(), "[moot]$1[/moot]"), 344 | // fortune text 345 | (Regex::new("

(.*?)
").unwrap(), "\n\n[fortune color=\"$1\"]$2[/fortune]"), 346 | // bold text 347 | (Regex::new("<(?:b|strong)>(.*?)").unwrap(), "[b]$1[/b]"), 348 | // code tags 349 | (Regex::new("]*>").unwrap(), "[code]"), 350 | (Regex::new("").unwrap(), "[/code]"), 351 | // math tags 352 | (Regex::new("(.*?)").unwrap(), "[math]$1[/math]"), 353 | (Regex::new("
(.*?)
").unwrap(), "[eqn]$1[/eqn]"), 354 | // > implying I'm quoting someone 355 | (Regex::new("(.*?)").unwrap(), "$1"), 356 | (Regex::new("(.*?)").unwrap(), "$1"), 357 | (Regex::new("(.*?)").unwrap(), "$1"), 358 | // Links 359 | (Regex::new("]*>(.*?)").unwrap(), "$1"), 360 | // old spoilers 361 | (Regex::new("]*>(.*?)").unwrap(), "[spoiler]$1[/spoiler]"), 362 | // ShiftJIS 363 | (Regex::new("(.*?)").unwrap(), "[shiftjis]$1[/shiftjis]"), 364 | // new spoilers 365 | (Regex::new("").unwrap(), "[spoiler]"), 366 | (Regex::new("").unwrap(), "[/spoiler]"), 367 | // new line/wbr 368 | (Regex::new("").unwrap(), "\n"), 369 | (Regex::new("").unwrap(), "") 370 | ]; 371 | } 372 | 373 | let mut ret = String::from(text.as_ref()); 374 | for (patt, repl) in RE_PIPELINE.iter() { 375 | match patt.replace_all(&ret, *repl) { 376 | std::borrow::Cow::Owned(s) => ret = s, 377 | std::borrow::Cow::Borrowed(_) => (), 378 | }; 379 | } 380 | 381 | Self::clean_simple(ret) 382 | } 383 | 384 | pub fn deleted(board: &'static str, no: u64) -> Self { 385 | Self { 386 | board, 387 | no, 388 | is_retransmission: true, 389 | deleted: true, 390 | deleted_at: match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) { 391 | Ok(n) => { 392 | let ny_time = chrono::Utc 393 | .timestamp(n.as_secs() as i64, 0) 394 | .with_timezone(&New_York); 395 | Some(ny_time.naive_local().timestamp() as u64) 396 | } 397 | Err(_) => panic!("SystemTime before UNIX EPOCH!"), 398 | }, 399 | ..Default::default() 400 | } 401 | } 402 | } 403 | 404 | #[derive(Debug, Serialize, Clone, Default)] 405 | pub struct Exif { 406 | #[serde(rename = "uniqueIps", skip_serializing_if = "Option::is_none")] 407 | unique_ips: Option, 408 | #[serde(skip_serializing_if = "Option::is_none")] 409 | since4pass: Option, 410 | #[serde(rename = "trollCountry", skip_serializing_if = "Option::is_none")] 411 | troll_country: Option, 412 | #[serde(rename = "Time", skip_serializing_if = "Option::is_none")] 413 | time: Option, 414 | #[serde(rename = "Painter", skip_serializing_if = "Option::is_none")] 415 | painter: Option, 416 | #[serde(rename = "Source", skip_serializing_if = "Option::is_none")] 417 | source: Option, 418 | 419 | #[serde(flatten, skip_serializing_if = "HashMap::is_empty")] 420 | exif_data: HashMap, 421 | } 422 | 423 | impl Exif { 424 | fn parse(post: &Post) -> Self { 425 | lazy_static! { 426 | static ref DRAW_RE: Regex = RegexBuilder::new("Oekaki \\s Post \\s \\(Time: \\s (.*?), \\s Painter: \\s (.*?)(?:, \\s Source: \\s (.*?))?(?:, \\s Animation: \\s (.*?))?\\)").dot_matches_new_line(true).ignore_whitespace(true).build().unwrap(); 427 | static ref EXIF_RE: Regex = RegexBuilder::new("]*>(.*)
").dot_matches_new_line(true).ignore_whitespace(true).build().unwrap(); 428 | static ref EXIF_DATA_RE: Regex = RegexBuilder::new("(.*?)(.*?)").dot_matches_new_line(true).ignore_whitespace(true).build().unwrap(); 429 | } 430 | 431 | let mut exif_data = HashMap::new(); 432 | let mut time = None; 433 | let mut painter = None; 434 | let mut source = None; 435 | if let Some(text) = post.com.as_ref() { 436 | if let Some(exif) = EXIF_RE.captures(text.as_str()) { 437 | let data = exif[1].replace("", ""); 438 | for cap in EXIF_DATA_RE.captures_iter(&data) { 439 | exif_data.insert(String::from(&cap[1]), String::from(&cap[2])); 440 | } 441 | } 442 | if let Some(draw) = DRAW_RE.captures(text.as_str()) { 443 | time = Some(String::from(&draw[1])); 444 | painter = Some(String::from(&draw[2])); 445 | source = draw.get(3).map(|source| Post::clean_heavy(source.as_str())) 446 | } 447 | } 448 | 449 | Self { 450 | unique_ips: if post.unique_ips == 0 { 451 | None 452 | } else { 453 | Some(post.unique_ips.to_string()) 454 | }, 455 | since4pass: post 456 | .since4pass 457 | .map(|x| if x == 0 { None } else { Some(x.to_string()) }) 458 | .flatten(), 459 | troll_country: post.troll_country.clone(), 460 | time, 461 | painter, 462 | source, 463 | exif_data, 464 | } 465 | } 466 | 467 | fn is_empty(&self) -> bool { 468 | self.unique_ips.is_none() 469 | && self.since4pass.is_none() 470 | && self.troll_country.is_none() 471 | && self.time.is_none() 472 | && self.painter.is_none() 473 | && self.source.is_none() 474 | && self.exif_data.is_empty() 475 | } 476 | } 477 | 478 | fn bool_from_int<'de, D: Deserializer<'de>>(deserializer: D) -> Result { 479 | match u8::deserialize(deserializer)? { 480 | 0 => Ok(false), 481 | 1 => Ok(true), 482 | other => Err(de::Error::invalid_value( 483 | Unexpected::Unsigned(other as u64), 484 | &"zero or one", 485 | )), 486 | } 487 | } 488 | -------------------------------------------------------------------------------- /src/util/pgs/reindex.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use std::iter::IntoIterator; 3 | use std::str::FromStr; 4 | use std::sync::{ 5 | atomic::{AtomicBool, Ordering}, 6 | Arc, 7 | }; 8 | 9 | use clap::ArgMatches; 10 | use futures::prelude::*; 11 | use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; 12 | use log::{error, info}; 13 | use memchr::memchr; 14 | use thiserror::Error; 15 | use tokio_postgres::types::ToSql; 16 | 17 | use crate::storage::search_pg::PLACEHOLDERS; 18 | use crate::util::interval_lock::IntervalLock; 19 | 20 | #[derive(Debug, Error)] 21 | pub enum Error { 22 | #[error("invalid database pool size")] 23 | InvalidPoolSize, 24 | #[error("invalid database URL provided: {}", .0)] 25 | InvalidDatabase(tokio_postgres::Error), 26 | #[error("pg database connection error: {}", .0)] 27 | Pool(#[from] deadpool_postgres::PoolError), 28 | #[error("pg database error: {}", .0)] 29 | DB(#[from] tokio_postgres::Error), 30 | #[error("io error: {}", .0)] 31 | IO(#[from] std::io::Error), 32 | #[error("mysql error: {}", .0)] 33 | MySQL(#[from] sqlx::Error), 34 | } 35 | 36 | #[derive(sqlx::FromRow, Debug)] 37 | struct BoardInfo { 38 | board: String, 39 | records: i64, 40 | } 41 | 42 | #[derive(sqlx::FromRow, Debug)] 43 | struct Post { 44 | board: String, 45 | thread_num: u32, 46 | num: u32, 47 | title: Option, 48 | name: Option, 49 | trip: Option, 50 | email: Option, 51 | poster_hash: Option, 52 | poster_country: Option, 53 | media_filename: Option, 54 | media_hash: Option, 55 | media_w: 
Option, 56 | media_h: Option, 57 | timestamp: i64, 58 | comment: Option, 59 | deleted: bool, 60 | sticky: bool, 61 | spoiler: bool, 62 | op: bool, 63 | capcode: Option, 64 | } 65 | 66 | struct PGSReIndex { 67 | pg: deadpool_postgres::Pool, 68 | mysql: sqlx::MySqlPool, 69 | tables: Vec, 70 | write_streams: usize, 71 | } 72 | 73 | fn lookup_query>(board: T) -> String { 74 | format!( 75 | r"SELECT 76 | '{}' AS board, 77 | `thread_num`, 78 | `num`, 79 | `title`, 80 | `name`, 81 | `trip`, 82 | `email`, 83 | `poster_hash`, 84 | `poster_country`, 85 | `media_filename`, 86 | `media_hash`, 87 | `media_w`, 88 | `media_h`, 89 | COALESCE(UNIX_TIMESTAMP(`unix_timestamp`), `timestamp`) AS `timestamp`, 90 | `comment`, 91 | `deleted`, 92 | `sticky`, 93 | `spoiler`, 94 | `op`, 95 | `capcode` 96 | FROM 97 | `{}`", 98 | board.as_ref(), 99 | board.as_ref() 100 | ) 101 | } 102 | 103 | impl PGSReIndex { 104 | async fn new, T: IntoIterator>( 105 | mut pg_url: url::Url, 106 | source_url: url::Url, 107 | tables: T, 108 | write_streams: usize, 109 | ) -> Result { 110 | let pool_size = pg_url 111 | .query_pairs() 112 | .find(|x| x.0 == "pool_size") 113 | .map(|x| x.1.parse::()); 114 | let pool_size = match pool_size { 115 | Some(p) => match p { 116 | Ok(s) => s, 117 | Err(_) => return Err(Error::InvalidPoolSize), 118 | }, 119 | None => 16, 120 | }; 121 | 122 | let pg_url = { 123 | let x = pg_url.clone(); 124 | let pairs = x.query_pairs().filter(|x| x.0 != "pool_size"); 125 | pg_url.query_pairs_mut().clear().extend_pairs(pairs); 126 | pg_url 127 | }; 128 | 129 | let config = match tokio_postgres::Config::from_str(&pg_url.to_string()) { 130 | Ok(c) => c, 131 | Err(err) => return Err(Error::InvalidDatabase(err)), 132 | }; 133 | let manager = deadpool_postgres::Manager::new(config, tokio_postgres::NoTls); 134 | let pg_pool = deadpool_postgres::Pool::new(manager, pool_size); 135 | info!( 136 | "Connecting to postgres at {}...", 137 | pg_url.host_str().unwrap() 138 | ); 139 | drop(pg_pool.get().await?); 140 | 141 | info!( 142 | "Connecting to MySQL at {}...", 143 | source_url.host_str().unwrap() 144 | ); 145 | let mysql_pool = sqlx::MySqlPool::connect(&source_url.to_string()).await?; 146 | // drop(mysql_pool.get_conn().await?); 147 | 148 | Ok(Self { 149 | pg: pg_pool, 150 | mysql: mysql_pool, 151 | tables: tables 152 | .into_iter() 153 | .map(|x| String::from(x.as_ref())) 154 | .collect(), 155 | write_streams, 156 | }) 157 | } 158 | 159 | pub async fn build(self) -> Result<(), Error> { 160 | let boards = self.tables; 161 | let mysql_pool = self.mysql; 162 | let pg_pool = Arc::new(self.pg); 163 | let write_streams = self.write_streams; 164 | let m = Arc::new(MultiProgress::new()); 165 | let sty = ProgressStyle::default_bar() 166 | .template("{spinner:.green} {msg} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos:>7}/{len:7} ({per_sec}, {eta})") 167 | .progress_chars("#>-"); 168 | let joined = AtomicBool::new(false); 169 | info!("Querying table information..."); 170 | let boards = stream::iter(boards) 171 | .then(|board| { 172 | mysql_pool.acquire().map(move |conn| match conn { 173 | Ok(c) => Ok((board, c)), 174 | Err(err) => Err(Error::from(err)), 175 | }) 176 | }) 177 | .and_then(|(board, mut conn)| async move { 178 | info!("\t{}...", board); 179 | let info = sqlx::query_as::<_, BoardInfo>(&format!( 180 | "SELECT '{}' AS board, COUNT(1) AS records FROM `{}`", 181 | board, board 182 | )) 183 | .fetch_one(&mut conn) 184 | .await?; 185 | Ok(info) 186 | }) 187 | .map_ok(|info| { 188 | let pb = m.add(ProgressBar::new(info.records 
as u64)); 189 | pb.set_style(sty.clone()); 190 | pb.set_message(info.board.as_str()); 191 | pb.set_position(0); 192 | (info, pb) 193 | }) 194 | .try_collect::>() 195 | .await?; 196 | 197 | info!("Starting transfer..."); 198 | let rows = stream::iter(boards) 199 | .then(|(info, pb)| { 200 | mysql_pool.acquire().map(move |conn| match conn { 201 | Ok(c) => Ok((info, c, pb)), 202 | Err(err) => Err(Error::from(err)), 203 | }) 204 | }) 205 | .and_then(|(info, mut conn, pb)| { 206 | if !joined.fetch_or(true, Ordering::AcqRel) { 207 | let m = m.clone(); 208 | tokio::spawn(tokio::task::spawn_blocking(move || { 209 | // do some compute-heavy work or call synchronous code 210 | m.join() 211 | })); 212 | } 213 | let interval_lock: IntervalLock = Default::default(); 214 | let lookup = lookup_query(&info.board); 215 | let pg_pool = pg_pool.clone(); 216 | async move { 217 | sqlx::query_as::<_, Post>(&lookup) 218 | .fetch(&mut conn) 219 | .chunks(1280) 220 | .map(|posts| { 221 | let (mut low, mut high) = (u64::MAX, 0); 222 | let posts = posts 223 | .into_iter() 224 | .inspect(|post| { 225 | if let Ok(post) = post { 226 | low = low.min(post.num as u64); 227 | high = high.max(post.num as u64); 228 | } 229 | }) 230 | .collect::, _>>() 231 | .map_err(|err| Error::from(err)); 232 | posts.map(|posts| (posts, (low, high))) 233 | }) 234 | .map_ok(|(posts, idx_range)| { 235 | let rows = posts.len(); 236 | let query = "INSERT INTO 237 | posts 238 | (board, thread_no, post_no, subject, username, tripcode, 239 | email, unique_id, since4_pass, country, filename, 240 | image_hash, image_width, image_height, ts, comment, deleted, 241 | ghost, sticky, spoiler, op, capcode) VALUES "; 242 | let stmt = std::iter::once(Cow::Borrowed(query)) 243 | .chain((0..rows).map(|i| { 244 | let z = i * 22; 245 | Cow::Owned( 246 | [ 247 | if i == 0 { "(" } else { "\n,(" }, 248 | PLACEHOLDERS[z], // board 249 | ",", 250 | PLACEHOLDERS[z + 1], // thread_no 251 | ",", 252 | PLACEHOLDERS[z + 2], // post_no 253 | ",to_tsvector(", 254 | PLACEHOLDERS[z + 3], // subject 255 | "),to_tsvector(", 256 | PLACEHOLDERS[z + 4], // username 257 | "),to_tsvector(", 258 | PLACEHOLDERS[z + 5], // tripcode 259 | "),to_tsvector(", 260 | PLACEHOLDERS[z + 6], // email 261 | "),", 262 | PLACEHOLDERS[z + 7], // unique_id 263 | ",", 264 | PLACEHOLDERS[z + 8], // since4_pass 265 | ",", 266 | PLACEHOLDERS[z + 9], // country 267 | ",to_tsvector(REPLACE(", 268 | PLACEHOLDERS[z + 10], // filename 269 | ",'.',' ')),", 270 | PLACEHOLDERS[z + 11], // image_hash 271 | ",", 272 | PLACEHOLDERS[z + 12], // image_width 273 | ",", 274 | PLACEHOLDERS[z + 13], // image_height 275 | ",TO_TIMESTAMP(CAST(", 276 | PLACEHOLDERS[z + 14], // ts 277 | "::INT8 AS FLOAT8)),to_tsvector(", 278 | PLACEHOLDERS[z + 15], // comment 279 | "),", 280 | PLACEHOLDERS[z + 16], // deleted 281 | ",", 282 | PLACEHOLDERS[z + 17], // ghost 283 | ",", 284 | PLACEHOLDERS[z + 18], // sticky 285 | ",", 286 | PLACEHOLDERS[z + 19], // spoiler 287 | ",", 288 | PLACEHOLDERS[z + 20], // op 289 | ",", 290 | PLACEHOLDERS[z + 21], // capcode 291 | ")", 292 | ] 293 | .join(""), 294 | ) 295 | })) 296 | .chain(std::iter::once(Cow::Borrowed(" ON CONFLICT DO NOTHING"))) 297 | .collect::(); 298 | 299 | let params = posts 300 | .into_iter() 301 | .map(|post| { 302 | let values: Box<[Box]> = Box::new([ 303 | Box::new(post.board), 304 | Box::new(post.thread_num as i64), 305 | Box::new(post.num as i64), 306 | Box::new(post.title.map(str_sanitize)), 307 | Box::new(post.name.map(str_sanitize)), 308 | 
Box::new(post.trip.map(str_sanitize)), 309 | Box::new(post.email.map(str_sanitize)), 310 | Box::new(post.poster_hash.map(str_sanitize)), 311 | Box::new(None::), 312 | Box::new(post.poster_country.map(str_sanitize)), 313 | Box::new(post.media_filename.map(str_sanitize)), 314 | Box::new(post.media_hash.map(str_sanitize)), 315 | Box::new(post.media_w.map(|x| x as i32)), 316 | Box::new(post.media_h.map(|x| x as i32)), 317 | Box::new(post.timestamp), 318 | Box::new(post.comment.map(str_sanitize)), 319 | Box::new(post.deleted), 320 | Box::new(false), 321 | Box::new(post.sticky), 322 | Box::new(post.spoiler), 323 | Box::new(post.op), 324 | Box::new( 325 | post.capcode 326 | .map(|c| { 327 | c.chars() 328 | .filter(char::is_ascii) 329 | .next() 330 | .map(|c| c as i32) 331 | }) 332 | .flatten(), 333 | ), 334 | ]); 335 | values.into_vec() 336 | }) 337 | .flatten() 338 | .collect::>>(); 339 | 340 | interval_lock 341 | .acquire(idx_range) 342 | .map(move |g| (g, rows)) 343 | .then(|(guard, rows)| { 344 | pg_pool.get().map_err(|err| Error::from(err)).and_then( 345 | move |pg_conn| async move { 346 | pg_conn 347 | .execute_raw( 348 | stmt.as_str(), 349 | params.iter().map(|x| x.as_ref()), 350 | ) 351 | .map_ok(|written| (written, rows)) 352 | .map_err(|err| Error::from(err)) 353 | .inspect(move |_| drop(guard)) 354 | .await 355 | }, 356 | ) 357 | }) 358 | }) 359 | .try_buffer_unordered(write_streams) 360 | .try_fold(0, |acc, (written, rows)| { 361 | pb.inc(rows as u64); 362 | futures::future::ready(Ok(acc + written)) 363 | }) 364 | .inspect(|r| { 365 | if r.is_ok() { 366 | pb.finish_with_message(&info.board); 367 | } else { 368 | pb.finish_at_current_pos(); 369 | } 370 | }) 371 | .await 372 | } 373 | }) 374 | .try_fold(0, |acc, rows| futures::future::ready(Ok(acc + rows))) 375 | .await?; 376 | 377 | info!("Finished. 
382 | fn str_sanitize(input: String) -> String {
383 |     match memchr(0, input.as_bytes()) {
384 |         Some(_) => input.replace(char::from(0), ""),
385 |         None => input,
386 |     }
387 | }
388 |
389 | pub fn reindex<'a>(matches: &ArgMatches<'a>) -> i32 {
390 |     info!("Running pg-search re-indexer");
391 |
392 |     let postgres_url: url::Url = match matches.value_of("postgres").unwrap().parse() {
393 |         Ok(c) => c,
394 |         Err(err) => {
395 |             error!("Failed to parse postgres uri: {}", err);
396 |             return 1;
397 |         }
398 |     };
399 |     let mysql_url: url::Url = match matches.value_of("mysql").unwrap().parse() {
400 |         Ok(c) => c,
401 |         Err(err) => {
402 |             error!("Failed to parse mysql uri: {}", err);
403 |             return 1;
404 |         }
405 |     };
406 |
407 |     let write_streams: usize = match matches.value_of("write-streams").unwrap().parse() {
408 |         Ok(0) => {
409 |             error!("Invalid number 0 for write-streams");
410 |             return 1;
411 |         }
412 |         Ok(c) => c,
413 |         Err(err) => {
414 |             error!("Failed to parse write-streams: {}", err);
415 |             return 1;
416 |         }
417 |     };
418 |
419 |     let boards = matches
420 |         .values_of("boards")
421 |         .unwrap()
422 |         .map(|x| String::from(x))
423 |         .collect::<Vec<_>>();
424 |
425 |     info!("Importing Boards: \"{}\"", boards.join("\",\""));
426 |
427 |     let mut runtime = tokio::runtime::Builder::new()
428 |         .threaded_scheduler()
429 |         .enable_all()
430 |         .thread_name("torako")
431 |         .build()
432 |         .unwrap();
433 |
434 |     let r = PGSReIndex::new(postgres_url, mysql_url, boards, write_streams);
435 |     let r = runtime.block_on(r.and_then(|r| r.build()));
436 |
437 |     match r {
438 |         Ok(_) => 0,
439 |         Err(err) => {
440 |             error!("Reindexing failed: {}", err);
441 |             1
442 |         }
443 |     }
444 | }
445 |
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | #![type_length_limit = "5136020"]
2 |
3 | #[cfg(all(feature = "jemalloc", not(target_env = "msvc")))]
4 | use jemallocator::Jemalloc;
5 |
6 | #[cfg(all(feature = "jemalloc", not(target_env = "msvc")))]
7 | #[global_allocator]
8 | static GLOBAL: Jemalloc = Jemalloc;
9 |
10 | use std::collections::{HashMap, HashSet};
11 | use std::env;
12 | use std::hash::BuildHasherDefault;
13 | use std::panic;
14 | use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
15 | use std::sync::Arc;
16 | use std::time::Duration;
17 |
18 | use clap::{crate_version, App, AppSettings, Arg, ArgMatches, SubCommand};
19 | use futures::prelude::*;
20 | use log::{error, info, warn};
21 | use pretty_env_logger;
22 | use thiserror::Error;
23 |
24 | mod api;
25 | mod config;
26 | mod feed;
27 | mod imageboard;
28 | mod storage;
29 | mod util;
30 |
31 | pub use feed::FeedSinkExt;
32 |
33 | pub type SeaHashMap = HashMap>;
34 | pub type SeaHashSet = HashSet>;
35 |
36 | #[derive(Debug, Error)]
37 | pub enum Error {
38 |     #[error("Asagi: {}", .0)]
39 |     Asagi(#[from] storage::asagi::Error),
40 |     #[error("PG Search: {}", .0)]
41 |     PG(#[from] storage::search_pg::Error),
42 |     #[error("Lnx Search: {}", .0)]
43 |     Lnx(#[from] storage::search_lnx::Error),
44 | }
45 |
46 | async fn run_async(config: config::Config) -> i32 {
47 |     println!("Torako: Imageboard Archiver");
48 |     println!("\tVersion: {}", crate_version!());
49 |     println!("\tRepo: https://github.com/miyachan/torako");
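// [editor's note - annotation added in this revision, not present in the original source]
// A single reqwest::Client built below is shared by every board stream. If proxies are
// configured, reqwest::Proxy::custom rotates through them round-robin via the `rr` counter;
// when request_only_proxy is false an extra `None` entry stays in the rotation so direct
// (un-proxied) requests are interleaved with proxied ones.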
50 |     let http_client = {
51 |         let mut b = reqwest::Client::builder();
52 |         if let Some(timeout) = config.request_timeout {
53 |             b = b.timeout(timeout);
54 |         }
55 |         if config.request_only_proxy && config.request_proxy.is_empty() {
56 |             error!("Configuration error: request_only_proxy is true but no proxies were provided");
57 |             return 1;
58 |         }
59 |         if !config.request_proxy.is_empty() {
60 |             let rr = AtomicUsize::new(0);
61 |             let proxies = std::iter::once(None)
62 |                 .filter(|_| !config.request_only_proxy)
63 |                 .chain(config.request_proxy.iter().cloned().map(|p| Some(p)))
64 |                 .collect::<Vec<_>>();
65 |             b = b.proxy(reqwest::Proxy::custom(move |_| {
66 |                 let r = rr.fetch_add(1, Ordering::AcqRel);
67 |                 proxies.get(r % proxies.len()).unwrap().clone()
68 |             }));
69 |         }
70 |         b.build().unwrap()
71 |     };
72 |     let rate_limiter = config.rate_limit.map(|limit| {
73 |         let quota = governor::Quota::per_second(limit);
74 |         Arc::new(governor::RateLimiter::direct(quota))
75 |     });
76 |     let concurrency_limiter = config.thread_concurrency.map(|limit| {
77 |         let sem = tokio::sync::Semaphore::new(limit.get());
78 |         Arc::new(sem)
79 |     });
80 |
81 |     let boards = config
82 |         .boards
83 |         .boards
84 |         .iter()
85 |         .map(|(name, board)| {
86 |             info!("Archiving board '{}'", name);
87 |             imageboard::BoardStream::new(
88 |                 http_client.clone(),
89 |                 board.tls.or(config.boards.tls).unwrap_or(true),
90 |                 board
91 |                     .host
92 |                     .as_ref()
93 |                     .cloned()
94 |                     .or(config.boards.host.as_ref().cloned())
95 |                     .unwrap_or(String::from("a.4cdn.org")),
96 |                 name,
97 |                 board
98 |                     .refresh_rate
99 |                     .or(config.boards.refresh_rate)
100 |                     .unwrap_or(Duration::from_secs(10)),
101 |                 rate_limiter.clone(),
102 |                 concurrency_limiter.clone(),
103 |                 board
104 |                     .deleted_page_threshold
105 |                     .or(config.boards.deleted_page_threshold)
106 |                     .unwrap_or(8),
107 |                 board
108 |                     .url_media_filename
109 |                     .or(config.boards.url_media_filename)
110 |                     .unwrap_or(false),
111 |             )
112 |         })
113 |         .collect::<Vec<_>>();
114 |     if boards.len() == 0 {
115 |         error!("No boards were configured for archiving!");
116 |         return 1;
117 |     }
118 |
119 |     let asagi = match config.backend.asagi.as_ref() {
120 |         Some(asagi_conf) if !asagi_conf.disabled => {
121 |             let asagi = {
122 |                 config
123 |                     .boards
124 |                     .boards
125 |                     .iter()
126 |                     .map(|(name, board)| {
127 |                         (
128 |                             name.clone(),
129 |                             board
130 |                                 .download_thumbs
131 |                                 .or(config.boards.download_thumbs)
132 |                                 .unwrap_or(true),
133 |                             board
134 |                                 .download_media
135 |                                 .or(config.boards.download_media)
136 |                                 .unwrap_or(true),
137 |                         )
138 |                     })
139 |                     .fold(
140 |                         storage::asagi::AsagiBuilder::from(asagi_conf),
141 |                         |acc, (name, thumbs, media)| {
142 |                             acc.with_board(
143 |                                 &name,
144 |                                 thumbs,
145 |                                 media,
146 |                                 asagi_conf
147 |                                     .boards
148 |                                     .get(&name)
149 |                                     .and_then(|x| x.media_storage.clone()),
150 |                                 asagi_conf
151 |                                     .boards
152 |                                     .get(&name)
153 |                                     .and_then(|x| x.thumb_storage.clone()),
154 |                             )
155 |                         },
156 |                     )
157 |                     .with_http_client(http_client.clone())
158 |                     .build()
159 |                     .await
160 |             };
161 |
162 |             match asagi {
163 |                 Ok(a) => Some(a),
164 |                 Err(err) => {
165 |                     error!("Failed to initialize asagi storage backend: {}", err);
166 |                     return 1;
167 |                 }
168 |             }
169 |         }
170 |         _ => None,
171 |     };
172 |
173 |     let search = match config.backend.asagi_pg_search.as_ref() {
174 |         Some(conf) if !conf.disabled => {
175 |             let builder = storage::search_pg::SearchBuilder::from(conf);
176 |             match builder.build().await {
177 |                 Ok(a) => Some(a),
178 |                 Err(err) => {
179 |                     error!(
180 |                         "Failed to initialize asagi pg storage search backend: {}",
181 |                         err
182 |                     );
183 |                     return 1;
184 |                 }
185 |             }
186 |         }
187 |         _ => None,
188 |     };
189 |
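// [editor's note - annotation added in this revision, not present in the original source]
// Like the Asagi (MySQL) and Postgres full-text backends above, the lnx search backend below is
// optional: it is only built when its config section is present and not disabled. Whichever
// sinks end up enabled are fanned out together further down.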
190 |     let search_lnx = match config.backend.asagi_lnx_search.as_ref() {
191 |         Some(conf) if !conf.disabled => {
192 |             let builder = storage::search_lnx::SearchBuilder::from(conf);
193 |             match builder.build().await {
194 |                 Ok(a) => Some(a),
195 |                 Err(err) => {
196 |                     error!(
197 |                         "Failed to initialize asagi lnx storage search backend: {}",
198 |                         err
199 |                     );
200 |                     return 1;
201 |                 }
202 |             }
203 |         }
204 |         _ => None,
205 |     };
206 |
207 |     let running = AtomicBool::new(true);
208 |     let (end_stream, stream_ender) = tokio::sync::oneshot::channel();
209 |     let mut end_stream = Some(end_stream);
210 |     ctrlc::set_handler(move || {
211 |         match running.swap(false, Ordering::SeqCst) {
212 |             true => {
213 |                 if let Some(end_stream) = end_stream.take() {
214 |                     let _ = end_stream.send(());
215 |                 }
216 |                 warn!("Received SIGTERM/SIGINT signal. Torako will try to exit cleanly (by waiting for all current posts/images to finish downloading).");
217 |                 warn!("Sending a second signal will forcefully stop Torako.");
218 |             },
219 |             false => {
220 |                 error!("Exiting torako...");
221 |                 std::process::exit(0);
222 |             }
223 |         }
224 |     }).expect("Error setting Ctrl-C handler");
225 |
226 |     let board_metrics = boards.iter().map(|x| x.metrics()).collect();
227 |     let mut storage_metrics: Vec> = vec![];
228 |     if let Some(asagi) = asagi.as_ref() {
229 |         storage_metrics.push(Box::new(asagi.metrics_provider()));
230 |     }
231 |     if let Some(search) = search.as_ref() {
232 |         storage_metrics.push(Box::new(search.metrics_provider()));
233 |     }
234 |     if let Some(search) = search_lnx.as_ref() {
235 |         storage_metrics.push(Box::new(search.metrics_provider()));
236 |     }
237 |
238 |     if let Some(addr) = config.api_addr {
239 |         let addr_interface = config.api_addr_interface;
240 |         tokio::spawn(async move {
241 |             api::serve(addr, addr_interface, board_metrics, storage_metrics).await;
242 |         });
243 |     }
244 |
245 |     info!("Initialization complete.");
246 |     info!("Starting archiving process...");
247 |     let boards_stream = futures::stream::select_all(boards).map(|x| Some(x));
248 |     let mut boards_stream =
249 |         futures::stream::select(boards_stream, stream_ender.map(|_| None).into_stream())
250 |             .take_while(|x| future::ready(x.is_some()))
251 |             .map(|x| Ok(x.unwrap()));
252 |     // let res = asagi.feed_all(&mut boards_stream);
253 |     let mut asagi = asagi.map(|asagi| asagi.sink_map_err(|err| Error::from(err)));
254 |     let mut search = search.map(|search| search.sink_map_err(|err| Error::from(err)));
255 |     let mut search_lnx = search_lnx.map(|search| search.sink_map_err(|err| Error::from(err)));
256 |     let null = config
257 |         .backend
258 |         .null
259 |         .and_then(|n| match n.disabled {
260 |             true => None,
261 |             false => Some(()),
262 |         })
263 |         .map(|_| futures::sink::drain().sink_map_err(|_| unreachable!()));
264 |
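// [editor's note - annotation added in this revision, not present in the original source]
// Every enabled backend is a Sink over the post stream; when more than one is enabled they are
// combined with SinkExt::fanout, which forwards a clone of each item to both sinks, so the match
// below simply enumerates the possible combinations of the three backends (plus the `null` drain
// used when no real backend is configured).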
265 |     let res = match (asagi.as_mut(), search.as_mut(), search_lnx.as_mut()) {
266 |         (Some(asagi), None, None) => asagi.feed_all(&mut boards_stream).await,
267 |         (None, Some(search), None) => search.feed_all(&mut boards_stream).await,
268 |         (None, None, Some(search)) => search.feed_all(&mut boards_stream).await,
269 |         (Some(asagi), Some(search), None) => {
270 |             let mut asagi = asagi.fanout(search);
271 |             asagi.feed_all(&mut boards_stream).await
272 |         }
273 |         (Some(asagi), None, Some(search)) => {
274 |             let mut asagi = asagi.fanout(search);
275 |             asagi.feed_all(&mut boards_stream).await
276 |         }
277 |         (None, Some(search), Some(search_lnx)) => {
278 |             let mut search = search.fanout(search_lnx);
279 |             search.feed_all(&mut boards_stream).await
280 |         }
281 |         (Some(asagi), Some(search), Some(search_lnx)) => {
282 |             let asagi = asagi.fanout(search);
283 |             let mut asagi = asagi.fanout(search_lnx);
284 |             asagi.feed_all(&mut boards_stream).await
285 |         }
286 |         (None, None, None) if null.is_some() => null.unwrap().feed_all(&mut boards_stream).await,
287 |         (None, None, None) => {
288 |             error!("No valid storage backend was configured.");
289 |             return 1;
290 |         }
291 |     };
292 |     match res {
293 |         Ok(_) => {
294 |             let asagi_close = match asagi.as_mut() {
295 |                 Some(asagi) => futures::future::Either::Left(asagi.close()),
296 |                 None => futures::future::Either::Right(futures::future::ready(Ok(()))),
297 |             };
298 |             let search_close = match search.as_mut() {
299 |                 Some(search) => futures::future::Either::Left(search.close()),
300 |                 None => futures::future::Either::Right(futures::future::ready(Ok(()))),
301 |             };
302 |             let search_lnx_close = match search_lnx.as_mut() {
303 |                 Some(search) => futures::future::Either::Left(search.close()),
304 |                 None => futures::future::Either::Right(futures::future::ready(Ok(()))),
305 |             };
306 |             let close = futures::future::join(asagi_close, search_close).map(|(a, b)| a.and(b));
307 |             let close = futures::future::join(close, search_lnx_close)
308 |                 .map(|(a, b)| a.and(b))
309 |                 .await;
310 |             match close {
311 |                 Ok(_) => {
312 |                     info!("Goodbye.");
313 |                     0
314 |                 }
315 |                 Err(err) => {
316 |                     error!("An error occurred shutting down: {}", err);
317 |                     1
318 |                 }
319 |             }
320 |         }
321 |         Err(err) => {
322 |             error!("Torako failed: {}", err);
323 |             1
324 |         }
325 |     }
326 | }
327 |
328 | fn run<'a>(matches: ArgMatches<'a>) -> i32 {
329 |     match matches.subcommand() {
330 |         #[cfg(feature = "pgs-reindex")]
331 |         ("pgs-reindex", Some(sub)) => return util::pgs::reindex(sub),
332 |         #[cfg(feature = "lnx-reindex")]
333 |         ("lnx-reindex", Some(sub)) => return util::lnx::reindex(sub),
334 |         ("boo", Some(sub)) => return util::boo(sub),
335 |         _ => (),
336 |     };
337 |
338 |     let config: config::Config = {
339 |         let config_str = match std::fs::read_to_string(matches.value_of("config").unwrap()) {
340 |             Ok(s) => s,
341 |             Err(err) => {
342 |                 error!(
343 |                     "Failed to read configuration @ {}: {}",
344 |                     matches.value_of("config").unwrap(),
345 |                     err
346 |                 );
347 |                 return 1;
348 |             }
349 |         };
350 |         match toml::from_str(&config_str) {
351 |             Ok(c) => c,
352 |             Err(err) => {
353 |                 error!(
354 |                     "Failed to parse configuration file @ {}: {}",
355 |                     matches.value_of("config").unwrap(),
356 |                     err
357 |                 );
358 |                 return 1;
359 |             }
360 |         }
361 |     };
362 |
363 |     let mut runtime = tokio::runtime::Builder::new()
364 |         .threaded_scheduler()
365 |         .enable_all()
366 |         .thread_name("torako")
367 |         .build()
368 |         .unwrap();
369 |
370 |     let r = runtime.block_on(run_async(config));
371 |
372 |     r
373 | }
374 |
375 | fn main() {
376 |     let orig_hook = panic::take_hook();
377 |     panic::set_hook(Box::new(move |panic_info| {
378 |         // invoke the default handler and exit the process
379 |         orig_hook(panic_info);
380 |         std::process::exit(1);
381 |     }));
382 |
383 |     if env::var("TORAKO_LOG").is_err() {
384 |         env::set_var("TORAKO_LOG", "torako=info")
385 |     }
386 |     pretty_env_logger::try_init_timed_custom_env("TORAKO_LOG").unwrap();
387 |
388 |     let pgs_reindex: Option> = {
389 |         #[cfg(feature = "pgs-reindex")]
390 |         {
391 |             let num_cpus: &'static str = Box::leak(num_cpus::get().to_string().into_boxed_str());
392 |             Some(
393 |                 SubCommand::with_name("pgs-reindex")
394 |                     .about("Reindex a postgres search database from MySQL Asagi")
395 |                     .arg(
396 |                         Arg::with_name("postgres")
397 |                             .long("postgres")
398 |                             .value_name("POSTGRES URL")
399 |                             .takes_value(true)
400 |                             .required(true),
401 |                     )
402 |                     .arg(
403 |                         Arg::with_name("mysql")
404 |                             .long("mysql")
405 |                             .value_name("MYSQL URL")
406 |                             .takes_value(true)
407 |                             .required(true),
408 |                     )
409 |                     .arg(
410 |                         Arg::with_name("write-streams")
411 |                             .long("write-streams")
412 |                             .value_name("COUNT")
413 |                             .default_value(num_cpus)
414 |                             .takes_value(true)
415 |                             .required(true),
416 |                     )
417 |                     .arg(Arg::with_name("boards").multiple(true).required(true)),
418 |             )
419 |         }
420 |
421 |         #[cfg(not(feature = "pgs-reindex"))]
422 |         {
423 |             None
424 |         }
425 |     };
426 |
427 |     let lnx_reindex: Option> = {
428 |         #[cfg(feature = "lnx-reindex")]
429 |         {
430 |             let num_cpus: &'static str = Box::leak(num_cpus::get().to_string().into_boxed_str());
431 |             Some(
432 |                 SubCommand::with_name("lnx-reindex")
433 |                     .about("Reindex a lnx search database from MySQL Asagi")
434 |                     .arg(
435 |                         Arg::with_name("lnx")
436 |                             .long("lnx")
437 |                             .value_name("LNX URL")
438 |                             .takes_value(true)
439 |                             .required(true),
440 |                     )
441 |                     .arg(
442 |                         Arg::with_name("index")
443 |                             .long("index")
444 |                             .value_name("INDEX")
445 |                             .takes_value(true)
446 |                             .required(true),
447 |                     )
448 |                     .arg(
449 |                         Arg::with_name("authentication-key")
450 |                             .long("authentication-key")
451 |                             .value_name("KEY")
452 |                             .takes_value(true),
453 |                     )
454 |                     .arg(
455 |                         Arg::with_name("mysql")
456 |                             .long("mysql")
457 |                             .value_name("MYSQL URL")
458 |                             .takes_value(true)
459 |                             .required(true),
460 |                     )
461 |                     .arg(
462 |                         Arg::with_name("write-streams")
463 |                             .long("write-streams")
464 |                             .value_name("COUNT")
465 |                             .default_value(num_cpus)
466 |                             .takes_value(true),
467 |                     )
468 |                     .arg(
469 |                         Arg::with_name("commit-interval")
470 |                             .long("commit-interval")
471 |                             .value_name("SECONDS")
472 |                             .default_value("0")
473 |                             .takes_value(true),
474 |                     )
475 |                     .arg(
476 |                         Arg::with_name("request-timeout")
477 |                             .long("request-timeout")
478 |                             .value_name("SECONDS")
479 |                             .default_value("600")
480 |                             .takes_value(true),
481 |                     )
482 |                     .arg(Arg::with_name("boards").multiple(true).required(true)),
483 |             )
484 |         }
485 |
486 |         #[cfg(not(feature = "lnx-reindex"))]
487 |         {
488 |             None
489 |         }
490 |     };
491 |
492 |     let matches = App::new("Torako")
493 |         .author("github.com/miyachan")
494 |         .about("Torako: Imageboard archiver backend.")
495 |         .version(crate_version!())
496 |         .arg(
497 |             Arg::with_name("config")
498 |                 .long("config")
499 |                 .short("c")
500 |                 .help("Path to configuration file")
501 |                 .takes_value(true)
502 |                 .value_name("FILE")
503 |                 .default_value("./Torako.toml")
504 |                 .env("CONFIG"),
505 |         )
506 |         .subcommand(SubCommand::with_name("boo").setting(AppSettings::Hidden))
507 |         .subcommands(pgs_reindex.into_iter())
508 |         .subcommands(lnx_reindex.into_iter())
509 |         .get_matches();
510 |
511 |     match run(matches) {
512 |         0 => return,
513 |         i => std::process::exit(i),
514 |     };
515 | }
516 |
--------------------------------------------------------------------------------