├── .gitignore ├── LICENSE ├── README.md └── projects ├── Cargo.toml ├── hst-cli ├── Cargo.toml └── src │ └── lib.rs ├── hst-deactivations ├── Cargo.toml └── src │ └── lib.rs ├── hst-tools ├── Cargo.toml └── src │ └── bin │ ├── hst-tw-db.rs │ └── hst-tw-images.rs ├── hst-tw-db ├── Cargo.toml └── src │ ├── lib.rs │ └── table.rs ├── hst-tw-images ├── Cargo.toml └── src │ ├── error.rs │ ├── lib.rs │ ├── model.rs │ └── store.rs ├── hst-tw-profiles ├── Cargo.toml ├── schemas │ └── avro │ │ └── user.avsc └── src │ ├── archive.rs │ ├── avro.rs │ ├── lib.rs │ ├── model.rs │ └── stream │ └── mod.rs └── hst-tw-utils ├── Cargo.toml └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | projects/Cargo.lock 2 | projects/target/ 3 | projects/*/target/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ANTI-CAPITALIST SOFTWARE LICENSE (v 1.4) 2 | 3 | Copyright © 2022 Travis Brown 4 | 5 | This is anti-capitalist software, released for free use by individuals and organizations that do not operate by capitalist principles. 6 | 7 | Permission is hereby granted, free of charge, to any person or organization (the "User") obtaining a copy of this software and associated documentation files (the "Software"), to use, copy, modify, merge, distribute, and/or sell copies of the Software, subject to the following conditions: 8 | 9 | 1. The above copyright notice and this permission notice shall be included in all copies or modified versions of the Software. 10 | 11 | 2. The User is one of the following: 12 | a. An individual person, laboring for themselves 13 | b. A non-profit organization 14 | c. An educational institution 15 | d. An organization that seeks shared profit for all of its members, and allows non-members to set the cost of their labor 16 | 17 | 3. 
If the User is an organization with owners, then all owners are workers and all workers are owners with equal equity and/or equal vote. 18 | 19 | 4. If the User is an organization, then the User is not law enforcement or military, or working for or under either. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hassreden-Tracker 2 | 3 | This repository is currently being used to coordinate work on the [Hassreden-Tracker][hassreden-tracker-pf] project. 4 | This means that for the immediate future the [issue tracker](https://github.com/travisbrown/hassreden-tracker/issues) 5 | will be the most interesting part of this repository, although eventually some code will be migrated here. 6 | 7 | ## Projects 8 | 9 | The project builds on several existing open source projects that I maintain (all of which rely on other open source projects): 10 | 11 | * [cancel-culture](https://github.com/travisbrown/cancel-culture): Tools for Twitter archiving, indexing, and block list management. 12 | * [wayback-rs](https://github.com/travisbrown/wayback-rs): Rust library for working with the [Wayback Machine](https://web.archive.org/). 13 | * [twitter-watch](https://github.com/travisbrown/twitter-watch): Reports about screen name changes and suspensions. 14 | * [twitter-tracker](https://github.com/travisbrown/twitter-tracker): Services that produce the twitter-watch reports (currently private). 
15 | * [evasion](https://github.com/travisbrown/evasion): Report tracking far-right ban evasion accounts. 16 | * [egg-mode-extras](https://github.com/travisbrown/egg-mode-extras): Rate-limit-aware asynchronous streams for working with the [Twitter API](https://developer.twitter.com/en/docs/twitter-api). 17 | * [orcrs](https://github.com/travisbrown/orcrs): [Apache ORC](https://orc.apache.org/) file reading library for Rust. 18 | * [hkvdb](https://github.com/travisbrown/hkvdb): A key-value store interface built on [RocksDB](https://rocksdb.org/). 19 | * [twpis](https://github.com/travisbrown/twpis): Twitter profile image collection. 20 | * [memory.lol](https://memory.lol): A web service providing historical Twitter account information (currently private). 21 | * [stop-the-steal](https://github.com/travisbrown/stop-the-steal): 9.7 million profile snapshots for Twitter users associated with the Stop the Steal movement. 22 | * [octocrabby](https://github.com/travisbrown/octocrabby): Block list management for [GitHub](https://github.com/) accounts. 23 | 24 | ## Selected citations 25 | 26 | ### [Washington Post](https://www.washingtonpost.com/technology/2022/04/19/libs-of-tiktok-right-wing-media/) (19 April 2022; [archived](https://archive.vn/pv6xK)) 27 | 28 | > On Saturday, software developer Travis Brown (who is working on a project with support from Prototype Fund, an organization that backs open-source projects) unearthed the account’s Twitter history and posted a thread detailing information about its profile changes. 29 | 30 | ### [Australian Financial Review](https://www.afr.com/politics/how-to-lose-an-election-scomo-style-20220817-p5bai6) (30 September 2022; [archived](https://archive.vn/rom1g)) 31 | 32 | > On head office advice, Deves deactivated her Twitter account and started studying the party’s policy platform. Both actions were pointless. Her entire social media history had already been captured by a Berlin online activist called Travis Brown. 
33 | 34 | ### [The Information](https://www.theinformation.com/articles/the-great-eth-erasure-a-crypto-fad-fades-on-twitter) (31 October 2022; [archived](https://archive.vn/ogLx2)) 35 | 36 | > The trend was short-lived. By October people began quietly removing the suffix from their Twitter names. At least 16,000 Twitter accounts got rid of it between February and October, according to an analysis conducted by programmer Travis Brown and provided to The Information. 37 | 38 | ### [Washington Post](https://www.washingtonpost.com/technology/2022/11/16/musk-twitter-email-ultimatum-termination/) (16 November 2022; [archived](https://archive.vn/Z7o6g)) 39 | 40 | > A large portion of the most-followed accounts that got “verified” via Twitter Blue, according to the data reviewed by The Post, are from a few specific subcommunities on Twitter: pornography, cryptocurrency advocates and overseas accounts, particularly from the Middle East. The data was compiled by Berlin software developer Travis Brown and reviewed and verified by The Post. 41 | 42 | ### [Hatewatch](https://www.splcenter.org/hatewatch/2022/11/16/twitter-blesses-extremists-paid-blue-checks) (16 November 2022; [archived](https://archive.vn/zi053)) 43 | 44 | > Brown has developed several tools for monitoring extremists online. He told Hatewatch in a telephone conversation that the latest version of the list shows accounts that have paid for blue checks ranked “by their centrality in far-right Twitter networks,” so that accounts with more connections with other far-right accounts receive a higher ranking in the list. 45 | 46 | ### [New York Times](https://www.nytimes.com/interactive/2022/11/23/technology/twitter-elon-musk-twitter-blue-check-verification.html) (23 November 2022; [archived](https://archive.vn/rpkBR)) 47 | 48 | > The plan had attracted about 140,000 users as of Nov. 15, according to data from Travis Brown, a software developer in Berlin who has studied extremism on Twitter. 
49 | 50 | ### [Business Insider](https://www.businessinsider.com/an-estimated-140000-people-paid-twitter-blue-5-days-report-2022-11) (24 November 2022; [archived](https://archive.vn/iStBE)) 51 | 52 | > Brown's data showed that thousands of subscribers were linked to around 5,000 far-right Twitter accounts that had been flagged for pushing extremist ideas and some were also listed by Cornell University for posting conspiracy theories about election fraud.  53 | 54 | ### [VICE](https://www.vice.com/en/article/3ade58/nyc-synagogue-shooting-threats-neo-nazi) (28 November 2022; [archived](https://archive.vn/xOlPp)) 55 | 56 | > Travis Brown was able to connect the accounts by using scraped Discord data that showed the @vrilgod account was connected to a network of overlapping usernames and posts, many of which identified new accounts as belonging to the owner of previously banned accounts. 57 | 58 | ### [NBC](https://www.nbcnews.com/tech/internet/elon-musks-twitter-beginning-take-shape-rcna58940) (2 December 2022; [archived](https://archive.vn/4jq73)) 59 | 60 | > Travis Brown, an independent software developer in Berlin who tracks Twitter suspensions and screen name changes as part of a project studying extremism, shared a dataset for this article that showed a wide variety of far-right accounts had been reinstated since Musk’s announcement. 61 | 62 | ### [CNN](https://edition.cnn.com/2022/12/08/tech/twitter-unbanned-users-returning/index.html) (8 December 2022; [archived](https://archive.vn/1Ckhx)) 63 | 64 | > A data set of many of the unbanned accounts compiled by researcher and software developer Travis Brown, who worked for Twitter for a year in 2014 and last year began a project tracking hate speech on the platform, shows dozens of users who have had their bans reversed are using QAnon-related phrases or hashtags in their account bios. 
65 | 66 | ### [MIT Technology Review](https://www.technologyreview.com/2022/12/15/1065013/twitter-brain-death/) (15 December 2022; [archived](https://archive.vn/bWULN)) 67 | 68 | > According to data compiled by researcher Travis Brown, others reinstated include Meninist, a “men’s rights” account with more than a million followers; Peter McCullough, a cardiologist who gained a large audience for advocating discredited covid-19 treatments and arguing against receiving the vaccine; and Tim Gionet, a far-right media personality who livestreamed his participation in the January 6 attack on the US Capitol. 69 | 70 | ### [Capital](https://www.capital.fr/entreprises-marches/twitter-sommes-nous-en-train-dassister-a-la-mort-cerebrale-du-reseau-social-1455090) (16 December 2022; [archived](https://archive.vn/W8Xap)) 71 | 72 | > D’après les données compilées par le chercheur Travis Brown, parmi les autres personnes réintégrées on retrouve Meninist, un compte défendant les "droits des hommes" qui compte plus d’un million d’abonnés… 73 | 74 | ### [Media Matters](https://www.mediamatters.org/twitter/elon-musk-favors-right-wing-over-left-wing-users-he-reinstates-previously-suspended-twitter) (16 December 2022; [archived](https://archive.vn/qWVCx)) 75 | 76 | > In a new study, Media Matters analyzed independent software developer Travis Brown’s data sets that were released between December 7 and 13 and include recently reinstated Twitter accounts. 77 | 78 | ### [New York Times](https://www.nytimes.com/2022/12/22/technology/musk-twitter-bans.html) (22 December 2022; [archived](https://archive.vn/CqoeW)) 79 | 80 | > The posts were collected for The Times by Bright Data, a social media tracking company, using a list of reinstated users identified by Travis Brown, a Berlin-based software developer who has tracked extremism on Twitter. 
81 | 82 | ### [Insider](https://www.insider.com/libs-of-tiktok-spotted-video-january-6-capitol-riots-2022-12) (28 December 2022; [archived](https://archive.vn/ty92q)) 83 | 84 | > They noted screenshots of a Twitter account shaya_ray, which software developer Travis Brown identified in April to be associated with Raichik, appeared to show her describing heading to DC that day. 85 | 86 | ### [Wall Street Journal](https://www.wsj.com/articles/elon-musks-first-100-days-at-twitter-defined-by-rapid-change-and-challenges-c33bafaf) (10 February 2023; [archived](https://archive.is/jDWcd)) 87 | 88 | > Travis Brown, a Berlin-based software developer, in late January estimated total subscribers to be between 275,000 and 325,000, based on computer programs that reviewed roughly 30 million accounts. 89 | 90 | ## Principles 91 | 92 | ### Technical 93 | 94 | Most code is written in the [Rust programming language](https://www.rust-lang.org/). I've chosen to build this software primarily in Rust for a couple of reasons: 95 | 96 | * The values of the Rust community tend to align with mine. 97 | * Rust's focus on performance is especially valuable for projects operated by organizations or individuals with limited resources. 98 | 99 | On the second point: almost all of the tools and services below can be run effectively on the smallest and cheapest Amazon Web Services EC2 instances, for example. 100 | 101 | ### Terms of service compliance 102 | 103 | We aim for all of the projects above to be compliant with the terms of service of any platform that they access. 104 | 105 | In most cases data is collected from open public archives and other public resources, 106 | such as [Archive Today][archive-today], the [Wayback Machine][wayback-machine], 107 | and the [Internet Archive][internet-archive]'s [Twitter Stream Grab][twitter-stream-grab]. 108 | 109 | Some of these projects do make limited use of platform APIs (e.g. the [Twitter API][twitter-api]). 
110 | This includes collecting, archiving, and publishing public follower relationships, platform IDs, 111 | content status (e.g. whether a tweet is deleted or not), and screen names. 112 | 113 | Specifically, none of the tools above currently store tweets accessed through the Twitter API. 114 | If at some point we support archiving or publishing tweets accessed through the Twitter API, 115 | we will respect the deletion requirements of the Twitter API terms of service. 116 | 117 | ### Licensing and distribution 118 | 119 | All code and data is made publicly available except in cases where this would undermine the core project goals or the privacy or safety of project members. 120 | 121 | Most of these projects are published under the [Mozilla Public License](https://www.mozilla.org/en-US/MPL/). 122 | Some projects that could be misused for commercial surveillance are published under the [Anti-Capitalist Software License](https://anticapitalist.software/). 123 | 124 | Rust libraries are published to [crates.io](https://crates.io/), a widely-used Rust package registry. 
125 | 126 | [archive-today]: https://archive.today/ 127 | [hassreden-tracker-pf]: https://prototypefund.de/project/hassreden-tracker/ 128 | [internet-archive]: https://archive.org/ 129 | [twitter-api]: https://developer.twitter.com/en/docs/twitter-api 130 | [twitter-stream-grab]: https://archive.org/details/twitterstream 131 | [wayback-machine]: https://web.archive.org/ -------------------------------------------------------------------------------- /projects/Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | 3 | members = [ 4 | "hst-cli", 5 | "hst-deactivations", 6 | "hst-tools", 7 | "hst-tw-db", 8 | "hst-tw-images", 9 | "hst-tw-profiles", 10 | "hst-tw-utils" 11 | ] 12 | -------------------------------------------------------------------------------- /projects/hst-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hst-cli" 3 | authors = ["Travis Brown "] 4 | homepage = "https://github.com/travisbrown/hassreden-tracker" 5 | repository = "https://github.com/travisbrown/hassreden-tracker" 6 | description = "Opinionated helpers for command-line applications" 7 | keywords = ["cli"] 8 | license-file = "../../LICENSE" 9 | readme = "../../README.md" 10 | version = "0.1.0" 11 | edition = "2021" 12 | 13 | [dependencies] 14 | clap = { version = "3", features = ["derive"] } 15 | log = "0.4" 16 | simplelog = "0.12" 17 | -------------------------------------------------------------------------------- /projects/hst-cli/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Opinionated helpers for building consistent command-line interfaces with [`clap`][clap] and [`simplelog`][simplelog]. 2 | //! 3 | //! ## Example 4 | //! 5 | //! The [`prelude`] module exports a minimal subset of these two crates. 6 | //! 7 | //! ```rust,no_run 8 | //! use hst_cli::prelude::*; 9 | //! 10 | //! 
#[derive(Debug, Parser)] 11 | //! #[clap(name = "demo", version, author)] 12 | //! struct Opts { 13 | //! #[clap(flatten)] 14 | //! verbose: Verbosity, 15 | //! } 16 | //! 17 | //! fn main() -> Result<(), log::SetLoggerError> { 18 | //! let opts: Opts = Opts::parse(); 19 | //! opts.verbose.init_logging()?; 20 | //! Ok(()) 21 | //! } 22 | //! ``` 23 | //! 24 | //! [clap]: https://docs.rs/clap/latest/clap/ 25 | //! [simplelog]: https://docs.rs/simplelog/latest/simplelog/ 26 | 27 | use simplelog::LevelFilter; 28 | 29 | fn select_log_level_filter(verbosity: i8) -> LevelFilter { 30 | match verbosity { 31 | 0 => LevelFilter::Off, 32 | 1 => LevelFilter::Error, 33 | 2 => LevelFilter::Warn, 34 | 3 => LevelFilter::Info, 35 | 4 => LevelFilter::Debug, 36 | _ => LevelFilter::Trace, 37 | } 38 | } 39 | 40 | #[derive(clap::Args, Debug, Clone)] 41 | pub struct Verbosity { 42 | /// Level of verbosity 43 | #[clap(long, short = 'v', parse(from_occurrences), global = true)] 44 | verbose: i8, 45 | } 46 | 47 | impl Verbosity { 48 | pub fn new(verbose: i8) -> Self { 49 | Self { verbose } 50 | } 51 | 52 | /// Initialize a default terminal logger with the indicated log level. 
53 | pub fn init_logging(&self) -> Result<(), log::SetLoggerError> { 54 | simplelog::TermLogger::init( 55 | select_log_level_filter(self.verbose), 56 | simplelog::Config::default(), 57 | simplelog::TerminalMode::Stderr, 58 | simplelog::ColorChoice::Auto, 59 | ) 60 | } 61 | } 62 | 63 | pub mod prelude { 64 | pub use super::Verbosity; 65 | pub use ::clap::Parser; 66 | pub mod clap { 67 | pub use clap::{ 68 | builder, AppSettings, Arg, ArgAction, ArgMatches, Args, Command, CommandFactory, Error, 69 | ErrorKind, FromArgMatches, Parser, Subcommand, 70 | }; 71 | } 72 | pub mod log { 73 | pub use log::{error, info, warn, SetLoggerError}; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /projects/hst-deactivations/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hst-deactivations" 3 | authors = ["Travis Brown "] 4 | homepage = "https://github.com/travisbrown/hassreden-tracker" 5 | repository = "https://github.com/travisbrown/hassreden-tracker" 6 | description = "Simple database for tracking social media deactivations" 7 | keywords = ["twitter"] 8 | license-file = "../../LICENSE" 9 | readme = "../../README.md" 10 | version = "0.1.0" 11 | edition = "2021" 12 | 13 | [dependencies] 14 | chrono = "0.4" 15 | thiserror = "1" 16 | -------------------------------------------------------------------------------- /projects/hst-deactivations/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Simple database for tracking user account deactivations. 2 | //! 3 | //! This library was originally designed to track Twitter account suspensions and 4 | //! self-deactivations, but should be general enough to work in other contexts. 5 | //! 6 | //! It makes a few assumptions: 7 | //! 8 | //! * Users have an integral identifier (e.g. the Twitter ID). 9 | //! * Deactivations have an integral status code (e.g. 
for Twitter, 50 for self-deactivation and 10 | //! 63 for suspension). 11 | //! * A deactivation has a time at which it was first observed and (optionally) another at which it 12 | //! was reversed. 13 | 14 | use chrono::{DateTime, TimeZone, Utc}; 15 | use std::collections::{HashMap, HashSet}; 16 | use std::io::{BufRead, BufReader, BufWriter, Read, Write}; 17 | use std::ops::Add; 18 | 19 | #[derive(thiserror::Error, Debug)] 20 | pub enum Error { 21 | #[error("I/O error")] 22 | Io(#[from] std::io::Error), 23 | #[error("Invalid user ID")] 24 | InvalidUserId(Option), 25 | #[error("Invalid timestamp")] 26 | InvalidTimestamp(Option), 27 | #[error("Invalid status code")] 28 | InvalidStatus(Option), 29 | } 30 | 31 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] 32 | pub struct Entry { 33 | pub status: u32, 34 | pub observed: DateTime, 35 | pub reversal: Option>, 36 | } 37 | 38 | #[derive(Clone, Debug, Eq, PartialEq)] 39 | pub struct DeactivationLog { 40 | entries: HashMap>, 41 | } 42 | 43 | impl DeactivationLog { 44 | pub fn lookup(&self, user_id: u64) -> Option> { 45 | self.entries.get(&user_id).cloned() 46 | } 47 | 48 | pub fn status(&self, user_id: u64) -> Option { 49 | self.entries.get(&user_id).and_then(|entries| { 50 | entries.iter().find_map(|entry| { 51 | if entry.reversal.is_none() { 52 | Some(entry.status) 53 | } else { 54 | None 55 | } 56 | }) 57 | }) 58 | } 59 | 60 | pub fn status_timestamp(&self, user_id: u64) -> Option> { 61 | self.entries.get(&user_id).and_then(|entries| { 62 | entries.iter().find_map(|entry| { 63 | if entry.reversal.is_none() { 64 | Some(entry.observed) 65 | } else { 66 | None 67 | } 68 | }) 69 | }) 70 | } 71 | 72 | pub fn deactivations(&self, status_filter: Option) -> Vec<(u64, Entry)> { 73 | let mut entries = self.entries.iter().collect::>(); 74 | entries.sort_by_key(|(user_id, _)| *user_id); 75 | 76 | entries 77 | .iter() 78 | .flat_map(|(user_id, entries)| { 79 | entries.iter().filter_map(|entry| { 80 | if status_filter 81 | 
.map(|status| entry.status == status) 82 | .unwrap_or(true) 83 | { 84 | Some((**user_id, *entry)) 85 | } else { 86 | None 87 | } 88 | }) 89 | }) 90 | .collect() 91 | } 92 | 93 | pub fn ever_deactivated(&self, status_filter: Option) -> HashSet { 94 | self.entries 95 | .iter() 96 | .filter_map(|(user_id, entries)| { 97 | if entries.iter().any(|entry| { 98 | status_filter 99 | .map(|status| entry.status == status) 100 | .unwrap_or(true) 101 | }) { 102 | Some(*user_id) 103 | } else { 104 | None 105 | } 106 | }) 107 | .collect() 108 | } 109 | 110 | pub fn current_deactivated(&self, status_filter: Option) -> HashSet { 111 | self.entries 112 | .iter() 113 | .filter_map(|(user_id, entries)| { 114 | if entries 115 | .last() 116 | .map(|entry| { 117 | entry.reversal.is_none() 118 | && status_filter 119 | .map(|status| entry.status == status) 120 | .unwrap_or(true) 121 | }) 122 | .unwrap_or(false) 123 | { 124 | Some(*user_id) 125 | } else { 126 | None 127 | } 128 | }) 129 | .collect() 130 | } 131 | 132 | pub fn update_with_reversals)>>( 133 | &mut self, 134 | reversals: I, 135 | ) -> Result<(), Vec<(u64, DateTime)>> { 136 | let mut invalid_pairs = vec![]; 137 | 138 | for (user_id, timestamp) in reversals { 139 | match self 140 | .entries 141 | .get_mut(&user_id) 142 | .and_then(|entries| entries.last_mut()) 143 | { 144 | Some(last) => { 145 | if last.reversal.is_none() { 146 | last.reversal = Some(timestamp); 147 | } else { 148 | invalid_pairs.push((user_id, timestamp)); 149 | } 150 | } 151 | None => { 152 | invalid_pairs.push((user_id, timestamp)); 153 | } 154 | } 155 | } 156 | 157 | if invalid_pairs.is_empty() { 158 | Ok(()) 159 | } else { 160 | Err(invalid_pairs) 161 | } 162 | } 163 | 164 | pub fn validate(&self) -> Result<(), Vec> { 165 | let mut invalid_user_ids = self 166 | .entries 167 | .iter() 168 | .filter_map(|(user_id, entries)| { 169 | if !entries.is_empty() && Self::validate_entries(entries) { 170 | None 171 | } else { 172 | Some(*user_id) 173 | } 174 | }) 175 | 
.collect::>(); 176 | 177 | invalid_user_ids.sort_unstable(); 178 | 179 | if invalid_user_ids.is_empty() { 180 | Ok(()) 181 | } else { 182 | Err(invalid_user_ids) 183 | } 184 | } 185 | 186 | fn validate_entries(entries: &[Entry]) -> bool { 187 | let valid_pairs = entries.windows(2).all(|pair| match pair[0].reversal { 188 | Some(reversal) => pair[0].observed < reversal && pair[0].observed < pair[1].observed, 189 | None => false, 190 | }); 191 | 192 | // We still have to checked whether the reversal (if there was one) for the final entry 193 | // happened after the observation. 194 | valid_pairs 195 | && match entries.last() { 196 | Some(entry) => match entry.reversal { 197 | Some(reversal) => entry.observed < reversal, 198 | None => true, 199 | }, 200 | None => true, 201 | } 202 | } 203 | 204 | pub fn read(reader: R) -> Result { 205 | let mut entries: HashMap> = HashMap::new(); 206 | 207 | for line in BufReader::new(reader).lines() { 208 | let line = line?; 209 | let fields = line.split(',').collect::>(); 210 | 211 | let user_id = fields 212 | .first() 213 | .and_then(|value| value.parse::().ok()) 214 | .ok_or_else(|| { 215 | Error::InvalidUserId(fields.first().map(|value| value.to_string())) 216 | })?; 217 | 218 | let status = fields 219 | .get(1) 220 | .and_then(|value| value.parse::().ok()) 221 | .ok_or_else(|| { 222 | Error::InvalidStatus(fields.get(1).map(|value| value.to_string())) 223 | })?; 224 | 225 | let observed = fields 226 | .get(2) 227 | .and_then(|value| value.parse::().ok()) 228 | .map(|value| Utc.timestamp(value, 0)) 229 | .ok_or_else(|| { 230 | Error::InvalidTimestamp(fields.get(2).map(|value| value.to_string())) 231 | })?; 232 | 233 | let reversal = fields 234 | .get(3) 235 | .and_then(|value| { 236 | if value.is_empty() { 237 | Some(None) 238 | } else { 239 | value 240 | .parse::() 241 | .ok() 242 | .map(|value| Some(Utc.timestamp(value, 0))) 243 | } 244 | }) 245 | .ok_or_else(|| { 246 | Error::InvalidTimestamp(fields.get(3).map(|value| 
value.to_string())) 247 | })?; 248 | 249 | let seen = entries.entry(user_id).or_default(); 250 | seen.push(Entry { 251 | status, 252 | observed, 253 | reversal, 254 | }); 255 | } 256 | 257 | Ok(Self { entries }) 258 | } 259 | 260 | pub fn write(&self, writer: W) -> Result<(), std::io::Error> { 261 | let mut entries = self.entries.iter().collect::>(); 262 | entries.sort_by_key(|(user_id, _)| *user_id); 263 | 264 | let mut writer = BufWriter::new(writer); 265 | 266 | for (user_id, entries) in entries { 267 | for entry in entries { 268 | writeln!( 269 | writer, 270 | "{},{},{},{}", 271 | user_id, 272 | entry.status, 273 | entry.observed.timestamp(), 274 | entry 275 | .reversal 276 | .map(|value| value.timestamp().to_string()) 277 | .unwrap_or_default() 278 | )?; 279 | } 280 | } 281 | 282 | Ok(()) 283 | } 284 | } 285 | 286 | impl Add for &DeactivationLog { 287 | type Output = DeactivationLog; 288 | 289 | fn add(self, other: Self) -> Self::Output { 290 | let mut new_entry_map = self.entries.clone(); 291 | 292 | for (user_id, entries) in &other.entries { 293 | let new_entries = new_entry_map.entry(*user_id).or_default(); 294 | new_entries.extend(entries.clone()); 295 | new_entries.sort_by_key(|entry| entry.observed); 296 | new_entries.dedup(); 297 | 298 | let len = new_entries.len(); 299 | if len >= 2 { 300 | let last1 = &new_entries[len - 2]; 301 | let last2 = &new_entries[len - 1]; 302 | if last1.status == last2.status 303 | && last1.reversal.is_none() 304 | && last2.reversal.is_none() 305 | { 306 | new_entries.pop(); 307 | } 308 | } 309 | } 310 | 311 | Self::Output { 312 | entries: new_entry_map, 313 | } 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /projects/hst-tools/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hst-tw-tools" 3 | authors = ["Travis Brown "] 4 | homepage = "https://github.com/travisbrown/hassreden-tracker" 5 | repository = 
"https://github.com/travisbrown/hassreden-tracker" 6 | description = "Hassreden-Tracker command-line tools" 7 | keywords = ["twitter"] 8 | license-file = "../../LICENSE" 9 | readme = "../../README.md" 10 | version = "0.1.0" 11 | edition = "2021" 12 | 13 | [dependencies] 14 | apache-avro = { version = "0.14", features = ["snappy"] } 15 | hst-cli = { path = "../hst-cli" } 16 | hst-tw-db = { path = "../hst-tw-db" } 17 | hst-tw-images = { path = "../hst-tw-images" } 18 | hst-tw-profiles = { path = "../hst-tw-profiles" } 19 | reqwest = { version = "0.11", features = ["gzip", "json"] } 20 | serde_json = { version = "1", features = ["preserve_order"] } 21 | thiserror = "1" 22 | tokio = { version = "1", features = ["macros", "rt-multi-thread"] } -------------------------------------------------------------------------------- /projects/hst-tools/src/bin/hst-tw-db.rs: -------------------------------------------------------------------------------- 1 | use hst_cli::prelude::*; 2 | use hst_tw_db::{ 3 | table::{ReadOnly, Table, Writeable}, 4 | ProfileDb, 5 | }; 6 | use hst_tw_profiles::model::User; 7 | use std::collections::HashSet; 8 | use std::fs::File; 9 | 10 | fn main() -> Result<(), Error> { 11 | let opts: Opts = Opts::parse(); 12 | opts.verbose.init_logging()?; 13 | 14 | match opts.command { 15 | Command::Import { input } => { 16 | let db = ProfileDb::::open(opts.db, false)?; 17 | 18 | let file = File::open(input)?; 19 | let reader = hst_tw_profiles::avro::reader(file)?; 20 | 21 | for value in reader { 22 | let user = apache_avro::from_value::(&value?)?; 23 | db.update(&user)?; 24 | } 25 | } 26 | Command::Lookup { id } => { 27 | let db = ProfileDb::::open(opts.db, true)?; 28 | let users = db.lookup(id)?; 29 | 30 | for (_, user) in users { 31 | println!("{}", serde_json::to_value(user)?); 32 | } 33 | } 34 | Command::Count => { 35 | let db = ProfileDb::::open(opts.db, true)?; 36 | let mut user_count = 0; 37 | let mut screen_name_count = 0; 38 | let mut verified = 0; 39 | 
let mut protected = 0; 40 | for result in db.iter() { 41 | let (_, users) = result?; 42 | let mut screen_names = HashSet::new(); 43 | 44 | user_count += 1; 45 | 46 | for (_, user) in &users { 47 | screen_names.insert(user.screen_name.clone()); 48 | } 49 | 50 | if let Some((_, user)) = users.last() { 51 | if user.verified { 52 | verified += 1; 53 | } 54 | if user.protected { 55 | protected += 1; 56 | } 57 | } 58 | 59 | screen_name_count += screen_names.len(); 60 | } 61 | 62 | println!("{} users, {} screen names", user_count, screen_name_count); 63 | println!("{} verified, {} protected", verified, protected); 64 | } 65 | Command::Stats => { 66 | let db = ProfileDb::::open(opts.db, true)?; 67 | if let Some(count) = db.get_estimated_key_count()? { 68 | println!("Estimated number of keys: {}", count); 69 | } 70 | println!("{:?}", db.statistics()); 71 | } 72 | } 73 | 74 | Ok(()) 75 | } 76 | 77 | #[derive(thiserror::Error, Debug)] 78 | pub enum Error { 79 | #[error("ProfileDb error")] 80 | ProfileDb(#[from] hst_tw_db::Error), 81 | #[error("Profile Avro error")] 82 | ProfileAvro(#[from] hst_tw_profiles::avro::Error), 83 | #[error("Avro decoding error")] 84 | Avro(#[from] apache_avro::Error), 85 | #[error("JSON encoding error")] 86 | Json(#[from] serde_json::Error), 87 | #[error("I/O error")] 88 | Io(#[from] std::io::Error), 89 | #[error("Log initialization error")] 90 | LogInitialization(#[from] log::SetLoggerError), 91 | } 92 | 93 | #[derive(Debug, Parser)] 94 | #[clap(name = "hst-tw-db", version, author)] 95 | struct Opts { 96 | #[clap(flatten)] 97 | verbose: Verbosity, 98 | /// Database directory path 99 | #[clap(long)] 100 | db: String, 101 | #[clap(subcommand)] 102 | command: Command, 103 | } 104 | 105 | #[derive(Debug, Parser)] 106 | enum Command { 107 | Import { 108 | /// Avro input path 109 | #[clap(short, long)] 110 | input: String, 111 | }, 112 | Lookup { 113 | /// Twitter user ID 114 | id: u64, 115 | }, 116 | Count, 117 | Stats, 118 | } 119 | 
-------------------------------------------------------------------------------- /projects/hst-tools/src/bin/hst-tw-images.rs: -------------------------------------------------------------------------------- 1 | use hst_cli::prelude::*; 2 | use hst_tw_images::{Image, Store}; 3 | use reqwest::Url; 4 | use std::fs::File; 5 | use std::io::Write; 6 | use std::path::Path; 7 | 8 | #[derive(thiserror::Error, Debug)] 9 | pub enum Error { 10 | #[error("Twitter image error")] 11 | TwitterImage(#[from] hst_tw_images::Error), 12 | #[error("Twitter image store error")] 13 | TwitterImageStore(#[from] hst_tw_images::store::Error), 14 | #[error("HTTP client error")] 15 | HttpClient(#[from] reqwest::Error), 16 | #[error("I/O error")] 17 | Io(#[from] std::io::Error), 18 | #[error("Log initialization error")] 19 | LogInitialization(#[from] log::SetLoggerError), 20 | } 21 | 22 | async fn download_image>( 23 | client: &reqwest::Client, 24 | image: &Image, 25 | output: P, 26 | ) -> Result<(Url, bool), Error> { 27 | let path = output.as_ref().join(image.path()); 28 | 29 | let url = image.url(); 30 | 31 | let response = client.get(url).send().await?; 32 | let response_url = response.url().clone(); 33 | let bytes = response.bytes().await?; 34 | 35 | if !bytes.is_empty() { 36 | if let Some(parent) = path.parent() { 37 | std::fs::create_dir_all(&parent)?; 38 | } 39 | 40 | let mut file = File::create(path)?; 41 | file.write_all(&bytes)?; 42 | 43 | Ok((response_url, false)) 44 | } else { 45 | Ok((response_url, true)) 46 | } 47 | } 48 | 49 | #[tokio::main] 50 | async fn main() -> Result<(), Error> { 51 | let opts: Opts = Opts::parse(); 52 | opts.verbose.init_logging()?; 53 | 54 | match opts.command { 55 | Command::StoreUrls { base } => { 56 | let store = Store::new(base); 57 | 58 | for entry in &store { 59 | let (image, _) = entry?; 60 | println!("{}", image); 61 | } 62 | } 63 | Command::Scrape => todo!(), 64 | } 65 | 66 | Ok(()) 67 | } 68 | 69 | #[derive(Parser)] 70 | #[clap(name = 
"hst-tw-images", about, version, author)] 71 | struct Opts { 72 | #[clap(flatten)] 73 | verbose: Verbosity, 74 | #[clap(subcommand)] 75 | command: Command, 76 | } 77 | 78 | #[derive(Debug, Parser)] 79 | enum Command { 80 | /// Download 81 | Scrape, 82 | /// Dump a list of URLs (arbitrarily ordered) from a store as text 83 | StoreUrls { base: String }, 84 | } 85 | -------------------------------------------------------------------------------- /projects/hst-tw-db/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hst-tw-db" 3 | authors = ["Travis Brown "] 4 | homepage = "https://github.com/travisbrown/hassreden-tracker" 5 | repository = "https://github.com/travisbrown/hassreden-tracker" 6 | description = "RocksDB database for storing Twitter user profiles" 7 | keywords = ["twitter"] 8 | license-file = "../../LICENSE" 9 | readme = "../../README.md" 10 | version = "0.1.0" 11 | edition = "2021" 12 | 13 | [dependencies] 14 | apache-avro = { version = "0.14", features = ["snappy"] } 15 | chrono = "0.4" 16 | rocksdb = { version = "0.19", default-features = false, features = ["zstd"] } 17 | thiserror = "1" 18 | hst-tw-profiles = { path = "../hst-tw-profiles" } 19 | -------------------------------------------------------------------------------- /projects/hst-tw-db/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! A RocksDB database for storing user profiles from the Twitter API. 
2 | 3 | use apache_avro::{from_avro_datum, from_value, to_avro_datum, to_value}; 4 | use chrono::{DateTime, TimeZone, Utc}; 5 | use hst_tw_profiles::{avro::USER_SCHEMA, model::User}; 6 | use rocksdb::{DBCompressionType, IteratorMode, Options, DB}; 7 | use std::io::Cursor; 8 | use std::iter::Peekable; 9 | use std::marker::PhantomData; 10 | use std::path::Path; 11 | use std::sync::Arc; 12 | 13 | pub mod table; 14 | 15 | #[derive(thiserror::Error, Debug)] 16 | pub enum Error { 17 | #[error("I/O error")] 18 | Io(#[from] std::io::Error), 19 | #[error("UTF-8 decoding error")] 20 | Utf8(#[from] std::str::Utf8Error), 21 | #[error("RocksDb error")] 22 | Db(#[from] rocksdb::Error), 23 | #[error("Avro decoding error")] 24 | Avro(#[from] apache_avro::Error), 25 | #[error("Invalid key bytes")] 26 | InvalidKeyBytes(Vec), 27 | #[error("Invalid timestamp bytes")] 28 | InvalidTimestampBytes(Vec), 29 | #[error("Invalid timestamp")] 30 | InvalidTimestamp(DateTime), 31 | } 32 | 33 | #[derive(Clone, Debug, Eq, PartialEq)] 34 | pub struct ProfileDbCounts { 35 | pub id_count: u64, 36 | pub pair_count: u64, 37 | } 38 | 39 | #[derive(Clone)] 40 | pub struct ProfileDb { 41 | db: Arc, 42 | options: Options, 43 | mode: PhantomData, 44 | } 45 | 46 | impl table::Table for ProfileDb { 47 | type Counts = ProfileDbCounts; 48 | 49 | fn underlying(&self) -> &DB { 50 | &self.db 51 | } 52 | 53 | fn get_counts(&self) -> Result { 54 | let mut pair_count = 0; 55 | let mut id_count = 0; 56 | let mut last_id = 0; 57 | 58 | let iter = self.db.iterator(IteratorMode::Start); 59 | 60 | for result in iter { 61 | let (key, _) = result?; 62 | pair_count += 1; 63 | let (id, _) = key_to_pair(&key)?; 64 | if id != last_id { 65 | id_count += 1; 66 | last_id = id; 67 | } 68 | } 69 | 70 | Ok(Self::Counts { 71 | id_count, 72 | pair_count, 73 | }) 74 | } 75 | } 76 | 77 | impl ProfileDb { 78 | pub fn statistics(&self) -> Option { 79 | self.options.get_statistics() 80 | } 81 | 82 | pub fn lookup(&self, target_user_id: u64) 
-> Result, User)>, Error> { 83 | let prefix = target_user_id.to_be_bytes(); 84 | let iter = self.db.prefix_iterator(prefix); 85 | let mut users = vec![]; 86 | 87 | for result in iter { 88 | let (key, value) = result?; 89 | let (user_id, snapshot) = key_to_pair(&key)?; 90 | 91 | if user_id == target_user_id { 92 | users.push((snapshot, parse_value(value)?)); 93 | } else { 94 | break; 95 | } 96 | } 97 | 98 | Ok(users) 99 | } 100 | 101 | pub fn iter( 102 | &self, 103 | ) -> impl Iterator, User)>), Error>> + '_ { 104 | ProfileIterator { 105 | underlying: self.raw_iter().peekable(), 106 | } 107 | } 108 | 109 | pub fn raw_iter(&self) -> impl Iterator, User), Error>> + '_ { 110 | self.db.iterator(IteratorMode::Start).map(|result| { 111 | result.map_err(Error::from).and_then(|(key, value)| { 112 | let (user_id, snapshot) = key_to_pair(&key)?; 113 | let user = parse_value(value)?; 114 | 115 | Ok((user_id, snapshot, user)) 116 | }) 117 | }) 118 | } 119 | } 120 | 121 | impl ProfileDb { 122 | pub fn open>(path: P, enable_statistics: bool) -> Result { 123 | let mut options = Options::default(); 124 | options.create_if_missing(true); 125 | options.set_compression_type(DBCompressionType::Zstd); 126 | 127 | if enable_statistics { 128 | options.enable_statistics(); 129 | } 130 | 131 | let db = if M::is_read_only() { 132 | DB::open_for_read_only(&options, path, true)? 133 | } else { 134 | DB::open(&options, path)? 135 | }; 136 | 137 | Ok(Self { 138 | db: Arc::new(db), 139 | options, 140 | mode: PhantomData, 141 | }) 142 | } 143 | } 144 | 145 | impl ProfileDb { 146 | pub fn update(&self, user: &User) -> Result<(), Error> { 147 | let key = pair_to_key(user.id(), Utc.timestamp(user.snapshot, 0))?; 148 | let avro_value = to_value(user)?; 149 | let bytes = to_avro_datum(&USER_SCHEMA, avro_value)?; 150 | Ok(self.db.put(key, bytes)?) 
151 | } 152 | } 153 | 154 | fn pair_to_key(user_id: u64, snapshot: DateTime) -> Result<[u8; 12], Error> { 155 | let mut key = [0; 12]; 156 | key[0..8].copy_from_slice(&user_id.to_be_bytes()); 157 | 158 | let snapshot_s: u32 = snapshot 159 | .timestamp() 160 | .try_into() 161 | .map_err(|_| Error::InvalidTimestamp(snapshot))?; 162 | key[8..12].copy_from_slice(&snapshot_s.to_be_bytes()); 163 | 164 | Ok(key) 165 | } 166 | 167 | fn key_to_pair(key: &[u8]) -> Result<(u64, DateTime), Error> { 168 | let user_id = u64::from_be_bytes( 169 | key[0..8] 170 | .try_into() 171 | .map_err(|_| Error::InvalidKeyBytes(key.to_vec()))?, 172 | ); 173 | let snapshot = u32::from_be_bytes( 174 | key[8..12] 175 | .try_into() 176 | .map_err(|_| Error::InvalidKeyBytes(key.to_vec()))?, 177 | ); 178 | 179 | Ok((user_id, Utc.timestamp(snapshot as i64, 0))) 180 | } 181 | 182 | fn parse_value>(value: T) -> Result { 183 | let mut cursor = Cursor::new(&value); 184 | let avro_value = from_avro_datum(&USER_SCHEMA, &mut cursor, None)?; 185 | Ok(from_value(&avro_value)?) 186 | } 187 | 188 | pub struct ProfileIterator { 189 | underlying: Peekable, 190 | } 191 | 192 | impl, User), Error>>> Iterator 193 | for ProfileIterator 194 | { 195 | type Item = Result<(u64, Vec<(DateTime, User)>), Error>; 196 | 197 | fn next(&mut self) -> Option { 198 | self.underlying.next().map(|result| { 199 | result.map(|(user_id, snapshot, user)| { 200 | let current_user_id = user_id; 201 | let mut users = vec![(snapshot, user)]; 202 | 203 | while let Some(result) = self.underlying.next_if(|result| { 204 | result 205 | .as_ref() 206 | .map(|(user_id, _, _)| *user_id == current_user_id) 207 | .unwrap_or(false) 208 | }) { 209 | // We've checked for errors just above, so this will always add a pair. 
210 | if let Ok((_, snapshot, user)) = result { 211 | users.push((snapshot, user)); 212 | } 213 | } 214 | 215 | (current_user_id, users) 216 | }) 217 | }) 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /projects/hst-tw-db/src/table.rs: -------------------------------------------------------------------------------- 1 | use super::Error; 2 | use rocksdb::DB; 3 | 4 | pub trait Mode { 5 | fn is_read_only() -> bool; 6 | } 7 | 8 | pub struct ReadOnly; 9 | pub struct Writeable; 10 | 11 | impl Mode for ReadOnly { 12 | fn is_read_only() -> bool { 13 | true 14 | } 15 | } 16 | impl Mode for Writeable { 17 | fn is_read_only() -> bool { 18 | false 19 | } 20 | } 21 | 22 | pub trait Table: Sized { 23 | type Counts; 24 | 25 | fn underlying(&self) -> &DB; 26 | fn get_counts(&self) -> Result; 27 | 28 | fn get_estimated_key_count(&self) -> Result, Error> { 29 | Ok(self 30 | .underlying() 31 | .property_int_value("rocksdb.estimate-num-keys")?) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /projects/hst-tw-images/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hst-tw-images" 3 | authors = ["Travis Brown "] 4 | homepage = "https://github.com/travisbrown/hassreden-tracker" 5 | repository = "https://github.com/travisbrown/hassreden-tracker" 6 | description = "Library for working with Twitter profile images" 7 | keywords = ["twitter"] 8 | license-file = "../../LICENSE" 9 | readme = "../../README.md" 10 | version = "0.1.0" 11 | edition = "2021" 12 | 13 | 14 | [dependencies] 15 | lazy_static = "1.4" 16 | regex = "1.4" 17 | reqwest = { version = "0.11", features = ["gzip", "json"] } 18 | thiserror = "1.0" 19 | tokio = { version = "1", features = ["macros", "rt-multi-thread"] } 20 | -------------------------------------------------------------------------------- /projects/hst-tw-images/src/error.rs: 
-------------------------------------------------------------------------------- 1 | #[derive(thiserror::Error, Debug)] 2 | pub enum Error { 3 | #[error("Parsing error")] 4 | Parse(#[from] super::model::ParseError), 5 | #[error("File store error")] 6 | Store(#[from] super::store::Error), 7 | #[error("HTTP client error")] 8 | Reqwest(#[from] reqwest::Error), 9 | #[error("I/O error")] 10 | Io(#[from] std::io::Error), 11 | } 12 | -------------------------------------------------------------------------------- /projects/hst-tw-images/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Library for working with Twitter profile images. 2 | 3 | pub mod error; 4 | pub mod model; 5 | pub mod store; 6 | 7 | pub use error::Error; 8 | pub use model::{Domain, Image, ImageKey, Size}; 9 | pub use store::Store; 10 | -------------------------------------------------------------------------------- /projects/hst-tw-images/src/model.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::fmt::Formatter; 3 | use std::path::Path; 4 | use std::str::FromStr; 5 | 6 | const DEFAULT_PATH: &str = "profile_images/"; 7 | 8 | #[derive(thiserror::Error, Debug)] 9 | pub enum ParseError { 10 | #[error("Invalid size")] 11 | InvalidSize(String), 12 | #[error("Invalid URL")] 13 | InvalidUrl(String), 14 | #[error("Invalid file name")] 15 | InvalidFileName(String), 16 | #[error("Invalid profile image ID")] 17 | InvalidId(String), 18 | #[error("Invalid path")] 19 | InvalidPath(Box), 20 | } 21 | 22 | #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 23 | pub enum Size { 24 | Mini, 25 | Normal, 26 | Bigger, 27 | Square200, 28 | Square400, 29 | } 30 | 31 | impl std::fmt::Display for Size { 32 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 33 | let value = match self { 34 | Self::Mini => "mini", 35 | Self::Normal => "normal", 36 | Self::Bigger => "bigger", 
37 | Self::Square200 => "200x200", 38 | Self::Square400 => "400x400", 39 | }; 40 | write!(f, "{}", value) 41 | } 42 | } 43 | 44 | impl FromStr for Size { 45 | type Err = ParseError; 46 | 47 | fn from_str(s: &str) -> Result { 48 | match s { 49 | "mini" => Ok(Self::Mini), 50 | "normal" => Ok(Self::Normal), 51 | "bigger" => Ok(Self::Bigger), 52 | "200x200" => Ok(Self::Square200), 53 | "400x400" => Ok(Self::Square400), 54 | _ => Err(Self::Err::InvalidSize(s.to_string())), 55 | } 56 | } 57 | } 58 | 59 | #[derive(Clone, Debug, Eq, Hash, PartialEq)] 60 | pub enum Domain { 61 | Pbs, 62 | Si0, 63 | Other(String), 64 | } 65 | 66 | impl std::fmt::Display for Domain { 67 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 68 | let value = match self { 69 | Self::Pbs => "pbs.twimg.com", 70 | Self::Si0 => "si0.twimg.com", 71 | Self::Other(value) => value, 72 | }; 73 | write!(f, "{}", value) 74 | } 75 | } 76 | 77 | impl FromStr for Domain { 78 | type Err = ParseError; 79 | 80 | fn from_str(s: &str) -> Result { 81 | match s { 82 | "pbs.twimg.com" => Ok(Self::Pbs), 83 | "si0.twimg.com" => Ok(Self::Si0), 84 | value => Ok(Self::Other(value.to_string())), 85 | } 86 | } 87 | } 88 | 89 | impl Default for Domain { 90 | fn default() -> Self { 91 | Self::Pbs 92 | } 93 | } 94 | 95 | #[derive(Clone, Debug, Eq, Hash, PartialEq)] 96 | pub struct ImageKey { 97 | domain: Domain, 98 | id: u64, 99 | name: String, 100 | extension: Option, 101 | } 102 | 103 | impl ImageKey { 104 | pub fn to_image(&self, size: Size) -> Image { 105 | Image { 106 | domain: self.domain.clone(), 107 | id: self.id, 108 | name: self.name.clone(), 109 | size, 110 | extension: self.extension.clone(), 111 | } 112 | } 113 | } 114 | 115 | #[derive(Clone, Debug, Eq, Hash, PartialEq)] 116 | pub struct Image { 117 | pub domain: Domain, 118 | pub id: u64, 119 | pub name: String, 120 | pub size: Size, 121 | pub extension: Option, 122 | } 123 | 124 | impl Image { 125 | pub fn key(&self) -> ImageKey { 126 | ImageKey { 127 | 
domain: self.domain.clone(), 128 | id: self.id, 129 | name: self.name.clone(), 130 | extension: self.extension.clone(), 131 | } 132 | } 133 | 134 | pub fn with_size(&self, size: Size) -> Self { 135 | Self { 136 | domain: self.domain.clone(), 137 | id: self.id, 138 | name: self.name.clone(), 139 | size, 140 | extension: self.extension.clone(), 141 | } 142 | } 143 | 144 | pub fn extension_string(&self) -> String { 145 | self.extension 146 | .as_ref() 147 | .map(|value| format!(".{}", value)) 148 | .unwrap_or_default() 149 | } 150 | 151 | pub fn url(&self) -> String { 152 | format!("{}", self) 153 | } 154 | 155 | pub fn id_prefix_url(&self) -> String { 156 | format!("https://{}/{}{}/", self.domain, DEFAULT_PATH, self.id) 157 | } 158 | 159 | /// Convert the ID to a string and split off the last four characters. 160 | /// 161 | /// This approach allows us to avoid directories containing millions of files, which can 162 | /// cause problems in some contexts. We use the ID because it's known to be numeric, so we 163 | /// don't have to worry about escaping characters, and we use the final digits for balance. 164 | fn path_dir_prefix(&self) -> (String, String) { 165 | let mut chars = self 166 | .id 167 | .to_string() 168 | .chars() 169 | .rev() 170 | .take(4) 171 | .collect::>(); 172 | 173 | // If the ID as a string has fewer than four characters, we pad it with zeroes. 174 | while chars.len() < 4 { 175 | chars.push('0'); 176 | } 177 | 178 | chars.reverse(); 179 | 180 | (chars[0..2].iter().collect(), chars[2..4].iter().collect()) 181 | } 182 | 183 | pub fn path(&self) -> String { 184 | let (prefix_a, prefix_b) = self.path_dir_prefix(); 185 | 186 | format!( 187 | "{}/{}/{}/{}-{}_{}{}", 188 | self.domain, 189 | prefix_a, 190 | prefix_b, 191 | self.id, 192 | self.name, 193 | self.size, 194 | self.extension_string() 195 | ) 196 | } 197 | 198 | fn parse_url_file_name(input: &str) -> Result<(String, Size, Option), ParseError> { 199 | lazy_static::lazy_static! 
{ 200 | static ref URL_FILE_NAME_RE: regex::Regex = regex::Regex::new( 201 | r"^(.*)_([^\.]+)(\.[a-zA-Z0-9-]+)?$" 202 | ) 203 | .unwrap(); 204 | } 205 | 206 | let ((name_match, size_match), extension_match) = URL_FILE_NAME_RE 207 | .captures(input) 208 | .and_then(|captures| { 209 | captures 210 | .get(1) 211 | .zip(captures.get(2)) 212 | .map(|value| (value, captures.get(3))) 213 | }) 214 | .ok_or_else(|| ParseError::InvalidFileName(input.to_string()))?; 215 | 216 | let name_source = name_match.as_str(); 217 | let size_source = size_match.as_str(); 218 | let extension = extension_match.map(|value| value.as_str()[1..].to_string()); 219 | 220 | let name = name_source.to_string(); 221 | let size = size_source.parse()?; 222 | 223 | Ok((name, size, extension)) 224 | } 225 | 226 | pub fn parse_file_name(input: &str) -> Option { 227 | lazy_static::lazy_static! { 228 | static ref FILE_NAME_RE: regex::Regex = regex::Regex::new( 229 | r"^(\d+)\-(.*)$" 230 | ) 231 | .unwrap(); 232 | } 233 | 234 | let (id_match, rest_match) = FILE_NAME_RE 235 | .captures(input) 236 | .and_then(|captures| captures.get(1).zip(captures.get(2)))?; 237 | 238 | let id = id_match.as_str().parse::().ok()?; 239 | let (name, size, extension) = Self::parse_url_file_name(rest_match.as_str()).ok()?; 240 | 241 | Some(Self { 242 | domain: Domain::default(), 243 | id, 244 | name, 245 | size, 246 | extension, 247 | }) 248 | } 249 | } 250 | 251 | impl std::fmt::Display for Image { 252 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 253 | write!( 254 | f, 255 | "https://{}/{}{}/{}_{}{}", 256 | self.domain, 257 | DEFAULT_PATH, 258 | self.id, 259 | self.name, 260 | self.size, 261 | self.extension_string() 262 | ) 263 | } 264 | } 265 | 266 | impl FromStr for Image { 267 | type Err = ParseError; 268 | 269 | fn from_str(s: &str) -> Result { 270 | lazy_static::lazy_static! 
{ 271 | static ref URL_RE: regex::Regex = regex::Regex::new( 272 | r"^https?://([^/]+)/profile_images/(\d+)/(.*)$" 273 | ) 274 | .unwrap(); 275 | } 276 | 277 | let ((domain_match, id_match), rest_match) = URL_RE 278 | .captures(s) 279 | .and_then(|captures| captures.get(1).zip(captures.get(2)).zip(captures.get(3))) 280 | .ok_or_else(|| Self::Err::InvalidUrl(s.to_string()))?; 281 | 282 | let domain_source = domain_match.as_str(); 283 | let domain = domain_source.parse()?; 284 | 285 | let id = id_match 286 | .as_str() 287 | .parse::() 288 | .map_err(|_| ParseError::InvalidId(id_match.as_str().to_string()))?; 289 | let (name, size, extension) = Self::parse_url_file_name(rest_match.as_str())?; 290 | 291 | Ok(Self { 292 | domain, 293 | id, 294 | name, 295 | size, 296 | extension, 297 | }) 298 | } 299 | } 300 | 301 | impl TryFrom<&Path> for Image { 302 | type Error = ParseError; 303 | 304 | fn try_from(value: &Path) -> Result { 305 | let mut image = path_to_str(value) 306 | .and_then(Self::parse_file_name) 307 | .ok_or_else(|| Self::Error::InvalidPath(value.into()))?; 308 | 309 | let (domain, (prefix_a, prefix_b)) = get_parents_3(value) 310 | .filter(|(_, prefix_a_dir, prefix_b_dir)| { 311 | super::store::is_valid_prefix_dir(prefix_a_dir) 312 | && super::store::is_valid_prefix_dir(prefix_b_dir) 313 | }) 314 | .and_then(|(domain_dir, prefix_a_dir, prefix_b_dir)| { 315 | path_to_str(domain_dir) 316 | .zip(path_to_str(prefix_a_dir).zip(path_to_str(prefix_b_dir))) 317 | }) 318 | .ok_or_else(|| Self::Error::InvalidPath(value.into()))?; 319 | 320 | let id_str = image.id.to_string(); 321 | let id_len = id_str.len(); 322 | 323 | if id_len >= 4 324 | && prefix_a == &id_str[id_len - 4..id_len - 2] 325 | && prefix_b == &id_str[id_len - 2..] 
326 | { 327 | image.domain = domain.parse()?; 328 | 329 | Ok(image) 330 | } else { 331 | Err(Self::Error::InvalidPath(value.into())) 332 | } 333 | } 334 | } 335 | 336 | fn path_to_str(path: &Path) -> Option<&str> { 337 | path.file_name().and_then(|value| value.to_str()) 338 | } 339 | 340 | fn get_parents_3(path: &Path) -> Option<(&Path, &Path, &Path)> { 341 | let parent_3 = path.parent()?; 342 | let parent_2 = parent_3.parent()?; 343 | let parent_1 = parent_2.parent()?; 344 | 345 | Some((parent_1, parent_2, parent_3)) 346 | } 347 | -------------------------------------------------------------------------------- /projects/hst-tw-images/src/store.rs: -------------------------------------------------------------------------------- 1 | use super::{model::ParseError, Image, ImageKey}; 2 | use std::marker::PhantomData; 3 | use std::path::{Path, PathBuf}; 4 | 5 | const DOMAIN_DIR_SIZE: usize = 1; 6 | const PREFIX_DIR_SIZE: usize = 100; 7 | const FILE_DIR_SIZE: usize = 1000; 8 | 9 | #[derive(thiserror::Error, Debug)] 10 | pub enum Error { 11 | #[error("Parsing error")] 12 | Parse(#[from] ParseError), 13 | #[error("Invalid directory")] 14 | InvalidDirectory(Box), 15 | #[error("Invalid file")] 16 | InvalidFile(Box), 17 | #[error("I/O error")] 18 | Io(#[from] std::io::Error), 19 | } 20 | 21 | pub(crate) fn is_valid_prefix_dir>(path: P) -> bool { 22 | lazy_static::lazy_static! 
{ 23 | static ref PREFIX_DIR_RE: regex::Regex = regex::Regex::new(r"^\d\d$").unwrap(); 24 | } 25 | 26 | path.as_ref() 27 | .file_name() 28 | .and_then(|value| value.to_str()) 29 | .map(|value| PREFIX_DIR_RE.is_match(value)) 30 | .unwrap_or(false) 31 | } 32 | 33 | pub struct Store { 34 | base: PathBuf, 35 | } 36 | 37 | impl Store { 38 | pub fn new>(base: P) -> Self { 39 | Self { 40 | base: base.as_ref().to_path_buf(), 41 | } 42 | } 43 | 44 | pub fn keys(&self) -> StoreIterator { 45 | StoreIterator::new(self.base.as_path()) 46 | } 47 | 48 | pub fn path>(&self, path: P) -> PathBuf { 49 | self.base.join(path) 50 | } 51 | } 52 | 53 | impl IntoIterator for &Store { 54 | type Item = Result<(Image, PathBuf), Error>; 55 | type IntoIter = StoreIterator<(Image, PathBuf), ImagePathExtractor>; 56 | fn into_iter(self) -> Self::IntoIter { 57 | StoreIterator::new(self.base.as_path()) 58 | } 59 | } 60 | 61 | pub struct StoreIterator { 62 | base: Option, 63 | domain_dirs: Vec, 64 | prefix_a_dirs: Vec, 65 | prefix_b_dirs: Vec, 66 | files: Vec, 67 | _f: PhantomData, 68 | } 69 | 70 | impl StoreIterator { 71 | fn new(base: &Path) -> Self { 72 | Self { 73 | base: Some(base.to_path_buf()), 74 | domain_dirs: Vec::with_capacity(DOMAIN_DIR_SIZE), 75 | prefix_a_dirs: Vec::with_capacity(PREFIX_DIR_SIZE), 76 | prefix_b_dirs: Vec::with_capacity(PREFIX_DIR_SIZE), 77 | files: Vec::with_capacity(FILE_DIR_SIZE), 78 | _f: PhantomData, 79 | } 80 | } 81 | } 82 | 83 | impl> Iterator for StoreIterator { 84 | type Item = Result; 85 | 86 | fn next(&mut self) -> Option { 87 | if let Some(next_file_value) = self.files.pop() { 88 | Some(Ok(next_file_value)) 89 | } else if let Some(next_prefix_b_dir) = self.prefix_b_dirs.pop() { 90 | if is_valid_prefix_dir(&next_prefix_b_dir) { 91 | read_paths_with::(&next_prefix_b_dir, &mut self.files) 92 | .map_or_else(|error| Some(Err(error)), |_| self.next()) 93 | } else { 94 | Some(Err(Error::InvalidDirectory(next_prefix_b_dir.into()))) 95 | } 96 | } else if let 
Some(next_prefix_a_dir) = self.prefix_a_dirs.pop() { 97 | if is_valid_prefix_dir(&next_prefix_a_dir) { 98 | read_paths(&next_prefix_a_dir, &mut self.prefix_b_dirs) 99 | .map_or_else(|error| Some(Err(error)), |_| self.next()) 100 | } else { 101 | Some(Err(Error::InvalidDirectory(next_prefix_a_dir.into()))) 102 | } 103 | } else if let Some(next_domain_dir) = self.domain_dirs.pop() { 104 | read_paths(&next_domain_dir, &mut self.prefix_a_dirs) 105 | .map_or_else(|error| Some(Err(error)), |_| self.next()) 106 | } else if let Some(base_dir) = self.base.take() { 107 | read_paths(&base_dir, &mut self.domain_dirs) 108 | .map_or_else(|error| Some(Err(error)), |_| self.next()) 109 | } else { 110 | None 111 | } 112 | } 113 | } 114 | 115 | pub trait FileExtractor { 116 | type Output; 117 | 118 | fn apply(path: PathBuf) -> Result; 119 | } 120 | 121 | pub struct ImagePathExtractor; 122 | 123 | impl FileExtractor for ImagePathExtractor { 124 | type Output = (Image, PathBuf); 125 | 126 | fn apply(path: PathBuf) -> Result { 127 | Image::try_from(path.as_path()) 128 | .map(|image| (image, path)) 129 | .map_err(Error::from) 130 | } 131 | } 132 | 133 | pub struct ImageKeyExtractor; 134 | 135 | impl FileExtractor for ImageKeyExtractor { 136 | type Output = ImageKey; 137 | 138 | fn apply(path: PathBuf) -> Result { 139 | Image::try_from(path.as_path()) 140 | .map(|image| image.key()) 141 | .map_err(Error::from) 142 | } 143 | } 144 | 145 | fn read_paths(dir: &Path, result: &mut Vec) -> Result<(), Error> { 146 | for entry in std::fs::read_dir(dir)? { 147 | let entry = entry?; 148 | 149 | result.push(entry.path()); 150 | } 151 | 152 | result.sort(); 153 | 154 | Ok(()) 155 | } 156 | 157 | fn read_paths_with(dir: &Path, result: &mut Vec) -> Result<(), Error> { 158 | for entry in std::fs::read_dir(dir)? 
{ 159 | let entry = entry?; 160 | 161 | result.push(F::apply(entry.path())?); 162 | } 163 | 164 | Ok(()) 165 | } 166 | -------------------------------------------------------------------------------- /projects/hst-tw-profiles/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hst-tw-profiles" 3 | authors = ["Travis Brown "] 4 | homepage = "https://github.com/travisbrown/hassreden-tracker" 5 | repository = "https://github.com/travisbrown/hassreden-tracker" 6 | description = "Library for working with Twitter profiles" 7 | keywords = ["twitter"] 8 | license-file = "../../LICENSE" 9 | readme = "../../README.md" 10 | version = "0.1.0" 11 | edition = "2021" 12 | 13 | [dependencies] 14 | apache-avro = { version = "0.14", features = ["snappy"] } 15 | bzip2 = "0.4" 16 | chrono = "0.4" 17 | hst-tw-utils = { path = "../hst-tw-utils", version = "0.1.0" } 18 | lazy_static = "1.4" 19 | serde = { version = "1", features = ["derive"] } 20 | serde_derive = "1" 21 | serde_json = { version = "1", features = ["preserve_order"] } 22 | tar = "0.4" 23 | thiserror = "1" 24 | zip = { version = "0.6", default-features = false, features = ["bzip2", "deflate"] } 25 | -------------------------------------------------------------------------------- /projects/hst-tw-profiles/schemas/avro/user.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lol.memory.model.user", 3 | "type": "record", 4 | "fields": [ 5 | { "name": "id", "type": "long" }, 6 | { "name": "id_str", "type": "string" }, 7 | { "name": "name", "type": "string" }, 8 | { "name": "screen_name", "type": "string" }, 9 | { "name": "location", "type": ["null", "string"] }, 10 | { "name": "description", "type": ["null", "string"] }, 11 | { "name": "url", "type": ["null", "string"] }, 12 | { 13 | "name": "entities", 14 | "type": [ 15 | "null", 16 | { 17 | "name": "lol.memory.model.entities", 18 | "type": "record", 19 | 
"fields": [ 20 | { 21 | "name": "url", 22 | "type": [ 23 | "null", 24 | { 25 | "name": "lol.memory.model.entity", 26 | "type": "record", 27 | "fields": [ 28 | { 29 | "name": "urls", 30 | "type": { 31 | "type": "array", 32 | "items": { 33 | "name": "lol.memory.model.url", 34 | "type": "record", 35 | "fields": [ 36 | { "name": "url", "type": "string" }, 37 | { "name": "expanded_url", "type": ["null", "string"] }, 38 | { "name": "display_url", "type": ["null", "string"] }, 39 | { "name": "indices", "type": { "type": "array", "items": "long" } } 40 | ] 41 | } 42 | } 43 | } 44 | ] 45 | } 46 | ] 47 | }, 48 | { "name": "description", "type": ["null", "lol.memory.model.entity"] } 49 | ] 50 | } 51 | ] 52 | }, 53 | { "name": "protected", "type": "boolean" }, 54 | { "name": "followers_count", "type": "long" }, 55 | { "name": "friends_count", "type": "long" }, 56 | { "name": "listed_count", "type": "long" }, 57 | { "name": "created_at", "type": "string" }, 58 | { "name": "favourites_count", "type": "long" }, 59 | { "name": "utc_offset", "type": ["null", "int"] }, 60 | { "name": "time_zone", "type": ["null", "string"] }, 61 | { "name": "geo_enabled", "type": ["null", "boolean"] }, 62 | { "name": "verified", "type": "boolean" }, 63 | { "name": "statuses_count", "type": "long" }, 64 | { "name": "lang", "type": ["null", "string"] }, 65 | { "name": "profile_background_color", "type": ["null", "string"] }, 66 | { "name": "profile_background_image_url_https", "type": ["null", "string"] }, 67 | { "name": "profile_background_tile", "type": ["null", "boolean"] }, 68 | { "name": "profile_image_url_https", "type": "string" }, 69 | { "name": "profile_banner_url", "type": ["null", "string"] }, 70 | { "name": "profile_link_color", "type": ["null", "string"] }, 71 | { "name": "profile_sidebar_border_color", "type": ["null", "string"] }, 72 | { "name": "profile_sidebar_fill_color", "type": ["null", "string"] }, 73 | { "name": "profile_text_color", "type": ["null", "string"] }, 74 | { "name": 
"profile_use_background_image", "type": ["null", "boolean"] },
      { "name": "has_extended_profile", "type": ["null", "boolean"] },
      { "name": "default_profile", "type": "boolean" },
      { "name": "default_profile_image", "type": "boolean" },
      { "name": "withheld_scope", "type": ["null", "string"] },
      { "name": "withheld_in_countries", "type": { "type": "array", "items": "string" } },
      { "name": "snapshot", "type": "long" }
  ]
}
--------------------------------------------------------------------------------
/projects/hst-tw-profiles/src/archive.rs:
--------------------------------------------------------------------------------
use crate::stream::UserInfo;
use bzip2::read::MultiBzDecoder;
use std::ffi::OsStr;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use tar::Archive;
use zip::ZipArchive;

/// Errors produced while extracting profile data from archives.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("Profile stream error")]
    ProfileStream(#[from] crate::stream::Error),
    #[error("I/O error")]
    Io(#[from] std::io::Error),
    #[error("JSON error")]
    Json(#[from] serde_json::Error),
    #[error("ZIP error")]
    Zip(#[from] zip::result::ZipError),
    #[error("Other error")]
    Other(String),
}

/// Extract user information from every `.bz2` member of a tar archive.
///
/// Each bzip2-compressed entry is read as newline-delimited JSON; for each
/// line the callback `f` receives the result of parsing the line and running
/// `crate::stream::extract_user_info` on it. Returning `Err` from `f` aborts
/// the whole traversal.
///
/// NOTE(review): the generic parameter bounds in this file were reconstructed
/// from call sites after angle-bracketed text was stripped in transit —
/// confirm against upstream history.
pub fn extract_tar<
    P: AsRef<Path>,
    F: FnMut(Result<Option<UserInfo>, Error>) -> Result<(), Error>,
>(
    path: P,
    mut f: F,
) -> Result<(), Error> {
    let bz2_ext = OsStr::new("bz2");

    let file = File::open(path)?;
    let mut archive = Archive::new(file);

    for entry_res in archive.entries()? {
        let entry = entry_res?;
        let path = entry.path()?;

        if path.extension() == Some(bz2_ext) {
            let reader = BufReader::new(MultiBzDecoder::new(entry));
            for line in reader.lines() {
                let result = line
                    .map_err(Error::from)
                    .and_then(|line| serde_json::from_str(&line).map_err(Error::from))
                    .and_then(|value| {
                        // `true`: fall back to `created_at` when `timestamp_ms` is
                        // missing (older API responses don't carry `timestamp_ms`).
                        crate::stream::extract_user_info(&value, true).map_err(Error::from)
                    });
                f(result)?
            }
        }
    }

    Ok(())
}

/// Extract user information from every `bz2` member of a ZIP archive.
///
/// Same per-line protocol as [`extract_tar`]; membership is decided by the
/// file name's `bz2` suffix rather than by a parsed extension.
pub fn extract_zip<
    P: AsRef<Path>,
    F: FnMut(Result<Option<UserInfo>, Error>) -> Result<(), Error>,
>(
    path: P,
    mut f: F,
) -> Result<(), Error> {
    let file = File::open(path)?;
    let mut archive = ZipArchive::new(file)?;

    for i in 0..archive.len() {
        let file = archive.by_index(i)?;
        let file_name = file.name();
        if file_name.ends_with("bz2") {
            let reader = BufReader::new(MultiBzDecoder::new(file));
            for line in reader.lines() {
                let result = line
                    .map_err(Error::from)
                    .and_then(|line| serde_json::from_str(&line).map_err(Error::from))
                    .and_then(|value| {
                        crate::stream::extract_user_info(&value, true).map_err(Error::from)
                    });
                f(result)?
            }
        }
    }

    Ok(())
}
--------------------------------------------------------------------------------
/projects/hst-tw-profiles/src/avro.rs:
--------------------------------------------------------------------------------
use super::model::User;
use apache_avro::{schema::Schema, Codec, Reader, Writer};
use std::cmp::Ordering;
use std::collections::HashMap;
use std::io::{Read, Write};

/// Create an Avro writer for [`User`] records, compressed with Snappy.
pub fn writer<W: Write>(writer: W) -> Writer<'static, W> {
    Writer::with_codec(&USER_SCHEMA, writer, Codec::Snappy)
}

/// Create an Avro reader for [`User`] records.
pub fn reader<R: Read>(reader: R) -> Result<Reader<'static, R>, Error> {
    Ok(Reader::with_schema(&USER_SCHEMA, reader)?)
}

/// Check that records are sorted by snapshot and then by user ID, with no
/// duplicate `(snapshot, user ID)` pairs, and return the record count.
///
/// NOTE(review): the generic parameters and the `usize` count type here were
/// reconstructed after angle-bracketed text was stripped in transit.
pub fn validate<R: Read>(reader: Reader<'static, R>) -> Result<usize, ValidationError> {
    let mut count = 0;
    let mut last_snapshot = 0;
    let mut last_user_id = 0;
    let mut misordered_line_numbers = vec![];
    let mut duplicate_line_numbers = vec![];

    for (line_number, value) in reader.enumerate() {
        let user = apache_avro::from_value::<User>(&value?)?;

        match user.snapshot.cmp(&last_snapshot) {
            Ordering::Greater => {}
            Ordering::Less => {
                misordered_line_numbers.push(line_number);
            }
            // Same snapshot: fall back to ordering by user ID.
            Ordering::Equal => match user.id.cmp(&last_user_id) {
                Ordering::Greater => {}
                Ordering::Less => {
                    misordered_line_numbers.push(line_number);
                }
                Ordering::Equal => {
                    duplicate_line_numbers.push(line_number);
                }
            },
        }

        last_snapshot = user.snapshot;
        last_user_id = user.id;
        count += 1;
    }

    if misordered_line_numbers.is_empty() && duplicate_line_numbers.is_empty() {
        Ok(count)
    } else {
        Err(ValidationError::InvalidContents {
            misordered_line_numbers,
            duplicate_line_numbers,
        })
    }
}

/// Count how many records exist for each `(user ID, screen name)` pair.
pub fn count_users<R: Read>(
    reader: Reader<'static, R>,
) -> Result<HashMap<(u64, String), u64>, Error> {
    let mut counts = HashMap::new();

    for value in reader {
        let user = apache_avro::from_value::<User>(&value?)?;

        let count = counts.entry((user.id(), user.screen_name)).or_default();
        *count += 1;
    }

    Ok(counts)
}

#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("I/O error")]
    Io(#[from] std::io::Error),
    #[error("Avro error")]
    Avro(#[from] apache_avro::Error),
}

#[derive(thiserror::Error, Debug)]
pub enum ValidationError {
    #[error("Avro error")]
    Avro(#[from] apache_avro::Error),
    #[error("Unsorted lines")]
    InvalidContents {
        misordered_line_numbers: Vec<usize>,
        duplicate_line_numbers: Vec<usize>,
    },
}

lazy_static::lazy_static! {
    /// The Avro schema for user records, loaded from `schemas/avro/user.avsc`.
    pub static ref USER_SCHEMA: Schema = load_user_avro_schema().unwrap();
}

fn load_user_avro_schema() -> Result<Schema, apache_avro::Error> {
    let source = std::include_str!("../schemas/avro/user.avsc");

    Ok(Schema::parse_str(source)?)
}
--------------------------------------------------------------------------------
/projects/hst-tw-profiles/src/lib.rs:
--------------------------------------------------------------------------------
//! Library for working with user profiles from the Twitter API.

pub mod archive;
pub mod avro;
pub mod model;
pub mod stream;
--------------------------------------------------------------------------------
/projects/hst-tw-profiles/src/model.rs:
--------------------------------------------------------------------------------
use chrono::{DateTime, Utc};

/// A URL entity from a Twitter API user object.
#[derive(Debug, Default, Eq, PartialEq, Clone, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct Url {
    pub url: String,
    pub expanded_url: Option<String>,
    //#[serde(skip_serializing_if = "Option::is_none")]
    pub display_url: Option<String>,
    pub indices: Vec<i64>,
}

/// A collection of URL entities.
#[derive(Debug, Default, Eq, PartialEq, Clone, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct Entity {
    pub urls: Vec<Url>,
}

/// Entity containers for a user's profile URL and description.
#[derive(Debug, Default, Eq, PartialEq, Clone, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct Entities {
    //#[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<Entity>,
    pub description: Option<Entity>,
}

/// A Twitter user profile snapshot (mirrors `schemas/avro/user.avsc`).
///
/// NOTE(review): the field types in this file were reconstructed from the
/// Avro schema after angle-bracketed text was stripped in transit — confirm
/// against upstream history.
#[derive(Debug, Default, Eq, PartialEq, Clone, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct User {
    pub id: i64,
    pub id_str: String,
    pub name: String,
    pub screen_name: String,
    pub location: Option<String>,
    pub description: Option<String>,
    pub url: Option<String>,
    pub entities: Option<Entities>,
    pub protected: bool,
    pub followers_count: i64,
    pub friends_count: i64,
    pub listed_count: i64,
    pub created_at: String,
    pub favourites_count: i64,
    pub utc_offset: Option<i32>,
    pub time_zone: Option<String>,
    pub geo_enabled: Option<bool>,
    pub verified: bool,
    pub statuses_count: i64,
    pub lang: Option<String>,
    pub profile_background_color: Option<String>,
    pub profile_background_image_url_https: Option<String>,
    pub profile_background_tile: Option<bool>,
    pub profile_image_url_https: String,
    //#[serde(skip_serializing_if = "Option::is_none")]
    pub profile_banner_url: Option<String>,
    pub profile_link_color: Option<String>,
    pub profile_sidebar_border_color: Option<String>,
    pub profile_sidebar_fill_color: Option<String>,
    pub profile_text_color: Option<String>,
    pub profile_use_background_image: Option<bool>,
    //#[serde(skip_serializing_if = "Option::is_none")]
    pub has_extended_profile: Option<bool>,
    pub default_profile: bool,
    pub default_profile_image: bool,
    //#[serde(skip_serializing_if = "Option::is_none")]
    pub withheld_scope: Option<String>,
    pub withheld_in_countries: Vec<String>,
    // Set from the stream's snapshot timestamp (epoch seconds) before
    // deserialization; see `stream::get_user`.
    pub snapshot: i64,
}

impl User {
    /// The user ID reinterpreted as an unsigned value.
    pub fn id(&self) -> u64 {
        self.id as u64
    }

    /// Parse the profile's `created_at` timestamp.
    pub fn created_at(&self) -> Result<DateTime<Utc>, chrono::ParseError> {
        hst_tw_utils::parse_date_time(&self.created_at)
    }

    /// The expanded form of the first profile URL, if any.
    pub fn expanded_url(&self) -> Option<&str> {
        let entities = self.entities.as_ref()?;
        let entity = entities.url.as_ref()?;
        let first_url = entity.urls.first()?;
        first_url.expanded_url.as_deref()
    }

    /// All expanded URLs appearing in the profile description.
    pub fn description_urls(&self) -> Vec<&str> {
        self.entities
            .as_ref()
            .and_then(|entity| entity.description.as_ref())
            .map(|description| {
                description
                    .urls
                    .iter()
                    .filter_map(|url| url.expanded_url.as_deref())
                    .collect()
            })
            .unwrap_or_default()
    }
}
--------------------------------------------------------------------------------
/projects/hst-tw-profiles/src/stream/mod.rs:
--------------------------------------------------------------------------------
use super::model::User;
use chrono::{DateTime, TimeZone, Utc};
use serde_json::Value;
use std::collections::{HashMap, HashSet};

/// Field injected into each user JSON object to record the snapshot time
/// (epoch seconds) before deserializing into [`User`].
const TIMESTAMP_FIELD_NAME: &str = "snapshot";

#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("I/O error")]
    Io(#[from] std::io::Error),
    #[error("Missing snapshot timestamp")]
    MissingTimestamp(Value),
    #[error("Missing user")]
    MissingUser(Value),
    #[error("Invalid user object")]
    InvalidUser(serde_json::error::Error),
}

/// A user observed only indirectly (via a mention or a reply target), for
/// whom only an ID, a screen name, and possibly a display name are known.
///
/// NOTE(review): `Option<String>` and the other generic arguments in this
/// file were reconstructed after angle-bracketed text was stripped in
/// transit — confirm against upstream history.
#[derive(Debug, Default, Eq, PartialEq, Clone, serde::Deserialize, serde::Serialize)]
pub struct PartialUser {
    pub id: u64,
    pub screen_name: String,
    pub name: Option<String>,
}

impl PartialUser {
    pub fn new(id: u64, screen_name: String, name: Option<String>) -> Self {
        Self {
            id,
            screen_name,
            name,
        }
    }
}

/// Everything extracted from one status object: the snapshot time, all fully
/// materialized user objects, and any partially known users.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UserInfo {
    pub snapshot: DateTime<Utc>,
    pub users: Vec<User>,
    pub partial_users: Vec<PartialUser>,
}

/// Extract user information from a single status JSON value.
///
/// Returns `Ok(None)` for deletion events (objects carrying a `delete`
/// field). Full user objects are taken from the status itself and from any
/// retweeted or quoted status; mentioned and replied-to users not already
/// seen in full are reported as [`PartialUser`]s.
///
/// If `created_at_fallback` is set, the status's `created_at` field is used
/// as the snapshot time when `timestamp_ms` is absent.
pub fn extract_user_info(
    value: &Value,
    created_at_fallback: bool,
) -> Result<Option<UserInfo>, Error> {
    if value.get("delete").is_none() {
        // We try to determine the snapshot timestamp by checking for a `timestamp_ms` field,
        // and then (if specified) by parsing `created_at` (since `timestamp_ms` isn't available
        // for older Twitter API responses).
        let snapshot = get_timestamp_ms(value)
            .or_else(|| {
                if created_at_fallback {
                    get_created_at(value)
                } else {
                    None
                }
            })
            .ok_or_else(|| Error::MissingTimestamp(value.clone()))?;

        let mut partial_user_map = HashMap::new();

        let user = get_user(value, snapshot)?;
        add_partial_users(value, &mut partial_user_map);

        let mut seen = HashSet::new();
        seen.insert(user.id);

        let mut users = vec![user];

        if let Some(status_value) = value.get("retweeted_status") {
            let user = get_user(status_value, snapshot)?;
            add_partial_users(status_value, &mut partial_user_map);

            if !seen.contains(&user.id) {
                seen.insert(user.id);
                users.push(user);
            }
        }

        if let Some(status_value) = value.get("quoted_status") {
            let user = get_user(status_value, snapshot)?;
            add_partial_users(status_value, &mut partial_user_map);

            if !seen.contains(&user.id) {
                seen.insert(user.id);
                users.push(user);
            }
        }

        // Drop partial users we already have full objects for (`seen` holds
        // signed IDs, so the unsigned keys are cast for the lookup).
        let partial_users = partial_user_map
            .into_iter()
            .filter_map(|(id, partial_user)| {
                if seen.contains(&(id as i64)) {
                    None
                } else {
                    Some(partial_user)
                }
            })
            .collect();

        Ok(Some(UserInfo {
            snapshot,
            users,
            partial_users,
        }))
    } else {
        Ok(None)
    }
}

/// Collect mention and reply-target users from a status into `acc`, keyed by
/// user ID. Mentions take precedence over reply targets, since a mention may
/// carry a display name while a reply target never does.
fn add_partial_users(status_value: &Value, acc: &mut HashMap<u64, PartialUser>) {
    for partial_user in get_all_user_mentions(status_value) {
        acc.insert(partial_user.id, partial_user);
    }

    if let Some(partial_user) = get_in_reply_to(status_value) {
        acc.entry(partial_user.id).or_insert(partial_user);
    }
}

/// The user this status replies to, if any (no display name is available).
fn get_in_reply_to(status_value: &Value) -> Option<PartialUser> {
    let id_str_value = status_value.get("in_reply_to_user_id_str")?;
    let id_str_string = id_str_value.as_str()?;
    let id_str_u64 = id_str_string.parse::<u64>().ok()?;
    let screen_name_value = status_value.get("in_reply_to_screen_name")?;
    let screen_name_string = screen_name_value.as_str()?;
    Some(PartialUser::new(
        id_str_u64,
        screen_name_string.to_string(),
        None,
    ))
}

/// Mentions from the status itself plus its `extended_tweet`, if present.
fn get_all_user_mentions(status_value: &Value) -> Vec<PartialUser> {
    let mut results = get_user_mentions(status_value);

    if let Some(extended_tweet) = status_value.get("extended_tweet") {
        results.extend(get_user_mentions(extended_tweet));
    }

    results
}

/// Parse `entities.user_mentions`; entries missing any field are skipped.
fn get_user_mentions(status_value: &Value) -> Vec<PartialUser> {
    status_value
        .get("entities")
        .and_then(|entities| {
            entities
                .get("user_mentions")
                .and_then(|user_mentions| user_mentions.as_array())
        })
        .map(|user_mentions| {
            user_mentions
                .iter()
                .filter_map(|user_mention| {
                    let id_str_value = user_mention.get("id_str")?;
                    let id_str_string = id_str_value.as_str()?;
                    let id_str_u64 = id_str_string.parse::<u64>().ok()?;
                    let screen_name_value = user_mention.get("screen_name")?;
                    let screen_name_string = screen_name_value.as_str()?;
                    let name_value = user_mention.get("name")?;
                    let name_string = name_value.as_str()?;

                    Some(PartialUser::new(
                        id_str_u64,
                        screen_name_string.to_string(),
                        Some(name_string.to_string()),
                    ))
                })
                .collect()
        })
        .unwrap_or_default()
}

/// Parse the `timestamp_ms` field (milliseconds since the epoch, as a string).
fn get_timestamp_ms(value: &Value) -> Option<DateTime<Utc>> {
    let timestamp_ms_value = value.get("timestamp_ms")?;
    let timestamp_ms_string = timestamp_ms_value.as_str()?;
    let timestamp_ms_i64 = timestamp_ms_string.parse::<i64>().ok()?;
    Utc.timestamp_millis_opt(timestamp_ms_i64).single()
}

/// Parse the `created_at` field using Twitter's legacy timestamp format.
fn get_created_at(value: &Value) -> Option<DateTime<Utc>> {
    let created_at_value = value.get("created_at")?;
    let created_at_string = created_at_value.as_str()?;
    hst_tw_utils::parse_date_time(created_at_string).ok()
}

/// Pull the `user` object out of a status, stamp it with the snapshot time
/// (under [`TIMESTAMP_FIELD_NAME`]), and deserialize it into a [`User`].
fn get_user(value: &Value, snapshot: DateTime<Utc>) -> Result<User, Error> {
    let mut user_value = value
        .get("user")
        .ok_or_else(|| Error::MissingUser(value.clone()))?
        .clone();

    if let Some(fields) = user_value.as_object_mut() {
        fields.insert(
            TIMESTAMP_FIELD_NAME.to_string(),
            serde_json::json!(snapshot.timestamp()),
        );
    }

    serde_json::from_value(user_value).map_err(Error::InvalidUser)
}
--------------------------------------------------------------------------------
/projects/hst-tw-utils/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "hst-tw-utils"
# NOTE(review): the author email (between angle brackets) appears to have been
# stripped in transit; restore it from upstream history.
authors = ["Travis Brown "]
homepage = "https://github.com/travisbrown/hassreden-tracker"
repository = "https://github.com/travisbrown/hassreden-tracker"
description = "Miscellaneous Twitter utilities"
keywords = ["twitter"]
license-file = "../../LICENSE"
readme = "../../README.md"
version = "0.1.0"
edition = "2021"

[dependencies]
chrono = "0.4"
--------------------------------------------------------------------------------
/projects/hst-tw-utils/src/lib.rs:
--------------------------------------------------------------------------------
use chrono::{DateTime, Utc};

/// The timestamp layout used by the Twitter API,
/// e.g. `Tue Mar 01 12:34:56 +0000 2022`.
const TWITTER_DATE_TIME_FMT: &str = "%a %b %d %H:%M:%S %z %Y";

/// Parse the time format used in Twitter API responses.
pub fn parse_date_time(input: &str) -> Result<DateTime<Utc>, chrono::ParseError> {
    Ok(DateTime::parse_from_str(input, TWITTER_DATE_TIME_FMT)?.into())
}
--------------------------------------------------------------------------------