├── .gitignore ├── LICENSE ├── README.md ├── discovertext_api ├── __init__.py └── discovertext_api.py ├── dt_credentials.json ├── extract_users_from_csvs.py ├── extract_users_from_dt.py ├── gather_bio_corpus_stats.py ├── ngram_classifier ├── __init__.py ├── ngram_classifier.py └── ngram_classifier_record.py ├── requirements.txt ├── resources ├── model-is_good_or_bad_nnet.dat ├── model-is_good_or_bad_nnet.dat.h5 ├── model-is_good_or_bad_user_desc_ngram_class.dat ├── test.csv ├── test_bot.csv ├── test_good.csv ├── test_users.csv ├── training.csv ├── training_bot.csv └── training_good.csv ├── run-csvs-score.bat ├── run-csvs-score.sh ├── run-score-dtarchive.bat ├── run-score-dtarchive.sh ├── run_nnet.py ├── split_training_data.py ├── test_ngram_classifier.py ├── test_nnet.py ├── train_ngram_classifier.py ├── train_nnet.py ├── train_test_ngram_classifier.py ├── twitter_auth.json └── utils ├── __init__.py ├── app_utils.py └── text_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode/** 2 | /env/** 3 | /pandas/** 4 | **/__pycache__/** 5 | /data/** -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Texifter, LLC. 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trust Defender - A Twitter User Classifier 2 | 3 | A collection of scripts used to train and run a classifier to classify Twitter users as potential good or bad actors. 4 | The classifier models are based on Twitter user information and focuses heavily on the user bio description to make determinations. 
5 |
6 | - [Prerequisites](#Prerequisites)
7 | - [Environment Setup](#Environment-Setup)
8 | - [Basic Workflow (CSV file)](#Basic-Workflow-CSV-file)
9 | - [Basic Workflow (DiscoverText API)](#Basic-Workflow-DiscoverText-API)
10 | - [Training and Re-training the model](#Training-and-Re-training-the-model)
11 | - [Individual Scripts](#Individual-Scripts)
12 | - [License](#License)
13 |
14 | ## Prerequisites
15 |
16 | - Python 3.6 or higher
17 | - If using Windows, you may need to use 32-bit Python, as there have been issues using the Pandas library on 64-bit
18 | - A Twitter Developer account with an application (https://developer.twitter.com/). You'll need the _Consumer API keys_ as well as a set of _Access token & access token secret_ pairs for the application.
19 | - If connecting to the DiscoverText API, you'll need your DiscoverText API key, API secret, hostname, username, and password
20 |
21 | ## Environment Setup
22 |
23 | - Create your virtual environment, e.g.: `python -m venv env`
24 | - If necessary, make the `.sh` scripts executable: `chmod +x run-csvs-score.sh` and `chmod +x run-score-dtarchive.sh`
25 | - Activate your environment (`source env/bin/activate`, or on Windows: `env\Scripts\activate`)
26 | - Install requirements: `pip install -r requirements.txt`
27 | - Install nltk resources:
28 |
29 | ```
30 | $ python
31 | >>> import nltk
32 | >>> nltk.download('stopwords')
33 | >>> nltk.download('punkt')
34 | >>> exit()
35 | ```
36 |
37 | ### Credential Setup:
38 |
39 | In the root directory, you'll find two files: `dt_credentials.json` and `twitter_auth.json`. Fill in the requested values for each. These files are used as input to various scripts.
40 |
41 | ## Basic Workflow (CSV file)
42 |
43 | The `run-csvs-score` helper script runs the following steps:
44 |
45 | 1. Runs the [extract_users_from_csvs](#script-extract_users_from_csvs-py) script to look up the listed usernames and gather their Twitter profile information
46 | 2. Runs the [run_nnet](#script-run_nnet-py) script across the gathered Twitter data to score each user
47 |
48 | ```
49 | Usage: run-csvs-score {data_directory} {base_filename_without_ext}
50 | ```
51 |
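For example, if your screen names are in `./data/my_usernames.csv` (the directory and file names here are placeholders), running `./run-csvs-score.sh ./data my_usernames` writes the gathered profile data to `./data/my_usernames.userdat.csv` and the scored results to `./data/my_usernames.scored.csv` (the `.bat` helper uses `_userdat.csv` / `_scored.csv` suffixes instead). If you need to build the input CSV by hand, a single `Value` column of screen names is enough; a minimal sketch with pandas (the screen names below are placeholders):

```python
import pandas as pd

# Hypothetical screen names; extract_users_from_csvs.py reads them from the "Value" column.
screen_names = ["some_user", "another_user"]
pd.DataFrame({"Value": screen_names}).to_csv("data/my_usernames.csv", index=False)
```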
52 | ## Basic Workflow (DiscoverText API)
53 |
54 | If you have an Enterprise-level DiscoverText account with API access, you can pull username metadata directly
55 | from a DiscoverText archive or bucket.
56 |
57 | The `run-score-dtarchive` helper script performs the following steps:
58 |
59 | 1. Runs the [extract_users_from_dt](#script-extract_users_from_dt-py) script to export a list of usernames from an archive or bucket
60 | 2. Runs the [extract_users_from_csvs](#script-extract_users_from_csvs-py) script to look up the exported usernames and gather their Twitter profile information
61 | 3. Runs the [run_nnet](#script-run_nnet-py) script across the gathered Twitter data to score each user
62 |
63 | ```
64 | Usage: run-score-dtarchive {data_directory} {base_filename_without_ext} {archive_id}
65 | ```
66 |
67 | ## Training and Re-training the model
68 |
69 | _(note: well-tuned, pre-trained classifiers are already in place - re-train only if necessary)_
70 | The pre-trained models can be found at:
71 |
72 | - `/resources/model-is_good_or_bad_nnet.dat`: the Keras model architecture (JSON) for the neural network
73 | - `/resources/model-is_good_or_bad_nnet.dat.h5`: the accompanying Keras weights for the neural network
74 | - `/resources/model-is_good_or_bad_user_desc_ngram_class.dat`: the pre-trained n-gram classifier model
75 |
76 | There are two primary classifier types:
77 |
78 | - an _n-gram_ naive Bayes classifier, which combines 5-, 6-, 7-, 8-, and 9-gram models and returns a probability score for each of two classes, `bot` or `good` (a short usage sketch follows this list)
79 | - a neural network that takes as input:
80 |   - `p(bot)` (from the ngram classifier)
81 |   - `p(good)` (from the ngram classifier)
82 |   - number of days the Twitter account has been active
83 |   - average number of statuses per day
84 |   - average number of followers per day
85 |   - average number of followed accounts ("following") per day
86 |   - the description length in number of individual terms
87 |   - the number of "lists" in the description (e.g. `god, country, president` would be considered a "list")
88 |   - the number of hashtags used in the description
89 |   - the number of URLs found in the description
90 |
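For reference, the pre-trained n-gram classifier can also be loaded and queried on its own. This is a minimal sketch, assuming it is run from the repository root (so the `ngram_classifier` and `utils` packages are importable); the example bio and the scores in the comment are purely illustrative:

```python
from ngram_classifier import NGramClassifier

# Load the pre-trained n-gram model shipped in resources/.
classifier = NGramClassifier(
    model_path="resources/model-is_good_or_bad_user_desc_ngram_class.dat")

# classify_text returns one probability per class, e.g. {"bot": 0.7, "good": 0.3}.
print(classifier.classify_text("god, country, president"))
```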
91 | Use the script [train_ngram_classifier.py](#script-train_ngram_classifier-py) to train the n-gram classifier.
92 |
93 | ```
94 | Usage: python train_ngram_classifier.py -i {input_csv_file} -o {output_model_file}
95 | ```
96 |
97 | The input data in `train_ngram_classifier.py` can be specified by modifying:
98 |
99 | - **CLASSES**: array of classes to be used
100 | - **TEXT_COLUMN**: the heading label of the column that contains the text to train on
101 | - **CLASS_COLUMN**: the heading label of the column that contains the correct class for the record
102 |
103 | The script [train_nnet.py](#script-train_nnet-py) is used to train the neural network.
104 |
105 | ```
106 | Usage: python train_nnet.py -i {input_csv_file}
107 |        -m {ngram_classifier_model_file}
108 |        -n {number_training_rounds}
109 |        -t {testing_file_csv_path}
110 |        -o {output_model_file}
111 | ```
112 |
113 | ## Individual Scripts:
114 |
115 | - [extract_users_from_csvs.py](#script-extract_users_from_csvs-py): Get Twitter user information from a CSV username list
116 | - [extract_users_from_dt.py](#script-extract_users_from_dt-py): Extract Twitter usernames from a DiscoverText archive or bucket
117 | - [gather_bio_corpus_stats.py](#script-gather_bio_corpus_stats-py): Output various statistics for bios in a corpus
118 | - [run_nnet.py](#script-run_nnet-py): Runs the neural network model across Twitter user information
119 | - [split_training_data.py](#script-split_training_data-py): Splits a training CSV file into training and test sets
120 | - [train_ngram_classifier.py](#script-train_ngram_classifier-py): Trains the n-gram classifier
121 | - [train_nnet.py](#script-train_nnet-py): Trains the neural network
122 |
123 | ### script: extract_users_from_csvs.py
124 |
125 | Reads in a file or a directory of CSV files (primarily DiscoverText metadata item exports) and looks up Twitter information for the usernames they contain.
126 |
127 | ```
128 | Usage: extract_users_from_csvs.py -i (input_file) -o (output_file) -c (credentials_file)
129 | ```
130 |
131 | - input file is a CSV of usernames (in the `Value` column by default)
132 | - output file is a CSV of information for each of the Twitter users' bios
133 | - credentials file is your `twitter_auth.json` credentials
134 |
135 | ### script: extract_users_from_dt.py
136 |
137 | Interactive or CLI script to extract and save usernames from a DiscoverText archive or bucket (the output can then be fed into extract_users_from_csvs.py)
138 |
139 | ```
140 | Usage: extract_users_from_dt.py -i {DiscoverText_credentials}
141 |        [-a {archive_id} -o {output_file}]
142 |        [-b {bucket_id} -o {output_file}]
143 | ```
144 |
145 | - input file is your `dt_credentials.json` file
146 | - optionally, specifying the archive id and output file will extract and run from an archive
147 | - or, optionally, specifying the bucket id and output file will extract and run from a bucket
148 | - or, when run interactively, the script will prompt for the project, archive, or bucket and ask where to save the output file
149 |
150 | ### script: gather_bio_corpus_stats.py
151 |
152 | Gathers basic statistics about the user bio data in a corpus (term, URL, and hashtag frequencies)
153 |
154 | ```
155 | Usage: gather_bio_corpus_stats.py -i (input_file)
156 | ```
157 |
158 | - input file is the CSV output from extract_users_from_csvs.py
159 |
160 | The output is printed to the console: the top 100 terms, top 100 URLs, and top 100 hashtags.
161 |
162 | ### script: run_nnet.py
163 |
164 | Runs the n-gram classifier and neural network, scoring Twitter user data as bot or good.
165 |
166 | ```
167 | Usage: run_nnet.py -i (input_file) -o (output_file) -n (neural_net_model) -m (ngram_model)
168 | ```
169 |
170 | - input file is the CSV output from extract_users_from_csvs.py
171 | - output file will be a CSV, the same as the input but augmented with classification scores
172 | - the neural net model is the path to the _is_good_or_bad_nnet.dat_ file
173 | - the ngram model is the path to the _is_good_or_bad_user_desc_ngram_class.dat_ file
174 |
175 | ### script: split_training_data.py
176 |
177 | Splits training data into training and test data. Paths are configured within the script.
178 |
179 | ### script: train_ngram_classifier.py
180 |
181 | Trains the ngram classifier from training data.
182 |
183 | ```
184 | Usage: train_ngram_classifier.py -i (input_file) -o (output_file)
185 | ```
186 |
187 | - input file is the training data CSV with a user_profile_description and a class_value (bot or good)
188 | - output file is the .dat file with the trained ngram model
189 |
190 | ### script: train_nnet.py
191 |
192 | Trains and tests the neural network.
193 |
194 | ```
195 | Usage: python train_nnet.py -i {input_csv_file}
196 |        -m {ngram_classifier_model_file}
197 |        -n {number_training_rounds}
198 |        -t {testing_file_csv_path}
199 |        -o {output_model_file}
200 | ```
201 |
202 | - input file is the training data CSV with a user_profile_description and a class_value (bot or good)
203 | - ngram_model is the .dat file from the trained ngram classifier
204 | - number of rounds is the number of training rounds to run
205 | - test file is the test CSV used to measure accuracy
206 | - output file is the .dat file for the trained neural network model
207 |
208 | ## License
209 |
210 | This software is licensed under the MIT license (see the [LICENSE](./LICENSE) file).
211 | 212 | By using this code, you assume all responsibility for any damages, additional charges, and all issues. 213 | -------------------------------------------------------------------------------- /discovertext_api/__init__.py: -------------------------------------------------------------------------------- 1 | from .discovertext_api import DiscoverTextApi 2 | -------------------------------------------------------------------------------- /discovertext_api/discovertext_api.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hashlib 3 | import hmac 4 | import json 5 | import requests 6 | import time 7 | import urllib.parse 8 | 9 | 10 | class DiscoverTextApi(): 11 | API_VERSION = "v1" 12 | BASE_URL = "https://api.discovertext.com" 13 | 14 | def __init__(self, credential_file=None, 15 | api_key=None, api_secret=None, hostname=None, 16 | username=None, password=None, api_base_url=None): 17 | 18 | if (credential_file): 19 | with open(credential_file) as credential_file_handle: 20 | credentials = json.load(credential_file_handle) 21 | self._api_key = credentials["api_key"] 22 | self._api_secret = credentials["api_secret"] 23 | self._hostname = credentials["hostname"] 24 | if "username" in credentials: 25 | self._username = credentials["username"] 26 | if "password" in credentials: 27 | self._password = credentials["password"] 28 | else: 29 | self._api_key = api_key 30 | self._api_secret = api_secret 31 | self._hostname = hostname 32 | self._username = username 33 | self._password = password 34 | 35 | self._jwt_token = None 36 | self._jwt_token_renewal = 0 37 | self._jwt_token_exp = 0 38 | 39 | if not self._api_key: 40 | raise "missing api key" 41 | if not self._api_secret: 42 | raise "missing api secret key" 43 | if not self._hostname: 44 | raise "missing hostname" 45 | 46 | base_url = DiscoverTextApi.BASE_URL if not api_base_url else api_base_url 47 | self._api_base_url = f'{base_url}/api/{DiscoverTextApi.API_VERSION}' 48 | 49 | def _check_jwt(self): 50 | if not self._jwt_token or self._jwt_token_renewal == 0: 51 | raise "no token issued" 52 | if self._jwt_token_exp < int(time.time()): 53 | raise "expired token" 54 | 55 | def _check_renew(self): 56 | self._check_jwt() 57 | if self._jwt_token_renewal < int(time.time()): 58 | self.renew_token() 59 | 60 | def _get_request_response(self, response): 61 | if not response: 62 | raise "no response" 63 | response.raise_for_status() 64 | return response.text 65 | 66 | def _send_get(self, request_url, query_params=None, check_credentials=True): 67 | if check_credentials: 68 | self._check_renew() 69 | request_headers = { 70 | "Content-Type": "application/json", 71 | "Authorization": f'Bearer {self._jwt_token}' 72 | } 73 | response = requests.get(url=request_url, 74 | params=query_params, 75 | headers=request_headers 76 | ) 77 | return self._get_request_response(response) 78 | 79 | def _send_post(self, request_url, post_data=None, query_params=None, check_credentials=True): 80 | if check_credentials: 81 | self._check_renew() 82 | request_headers = { 83 | "Content-Type": "application/json", 84 | "Authorization": f'Bearer {self._jwt_token}' 85 | } 86 | response = requests.post(url=request_url, 87 | json=json.dumps(post_data), 88 | params=query_params, 89 | headers=request_headers 90 | ) 91 | return self._get_request_response(response) 92 | 93 | def _set_token_and_renewal(self, token): 94 | self._jwt_token = token 95 | # set token renewal to now + 6 minutes... 
gives us a 4 minute window 96 | self._jwt_token_exp = int(time.time()) + 600 97 | self._jwt_token_renewal = int(time.time()) + 360 98 | 99 | def login(self, username=None, password=None): 100 | ''' 101 | login the user and get the initial JWT 102 | 103 | https://api.discovertext.com/Docs/GettingStarted/Authentication 104 | ''' 105 | login_username = self._username if username is None else username 106 | login_password = self._password if password is None else password 107 | if not login_username: 108 | raise "Username not set" 109 | if not login_password: 110 | raise "Password not set" 111 | nonce = int(time.time()) 112 | sig_string = f'{self._api_key}:{self._hostname}:{login_username}:{login_password}:{nonce}' 113 | message = bytes(sig_string, 'utf-8') 114 | secret = bytes(self._api_secret, 'utf-8') 115 | signature = base64.b64encode( 116 | hmac.new(secret, message, digestmod=hashlib.sha256).digest()).decode('utf-8') 117 | 118 | request_url = f'{self._api_base_url}/login' 119 | request_data = { 120 | "apiKey": self._api_key, 121 | "hostname": self._hostname, 122 | "username": login_username, 123 | "password": login_password, 124 | "nonce": nonce, 125 | "signature": signature 126 | } 127 | 128 | response = requests.post(url=request_url, json=request_data) 129 | response.raise_for_status() 130 | self._set_token_and_renewal(response.text) 131 | 132 | def get_oauth_authorize_url(self, redirect_url): 133 | request_url = f'{self._api_base_url}/login/oauth' 134 | formatted_redirect_uri = urllib.parse.quote(redirect_url, safe='') 135 | return f'{request_url}?response_type=code&client_id={self._api_key}&redirect_uri={formatted_redirect_uri}&scope=read&hostname={self._hostname}' 136 | 137 | def get_oauth_access_token_url(self, auth_code, redirect_url): 138 | request_url = f'{self._api_base_url}/login/token' 139 | response = requests.get(url=request_url, 140 | params={ 141 | "client_id": self._api_key, 142 | "client_secret": self._api_secret, 143 | "grant_type": "authorization_code", 144 | "code": auth_code, 145 | "redirect_uri": redirect_url 146 | }) 147 | response.raise_for_status() 148 | response_item = response.json 149 | self._set_token_and_renewal(response.json["token"]) 150 | 151 | def renew_token(self): 152 | request_url = f'{self._api_base_url}/login/renew' 153 | request_headers = { 154 | "Content-Type": "application/json", 155 | "Authorization": f'Bearer {self._jwt_token}' 156 | } 157 | response = requests.get(url=request_url, 158 | headers=request_headers 159 | ) 160 | response.raise_for_status() 161 | self._set_token_and_renewal(response.text) 162 | 163 | def get_unit_types(self): 164 | request_url = f'{self._api_base_url}/system/unitTypes' 165 | return json.loads(self._send_get(request_url)) 166 | 167 | def get_projects(self, offset=0, limit=20): 168 | request_url = f'{self._api_base_url}/projects' 169 | return json.loads(self._send_get(request_url, { 170 | "offset": offset, 171 | "limit": limit 172 | })) 173 | 174 | def get_project_archives(self, project_id, offset=0, limit=20): 175 | request_url = f'{self._api_base_url}/projects/{project_id}/archives' 176 | return json.loads(self._send_get(request_url, { 177 | "offset": offset, 178 | "limit": limit 179 | })) 180 | 181 | def get_archive(self, archive_id): 182 | request_url = f'{self._api_base_url}/archives/{archive_id}' 183 | return json.loads(self._send_get(request_url, {})) 184 | 185 | def get_archive_units(self, archive_id, offset=0, limit=20, include_metadata=True): 186 | request_url = 
f'{self._api_base_url}/archives/{archive_id}/units' 187 | params = { 188 | "offset": offset, 189 | "limit": limit, 190 | "includeMetadata": "true" if include_metadata else "false" 191 | } 192 | return json.loads(self._send_get(request_url, params)) 193 | 194 | def get_project_buckets(self, project_id, offset=0, limit=20): 195 | request_url = f'{self._api_base_url}/projects/{project_id}/buckets' 196 | return json.loads(self._send_get(request_url, { 197 | "offset": offset, 198 | "limit": limit 199 | })) 200 | 201 | def get_bucket(self, bucket_id): 202 | request_url = f'{self._api_base_url}/buckets/{bucket_id}' 203 | return json.loads(self._send_get(request_url, {})) 204 | 205 | def get_bucket_units(self, bucket_id, offset=0, limit=20, include_metadata=True): 206 | request_url = f'{self._api_base_url}/buckets/{bucket_id}/units' 207 | params = { 208 | "offset": offset, 209 | "limit": limit, 210 | "includeMetadata": "true" if include_metadata else "false" 211 | } 212 | return json.loads(self._send_get(request_url, params)) 213 | 214 | def get_codeset_listing(self, offset=0, limit=20): 215 | request_url = f'{self._api_base_url}/codesets' 216 | params = { 217 | "offset": offset, 218 | "limit": limit 219 | } 220 | return json.loads(self._send_get(request_url, params)) 221 | 222 | def get_codeset_item(self, codeset_id): 223 | request_url = f'{self._api_base_url}/codesets/{codeset_id}' 224 | return json.loads(self._send_get(request_url, {})) 225 | 226 | def get_codeset_data(self, codeset_id, offset=0, limit=20): 227 | request_url = f'{self._api_base_url}/codesets/{codeset_id}/data' 228 | params = { 229 | "offset": offset, 230 | "limit": limit 231 | } 232 | return json.loads(self._send_get(request_url, params)) 233 | -------------------------------------------------------------------------------- /dt_credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_key": "", 3 | "api_secret": "", 4 | "hostname": "", 5 | "username": "", 6 | "password": "" 7 | } 8 | -------------------------------------------------------------------------------- /extract_users_from_csvs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime, timezone 3 | import glob 4 | import pandas as pd 5 | import os 6 | import timestring 7 | from utils import batch_list 8 | from nltk import word_tokenize 9 | from utils import get_hashtag_count, get_list_item_count, get_url_count, get_twitter_auth 10 | import tweepy 11 | 12 | SCREEN_NAME_COLUMN = 'Value' 13 | 14 | twitter_api = None 15 | 16 | 17 | def get_user_add_props(input_bio): 18 | bio_text = input_bio 19 | if not bio_text: 20 | bio_text = "" 21 | bio_text = bio_text.strip() 22 | tokens = word_tokenize(bio_text) 23 | tokens_terms_only = [word.lower() for word in tokens if word.isalpha()] 24 | 25 | desc_len_terms = len(tokens_terms_only) 26 | desc_len_chars = len(bio_text) 27 | num_list_items = get_list_item_count(bio_text) 28 | num_hashtags = get_hashtag_count(tokens) 29 | url_count = get_url_count(bio_text) 30 | 31 | return desc_len_terms, desc_len_chars, num_list_items, num_hashtags, url_count 32 | 33 | 34 | def get_val(user_object, attribute_name): 35 | return getattr(user_object, attribute_name, None) 36 | 37 | 38 | def get_twitter_api_batch(this_batch): 39 | retry_count = 0 40 | while True: 41 | try: 42 | user_data = twitter_api.lookup_users(screen_name=this_batch) 43 | return user_data 44 | except Exception as e: 45 | retry_count += 1 46 | if 
(retry_count <= 3):
47 |                 print(f'exception: {e}')
48 |                 print(
49 |                     f'!! exception getting Twitter API data... retry {retry_count} of 3...')
50 |             else:
51 |                 print('... too many retries... skipping...')
52 |                 return None
53 | 
54 | 
55 | def get_batched_user_data(this_batch, todays_date):
56 |     ret = []
57 |     user_data = get_twitter_api_batch(this_batch)
58 |     if not user_data:
59 |         return []
60 |     for this_user in user_data:
61 |         user_create_date_str = get_val(this_user, "created_at")
62 |         if not user_create_date_str:
63 |             continue
64 | 
65 |         user_create_date = timestring.Date(user_create_date_str)
66 | 
67 |         num_statuses = int(get_val(this_user, "statuses_count"))
68 |         follower_count = int(get_val(this_user, "followers_count"))
69 |         following_count = int(get_val(this_user, "friends_count"))
70 |         location = get_val(this_user, "location")
71 | 
72 |         num_days = (todays_date - user_create_date.date).days
73 |         if num_days == 0:
74 |             num_days = 1
75 | 
76 |         desc_len_terms, desc_len_chars, num_list_items, num_hashtags, url_count = get_user_add_props(
77 |             get_val(this_user, "description"))
78 | 
79 |         ret_user = {
80 |             "userid": get_val(this_user, "id"),
81 |             "user_display_name": get_val(this_user, "name"),
82 |             "user_screen_name": get_val(this_user, "screen_name"),
83 |             "user_reported_location": location,
84 |             "user_profile_description": get_val(this_user, "description"),
85 |             "status_count": num_statuses,
86 |             "follower_count": follower_count,
87 |             "following_count": following_count,
88 |             "num_days": num_days,
89 |             "account_creation_date": user_create_date,
90 |             "account_language": get_val(this_user, "lang"),  # NOTE: was get_val(this_user, "id"); "lang" appears to be the intended attribute
91 |             "statuses_per_day": num_statuses / num_days,
92 |             "followers_per_day": follower_count / num_days,
93 |             "following_per_day": following_count / num_days,
94 |             "desc_len_terms": desc_len_terms,
95 |             "desc_len_chars": desc_len_chars,
96 |             "num_list_items": num_list_items,
97 |             "num_hashtags": num_hashtags,
98 |             "url_count": url_count,
99 |             "has_location": 1 if location else 0,
100 |             "verified": get_val(this_user, "verified")
101 |         }
102 |         ret.append(ret_user)
103 |     return ret
104 | 
105 | 
106 | def get_user_data(screen_names):
107 |     ret = []
108 |     todays_date = datetime.now(timezone.utc)
109 |     for this_batch in batch_list(screen_names, 100):
110 |         print(
111 |             f'...getting batch of {len(this_batch)} from "{this_batch[0]}" to "{this_batch[-1]}"...')
112 |         twitter_batch_data = get_batched_user_data(this_batch, todays_date)
113 |         print(f'... got user data for {len(twitter_batch_data)} users...')
114 |         if twitter_batch_data:
115 |             ret.extend(twitter_batch_data)
116 |             print(f'... 
gotten so far: {len(ret)}') 117 | return ret 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument("-i", "--input", help="input directory or file") 123 | parser.add_argument("-o", "--output", help="output file") 124 | parser.add_argument("-c", "--credentials", help="twitter credentials file") 125 | args = parser.parse_args() 126 | 127 | if not args.input: 128 | raise "missing input directory or file" 129 | if not args.output: 130 | raise "missing output file" 131 | if not args.credentials: 132 | raise "missing credentials file" 133 | 134 | twitter_app_auth = get_twitter_auth(args.credentials) 135 | twitter_auth = tweepy.OAuthHandler( 136 | twitter_app_auth["consumer_key"], twitter_app_auth["consumer_secret"]) 137 | twitter_auth.set_access_token( 138 | twitter_app_auth["access_token"], twitter_app_auth["access_secret"]) 139 | twitter_api = tweepy.API(twitter_auth, wait_on_rate_limit=True) 140 | 141 | if os.path.isdir(args.input): 142 | all_files = glob.glob(os.path.join(args.input, "*.csv")) 143 | elif os.path.isfile(args.input): 144 | all_files = [args.input] 145 | else: 146 | raise "Unknown input" 147 | 148 | all_file_len = len(all_files) 149 | print(f'... found {all_file_len} items') 150 | screen_names = {} 151 | f_counter = 0 152 | for this_file in all_files: 153 | print(f'... reading {this_file}') 154 | df = pd.read_csv(this_file, keep_default_na=False) 155 | for index, row in df.iterrows(): 156 | this_screen_name = row[SCREEN_NAME_COLUMN] 157 | if this_screen_name not in screen_names: 158 | screen_names[this_screen_name] = True 159 | f_counter += 1 160 | print(f'... processed {f_counter} of {all_file_len}') 161 | 162 | print(f'Found a total of {len(screen_names)} users') 163 | 164 | all_users = get_user_data(screen_names.keys()) 165 | print(f'total output of {len(all_users)} users...') 166 | 167 | df_out = pd.DataFrame(all_users) 168 | df_out.to_csv(args.output) 169 | -------------------------------------------------------------------------------- /extract_users_from_dt.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | from discovertext_api import DiscoverTextApi 4 | import pandas as pd 5 | 6 | dt_api = None 7 | 8 | 9 | def print_item_list(item_list): 10 | print("------------------------------------------------------------------") 11 | print("ID Timestamp Name") 12 | for item in item_list: 13 | print(f'{item["id"]:6} {item["timestamp"]:24} {item["name"]}') 14 | print("----") 15 | 16 | 17 | def extract_and_save(project_id, entity_type, entity_id, entity_name, output_file=None): 18 | screen_names = collections.Counter() 19 | current_offset = 0 20 | while True: 21 | if entity_type == "archive": 22 | current_units = dt_api.get_archive_units( 23 | entity_id, offset=current_offset, limit=100, include_metadata=True) 24 | elif entity_type == "bucket": 25 | current_units = dt_api.get_bucket_units( 26 | entity_id, offset=current_offset, limit=100, include_metadata=True) 27 | if not current_units: 28 | break 29 | max_limit = current_units["meta"]["count"] 30 | print(f"offset {current_offset} out of {max_limit}") 31 | 32 | if not "items" in current_units: 33 | break 34 | for this_item in current_units["items"]: 35 | if not "metadata" in this_item: 36 | continue 37 | screen_name_list = list( 38 | filter(lambda x: (x["key"].startswith("screen_name") or 39 | x["key"].startswith("from_user") or 40 | x["key"].startswith("entities_mentions_username")), 41 | 
this_item["metadata"])) 42 | for name in screen_name_list: 43 | screen_names.update({name["value"]: 1}) 44 | 45 | current_offset += 100 46 | if current_offset >= max_limit: 47 | break 48 | 49 | print(f'Completed {entity_type}: "{entity_name}"') 50 | if output_file: 51 | output_filename = output_file 52 | else: 53 | output_filename = input( 54 | f"Gathered {len(screen_names)} total names. Output file: ") 55 | print(f'Writing output to: {output_filename}') 56 | df = pd.DataFrame.from_dict(screen_names, orient='index').reset_index() 57 | df = df.rename(columns={'index': 'Value', 0: 'Total'}) 58 | df.to_csv(output_filename, encoding='utf8') 59 | 60 | 61 | def extract_from_archive(project_id, bucket_id, bucket_name): 62 | print(f"Extracting from archive: {bucket_name}") 63 | extract_and_save(project_id, "archive", bucket_id, bucket_name) 64 | 65 | 66 | def extract_from_bucket(project_id, bucket_id, bucket_name): 67 | print(f"Extracting from bucket: {bucket_name}") 68 | extract_and_save(project_id, "bucket", bucket_id, bucket_name) 69 | 70 | 71 | def get_project_buckets(project_id, project_name): 72 | buckets = dt_api.get_project_buckets(project_id, limit=1000) 73 | bucket_list = sorted(map(lambda x: { 74 | "name": x["name"], 75 | "id": x["id"], 76 | "timestamp": x["timestamp"] 77 | }, buckets["items"]), 78 | key=lambda x: x["timestamp"], 79 | reverse=True) 80 | print_item_list(bucket_list) 81 | 82 | while True: 83 | bucket_id_str = input("BucketId to extract from (0 for exit): ") 84 | if not bucket_id_str: 85 | print_item_list(bucket_list) 86 | continue 87 | bucket_id = int(bucket_id_str) 88 | if bucket_id == 0: 89 | return 90 | selected_bucket = list( 91 | filter(lambda x: x["id"] == bucket_id, bucket_list)) 92 | if (len(selected_bucket) == 0): 93 | print("unknown project id") 94 | else: 95 | extract_from_bucket( 96 | project_id, selected_bucket[0]["id"], selected_bucket[0]["name"]) 97 | return 98 | 99 | 100 | def get_project_archives(project_id, project_name): 101 | archives = dt_api.get_project_archives(project_id, limit=1000) 102 | archive_list = sorted(map(lambda x: { 103 | "name": x["name"], 104 | "id": x["id"], 105 | "timestamp": x["timestamp"] 106 | }, archives["items"]), 107 | key=lambda x: x["timestamp"], 108 | reverse=True) 109 | print_item_list(archive_list) 110 | 111 | while True: 112 | archive_id_str = input("ArchiveId to extract from (0 for exit): ") 113 | if not archive_id_str: 114 | print_item_list(archive_list) 115 | continue 116 | archive_id = int(archive_id_str) 117 | if archive_id == 0: 118 | return 119 | selecterd_archive_list = list( 120 | filter(lambda x: x["id"] == archive_id, archive_list)) 121 | if (len(selecterd_archive_list) == 0): 122 | print("unknown archive id") 123 | else: 124 | extract_from_archive( 125 | project_id, selecterd_archive_list[0]["id"], selecterd_archive_list[0]["name"]) 126 | return 127 | 128 | 129 | def do_selected_project(project_id, project_name): 130 | while True: 131 | print("--------") 132 | print(f"Project: {project_name}") 133 | item_selection = input("archive or bucket (or exit)? 
") 134 | if item_selection == "exit": 135 | return 136 | elif item_selection == "archive": 137 | get_project_archives(project_id, project_name) 138 | elif item_selection == "bucket": 139 | get_project_buckets(project_id, project_name) 140 | else: 141 | print("unknown function") 142 | 143 | 144 | def do_project_select(project_list): 145 | print_item_list(project_list) 146 | while True: 147 | project_id_str = input("ProjectId (0 for exit): ") 148 | if not project_id_str: 149 | print_item_list(project_list) 150 | continue 151 | project_id = int(project_id_str) 152 | if project_id == 0: 153 | return 154 | selected_project = list( 155 | filter(lambda x: x["id"] == project_id, project_list)) 156 | if (len(selected_project) == 0): 157 | print("unknown project id") 158 | else: 159 | do_selected_project( 160 | selected_project[0]["id"], selected_project[0]["name"]) 161 | 162 | 163 | def extract_from_cli_archive(archive_id, output_file): 164 | archive = dt_api.get_archive(archive_id) 165 | if not archive: 166 | print("unknown archive id") 167 | return 168 | 169 | print(f'Extracting from archive {archive_id}: {archive["name"]}') 170 | 171 | print(f'writing file to: {output_file}') 172 | 173 | extract_and_save(0, "archive", archive_id, 174 | archive['name'], output_file=output_file) 175 | 176 | 177 | def extract_from_cli_bucket(bucket_id, output_file): 178 | bucket = dt_api.get_bucket(bucket_id) 179 | if not bucket: 180 | print("unknown bucket id") 181 | return 182 | 183 | print(f'Extracting from bucket {bucket_id}: {bucket["name"]}') 184 | 185 | print(f'writing file to: {output_file}') 186 | 187 | extract_and_save(0, "bucket", bucket_id, 188 | bucket['name'], output_file=output_file) 189 | 190 | 191 | if __name__ == "__main__": 192 | parser = argparse.ArgumentParser() 193 | parser.add_argument("-i", "--input", help="input credential file") 194 | parser.add_argument("-a", "--archive", help="archive to extract from") 195 | parser.add_argument("-b", "--bucket", help="bucket to extract from") 196 | parser.add_argument("-o", "--output", help="output file") 197 | args = parser.parse_args() 198 | 199 | if not args.input: 200 | raise "missing input credential file" 201 | 202 | dt_api = DiscoverTextApi(credential_file=args.input) 203 | 204 | dt_api.login() 205 | 206 | if args.archive and args.bucket: 207 | raise "cannot use --archive and --bucket flag at the same time" 208 | 209 | if args.archive: 210 | extract_from_cli_archive(args.archive, args.output) 211 | elif args.bucket: 212 | extract_from_cli_bucket(args.bucket, args.output) 213 | else: 214 | projects = dt_api.get_projects(limit=1000) 215 | project_list = sorted(map(lambda x: { 216 | "name": x["name"], 217 | "id": x["id"], 218 | "timestamp": x["timestamp"] 219 | }, projects["items"]), 220 | key=lambda x: x["timestamp"], 221 | reverse=True) 222 | 223 | do_project_select(project_list) 224 | -------------------------------------------------------------------------------- /gather_bio_corpus_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import Counter 3 | from nltk.corpus import stopwords 4 | from utils import clean_text, tokenize, get_urls, get_hashtags 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | import pandas as pd 7 | 8 | stop_words = stopwords.words('english') 9 | 10 | 11 | class StatsCorpus(): 12 | def __init__(self): 13 | self._corpus = [] 14 | self._total_terms = 0 15 | self._unique_terms = Counter() 16 | 17 | 18 | def is_number(s): 19 | 
try: 20 | float(s) 21 | return True 22 | except ValueError: 23 | return False 24 | 25 | 26 | def get_tokens_for_counting(tokens): 27 | ret = [] 28 | for this_token in tokens: 29 | if len(this_token) < 3: 30 | continue 31 | if is_number(this_token): 32 | continue 33 | ret.append(this_token) 34 | return ret 35 | 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("-i", "--input", help="input csv file") 40 | args = parser.parse_args() 41 | 42 | df = pd.read_csv(args.input, keep_default_na=False) 43 | term_counter = Counter() 44 | hashtag_counter = Counter() 45 | url_counter = Counter() 46 | for index, row in df.iterrows(): 47 | input_text = str(row["user_profile_description"]) 48 | if not input_text: 49 | continue 50 | 51 | urls = get_urls(input_text) 52 | hashtags = get_hashtags(input_text) 53 | cleaned = clean_text(input_text, remove_urls=True) 54 | tokens = tokenize(cleaned) 55 | if len(tokens) == 0: 56 | continue 57 | tokens = [x for x in tokens if x not in stop_words] 58 | tokens_for_count = get_tokens_for_counting(tokens) 59 | for token in tokens_for_count: 60 | term_counter[token] += 1 61 | 62 | for url in urls: 63 | url_counter[url.lower()] += 1 64 | for hashtag in hashtags: 65 | hashtag_counter[hashtag.lower()] += 1 66 | 67 | print('top 100 terms:') 68 | print(term_counter.most_common(100)) 69 | print(f'{len(term_counter)} unique, {sum(term_counter.values())} total') 70 | 71 | print('top 100 urls:') 72 | print(url_counter.most_common(100)) 73 | 74 | print("top 100 hashtags:") 75 | print(hashtag_counter.most_common(100)) 76 | print(f'{len(hashtag_counter)} unique, {sum(hashtag_counter.values())} total') 77 | -------------------------------------------------------------------------------- /ngram_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from .ngram_classifier import NGramClassifier 2 | from .ngram_classifier_record import NGramClassifierRecord 3 | -------------------------------------------------------------------------------- /ngram_classifier/ngram_classifier.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import math 3 | from .ngram_classifier_record import NGramClassifierRecord 4 | from collections import Counter 5 | from collections.abc import Sequence 6 | import utils as text_utils 7 | 8 | 9 | class NGramClassifier(): 10 | 11 | def __init__(self, classes=None, 12 | model_path=None, 13 | min_len=2, max_len=5 14 | ): 15 | 16 | self._classes = {} 17 | self._trained = False 18 | self._min = min_len 19 | self._max = max_len 20 | if model_path: 21 | self.load(model_path) 22 | return 23 | 24 | if not classes: 25 | raise "missing parameter classes" 26 | if not isinstance(classes, list): 27 | raise "invalid value for parameter classes" 28 | 29 | for this_class in classes: 30 | these_classes = {} 31 | for nval in range(self._min, self._max): 32 | these_classes[nval] = NGramClassifierRecord() 33 | self._classes[this_class] = these_classes 34 | 35 | def _get_ngrams(self, text, ngram_count): 36 | ngram_list = list(text_utils.get_ngrams(text, ngram_count)) 37 | return list("".join(n) for n in ngram_list) 38 | 39 | def _train_class(self, input_text, class_label): 40 | ''' 41 | trains a class with the input text 42 | 1) clean the input text 43 | 2) for each ngram value in _min to _max: 44 | 2a) get the ngrams for the cleaned text 45 | 2b) update the ngram list for that class / ngram count 46 | ''' 47 | if class_label not 
in self._classes: 48 | raise f'Unknown class label found: {class_label}' 49 | 50 | cleaned = text_utils.clean_text(input_text) 51 | for nval in range(self._min, self._max): 52 | if len(cleaned) < nval: 53 | continue 54 | ngram_list = self._get_ngrams(cleaned, nval) 55 | self._classes[class_label][nval].add(ngram_list) 56 | 57 | def _get_default_classify_return(self): 58 | ret = {} 59 | per_class = 1/len(self._classes.keys()) 60 | for this_class in self._classes.keys(): 61 | ret[this_class] = per_class 62 | return ret 63 | 64 | def _get_scale_factor(self, text_length): 65 | if text_length <= 10: 66 | return 1.0 67 | return 1.0 / math.pow(math.log(text_length), 2) 68 | 69 | def _calculate_classify_ratios(self, input_text): 70 | text_length = len(input_text) 71 | scale_factor = self._get_scale_factor(text_length) 72 | class_ratios = [] 73 | for nval in range(self._min, self._max): 74 | if len(input_text) < nval: 75 | continue 76 | 77 | class_counts = {} 78 | overall_count = 0 79 | for this_class in self._classes.keys(): 80 | this_count = self._classes[this_class][nval].total_count 81 | overall_count += this_count 82 | class_counts[this_class] = this_count 83 | 84 | p = {} 85 | for this_class in self._classes.keys(): 86 | p[this_class] = math.log((class_counts[this_class] + 1) * ( 87 | 1 - class_counts[this_class] / overall_count) / (class_counts[this_class] + overall_count)) 88 | 89 | ngram_list = self._get_ngrams(input_text, nval) 90 | 91 | for this_ngram in ngram_list: 92 | count_class = {} 93 | for this_class in self._classes.keys(): 94 | p_count = 0 if this_ngram not in self._classes[this_class][ 95 | nval].ngrams else self._classes[this_class][nval].ngrams[this_ngram] 96 | p_gram = (p_count + 1) / \ 97 | (class_counts[this_class] + overall_count) 98 | p[this_class] += math.log(p_gram) 99 | 100 | d = 0.0 101 | for this_class in self._classes.keys(): 102 | p[this_class] *= scale_factor 103 | d += math.exp(p[this_class]) 104 | 105 | if d != 0.0: 106 | vals = {} 107 | for this_class in self._classes.keys(): 108 | vals[this_class] = math.exp(p[this_class]) / d 109 | class_ratios.append(vals) 110 | 111 | return class_ratios 112 | 113 | # ---------------------------------------------------------------- 114 | # ---------------------------------------------------------------- 115 | # ---------------------------------------------------------------- 116 | 117 | def classify_text(self, input_text, is_cleaned=False): 118 | """Classify input text 119 | 120 | Returns a dictionary of { 121 | "class_1": p(c1), 122 | "class_2": p(c2), 123 | ... etc ... 
124 | } 125 | """ 126 | 127 | assert(self._trained), "model must be trained before classifying" 128 | 129 | if is_cleaned: 130 | cleaned_text = input_text 131 | else: 132 | cleaned_text = text_utils.clean_text(input_text) 133 | 134 | if len(cleaned_text) < 2: 135 | return self._get_default_classify_return() 136 | 137 | class_ratios = self._calculate_classify_ratios(cleaned_text) 138 | if len(class_ratios) == 0: 139 | return self._get_default_classify_return() 140 | 141 | avgs = {} 142 | total_avg = 0.0 143 | for this_ratio_set in class_ratios: 144 | for this_class in self._classes.keys(): 145 | if not this_class in avgs: 146 | avgs[this_class] = this_ratio_set[this_class] 147 | else: 148 | avgs[this_class] += this_ratio_set[this_class] 149 | total_avg += this_ratio_set[this_class] 150 | 151 | if total_avg == 0.0: 152 | return self._get_default_classify_return() 153 | 154 | ret = {} 155 | for this_class in self._classes.keys(): 156 | ret[this_class] = avgs[this_class] / total_avg 157 | return ret 158 | 159 | def classify_text_list(self, text_list): 160 | return ([self.classify_text(t) for t in text_list]) 161 | 162 | # ---------------------------------------------------------------- 163 | # ---------------------------------------------------------------- 164 | # ---------------------------------------------------------------- 165 | 166 | def train_text(self, text_items, class_designations): 167 | assert(len(text_items) == len(class_designations) 168 | ), "Input arrays must be equal length" 169 | 170 | for index in range(0, len(text_items)): 171 | this_text = text_items[index] 172 | this_class = class_designations[index] 173 | self._train_class(this_text, this_class) 174 | self._trained = True 175 | self.update_counts() 176 | 177 | # ---------------------------------------------------------------- 178 | # ---------------------------------------------------------------- 179 | # ---------------------------------------------------------------- 180 | 181 | def update_counts(self): 182 | """ Manual call to update the total counts for each class """ 183 | for this_class in self._classes.keys(): 184 | for nval in range(self._min, self._max): 185 | self._classes[this_class][nval].update_total() 186 | 187 | # ---------------------------------------------------------------- 188 | # ---------------------------------------------------------------- 189 | # ---------------------------------------------------------------- 190 | 191 | def serialize(self, output_path, max_to_save): 192 | """Saves the classifier data to a file 193 | writes out the min/max ngrams, the class list, and the class data/counts for each 194 | """ 195 | with open(output_path, 'wb') as output_file: 196 | pickle.dump(self._min, output_file) 197 | pickle.dump(self._max, output_file) 198 | key_list = list(self._classes.keys()) 199 | pickle.dump(key_list, output_file) 200 | for this_key in key_list: 201 | for nval in range(self._min, self._max): 202 | pickle.dump( 203 | dict(self._classes[this_key][nval].ngrams.most_common(max_to_save)), output_file) 204 | 205 | def load(self, input_path): 206 | """Loads classifier data from a file 207 | includes the min/max ngrams, the class list, and the class data/counts 208 | """ 209 | self._classes = {} 210 | self._trained = False 211 | with open(input_path, 'rb') as input_file: 212 | self._min = pickle.load(input_file) 213 | self._max = pickle.load(input_file) 214 | key_list = pickle.load(input_file) 215 | for this_class in key_list: 216 | these_classes = {} 217 | for nval in range(self._min, 
self._max): 218 | ngram_data = pickle.load(input_file) 219 | these_classes[nval] = NGramClassifierRecord() 220 | these_classes[nval].ngrams = Counter(ngram_data) 221 | these_classes[nval].update_total() 222 | self._classes[this_class] = these_classes 223 | self._trained = True 224 | -------------------------------------------------------------------------------- /ngram_classifier/ngram_classifier_record.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | 4 | class NGramClassifierRecord(): 5 | def __init__(self): 6 | self.total_count = 0 7 | self.ngrams = Counter() 8 | 9 | def get_counts(self): 10 | return self.ngrams.most_common() 11 | 12 | def get_total(self): 13 | self.update_total(self) 14 | return self.total_count 15 | 16 | def add(self, ngrams): 17 | self.ngrams.update(ngrams) 18 | 19 | def update_total(self): 20 | self.total_count = sum(self.ngrams.values()) 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.2.0 2 | astunparse==1.6.3 3 | autopep8==1.7.0 4 | cachetools==5.2.0 5 | certifi==2023.7.22 6 | charset-normalizer==2.1.1 7 | click==8.1.3 8 | colorama==0.4.5 9 | flatbuffers==2.0.7 10 | gast==0.4.0 11 | google-auth==2.11.1 12 | google-auth-oauthlib==0.4.6 13 | google-pasta==0.2.0 14 | grpcio==1.56.0 15 | h5py==3.7.0 16 | idna==3.4 17 | joblib==1.2.0 18 | keras==2.10.0 19 | Keras-Preprocessing==1.1.2 20 | libclang==14.0.6 21 | Markdown==3.4.1 22 | MarkupSafe==2.1.1 23 | nltk==3.7 24 | numpy==1.23.3 25 | oauthlib==3.2.1 26 | opt-einsum==3.3.0 27 | packaging==21.3 28 | pandas==1.5.0 29 | protobuf==3.19.5 30 | pyasn1==0.4.8 31 | pyasn1-modules==0.2.8 32 | pycodestyle==2.9.1 33 | pyparsing==3.0.9 34 | python-dateutil==2.8.2 35 | pytz==2022.2.1 36 | regex==2022.9.13 37 | requests==2.28.1 38 | requests-oauthlib==1.3.1 39 | rsa==4.9 40 | scikit-learn==1.1.2 41 | scipy==1.9.1 42 | six==1.16.0 43 | sklearn==0.0 44 | tensorboard==2.10.0 45 | tensorboard-data-server==0.6.1 46 | tensorboard-plugin-wit==1.8.1 47 | tensorflow==2.10.0 48 | tensorflow-estimator==2.10.0 49 | tensorflow-io-gcs-filesystem==0.27.0 50 | termcolor==2.0.1 51 | threadpoolctl==3.1.0 52 | timestring==1.6.4 53 | toml==0.10.2 54 | tqdm==4.64.1 55 | tweepy==4.10.1 56 | typing_extensions==4.3.0 57 | urllib3==1.26.12 58 | Werkzeug==2.2.2 59 | wrapt==1.14.1 60 | -------------------------------------------------------------------------------- /resources/model-is_good_or_bad_nnet.dat: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": {"name": "sequential_1", "layers": [{"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "batch_input_shape": [null, 10], "dtype": "float32", "units": 22, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", 
"distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.2.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /resources/model-is_good_or_bad_nnet.dat.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/texifter/trust-defender/08b19da602daafb0ac908ed0838243d120bfc94c/resources/model-is_good_or_bad_nnet.dat.h5 -------------------------------------------------------------------------------- /resources/model-is_good_or_bad_user_desc_ngram_class.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/texifter/trust-defender/08b19da602daafb0ac908ed0838243d120bfc94c/resources/model-is_good_or_bad_user_desc_ngram_class.dat -------------------------------------------------------------------------------- /resources/test_users.csv: -------------------------------------------------------------------------------- 1 | Name,Value,Total 2 | "from_user: ","Marley_Corcoran",2 3 | "from_user: ","TheTruth4Kody",2 4 | "from_user: ","Tuesdaywebcam",2 5 | "from_user: ","angie_angers",2 6 | "from_user: ","13NickiMtz",1 7 | "from_user: ","21ang_",1 8 | "from_user: ","ABC",1 9 | "from_user: ","AG_ixanna",1 10 | "from_user: ","AMBROSIATOYA_",1 11 | "from_user: ","Aakosir",1 12 | "from_user: ","Abaddonbaphome5",1 13 | "from_user: ","AdoreYve",1 14 | "from_user: ","AeornFlippout",1 15 | "from_user: ","AlyshaxCrowder",1 16 | "from_user: ","AmericaStyles8",1 17 | "from_user: ","Aneek87",1 18 | "from_user: ","Aunt_Bike",1 19 | "from_user: ","AyeeKay40seven",1 20 | "from_user: ","BAG_Malta",1 21 | "from_user: ","BFDPIO",1 22 | "from_user: ","BadBadBadDonald",1 23 | "from_user: ","BekkahBartl",1 24 | "from_user: ","Bethieexox",1 25 | "from_user: ","Beyuktiful",1 26 | "from_user: ","BlasianKidKam",1 27 | "from_user: ","CBSMiami",1 28 | "from_user: ","CHUKZY010",1 29 | "from_user: ","CaliGolfer_",1 30 | "from_user: ","CanIRunAwayNow",1 31 | "from_user: ","CarrollGooglem",1 32 | "from_user: ","CarrolltonTXPD",1 33 | "from_user: ","CathrynYong",1 34 | "from_user: ","CeballosMitch",1 35 | "from_user: ","ChasityBPierce",1 36 | "from_user: ","ChrisRicks40",1 37 | "from_user: ","CitlaliSierra1",1 38 | "from_user: ","Colin10971",1 39 | "from_user: ","CountyGibson",1 40 | "from_user: ","CroydonCyclists",1 41 | "from_user: ","Cyclelaw1",1 42 | "from_user: ","Cynnsinn_",1 43 | "from_user: ","DLoffman",1 44 | "from_user: ","DairyWifeHailey",1 45 | "from_user: ","DanClem88",1 46 | "from_user: ","DanskiAtSix",1 47 | "from_user: ","DesHackworth",1 48 | "from_user: ","DutchReach",1 49 | "from_user: ","DviusMusic",1 50 | "from_user: ","EmptyInkBuyer",1 51 | "from_user: ","ErinLeberak",1 52 | "from_user: ","Fitzy_35",1 53 | "from_user: ","From96Till",1 54 | "from_user: ","FuxkSallieMae",1 55 | "from_user: ","Goldeneyes003",1 56 | "from_user: ","GregTBasta",1 57 | "from_user: ","HannGilll",1 58 | "from_user: ","Hannah_monnier",1 59 | "from_user: ","HenriAguero",1 60 | "from_user: ","Hottsy_Tottsy",1 61 | "from_user: ","IDKCoach3",1 62 | "from_user: ","InkMySwagg",1 63 | "from_user: ","JamieStoneXXX",1 64 | "from_user: ","Jaybarragan_",1 65 | "from_user: ","JaysonStanley_",1 66 | "from_user: ","JazzieePooh",1 67 | "from_user: ","JenkinsNemo",1 68 | 
"from_user: ","Jessicaa_lynn22",1 69 | "from_user: ","JohnDunipace",1 70 | "from_user: ","JspringH2O",1 71 | "from_user: ","JustineNichole_",1 72 | "from_user: ","KENNIADK",1 73 | "from_user: ","Kaleb56892",1 74 | "from_user: ","KalistaDemyan",1 75 | "from_user: ","Kungfu_Kimmy",1 76 | "from_user: ","LAPDHQ",1 77 | "from_user: ","LAPDonlinenews",1 78 | "from_user: ","LATACO",1 79 | "from_user: ","LaaDaise",1 80 | "from_user: ","LambofAdenbtw",1 81 | "from_user: ","Laynee_mitchell",1 82 | "from_user: ","LegendSid",1 83 | "from_user: ","LiZnoel_",1 84 | "from_user: ","LinzDeFranco",1 85 | "from_user: ","LissLawFirm",1 86 | "from_user: ","LittleDreamer78",1 87 | "from_user: ","Lyrical_92",1 88 | "from_user: ","M4CDADY",1 89 | "from_user: ","MCIns",1 90 | "from_user: ","Maha_h23",1 91 | "from_user: ","MaraxRose",1 92 | "from_user: ","MarinoBambinos",1 93 | "from_user: ","MeganCawe",1 94 | "from_user: ","MeggieLite",1 95 | "from_user: ","MouldyTrain486",1 96 | "from_user: ","MullaBaby3",1 97 | "from_user: ","NaRiNe_94",1 98 | "from_user: ","NanaEsabelle",1 99 | "from_user: ","NaomiPariona",1 100 | "from_user: ","Naturally_TSC",1 101 | "from_user: ","NessaValenciano",1 102 | "from_user: ","NewsRadio930",1 103 | "from_user: ","NicoleDuCane",1 104 | "from_user: ","PLRGChicago",1 105 | "from_user: ","PamDinwiddie",1 106 | "from_user: ","ParaInTheWorld",1 107 | "from_user: ","PsirenSonzai",1 108 | "from_user: ","PvnchoVilla",1 109 | "from_user: ","RGurewitz",1 110 | "from_user: ","RaccoonBernard",1 111 | "from_user: ","Ralphdoesntlike",1 112 | "from_user: ","Rayswag_",1 113 | "from_user: ","RepulsiveRuby",1 114 | "from_user: ","RichmondPolice",1 115 | "from_user: ","RyanRiess1",1 116 | "from_user: ","SCDanOConnell",1 117 | "from_user: ","SOPHIAannJ",1 118 | "from_user: ","SaMMcKaay18",1 119 | "from_user: ","Sadie_Marie_",1 120 | "from_user: ","SanctionedArse",1 121 | "from_user: ","SarahTheScammer",1 122 | "from_user: ","SaveTaxi",1 123 | "from_user: ","SaxxonFox",1 124 | "from_user: ","Scottygoeshardy",1 125 | "from_user: ","ShelbeaMartinez",1 126 | "from_user: ","Sonoma_West",1 127 | "from_user: ","SophiePelham",1 128 | "from_user: ","Soul_Rebel420",1 129 | "from_user: ","SouthD_Boosh",1 130 | "from_user: ","Sparkes",1 131 | "from_user: ","Squally_Canada",1 132 | "from_user: ","SteveNegusMasr",1 133 | "from_user: ","StudentsFor3903",1 134 | "from_user: ","Sukipower",1 135 | "from_user: ","SydniSimone",1 136 | "from_user: ","Symone_LOL",1 137 | "from_user: ","TDuplichan",1 138 | "from_user: ","Tamiee143",1 139 | "from_user: ","Tarantosaurus",1 140 | "from_user: ","TeenSafeCom",1 141 | "from_user: ","TeriBarrMedia",1 142 | "from_user: ","TexasTribAbby",1 143 | "from_user: ","ThatErica",1 144 | "from_user: ","ThattKidRobert",1 145 | "from_user: ","TheLovelyHolly1",1 146 | "from_user: ","TheOmnisis",1 147 | "from_user: ","TheOtherFaller",1 148 | "from_user: ","TheaGilB",1 149 | "from_user: ","TialaTheCreator",1 150 | "from_user: ","Tikoush",1 151 | "from_user: ","TooAdorkable24",1 152 | "from_user: ","ToryPivoda",1 153 | "from_user: ","ToungeLee",1 154 | "from_user: ","TrumpBabe69",1 155 | "from_user: ","UmTAW",1 156 | "from_user: ","Useful_Chris",1 157 | "from_user: ","VTVakarian87",1 158 | "from_user: ","Veterans164",1 159 | "from_user: ","VisionZeroCA",1 160 | "from_user: ","WAHMCat",1 161 | "from_user: ","WTbatten",1 162 | "from_user: ","WarwickshireDad",1 163 | "from_user: ","WeekendInvestng",1 164 | "from_user: ","Wildohh",1 165 | "from_user: ","WilgaBeast30",1 166 | "from_user: 
","WomAgainstUber",1 167 | "from_user: ","Young_Old_Nigga",1 168 | "from_user: ","ZeeFatimaa",1 169 | "from_user: ","Zenithwillrule",1 170 | "from_user: ","_BananaPriest_",1 171 | "from_user: ","_CesarTheGreat",1 172 | "from_user: ","_Donb411",1 173 | "from_user: ","_KayleePaul",1 174 | "from_user: ","_Nishaaaaaa",1 175 | "from_user: ","_TBrown1X",1 176 | "from_user: ","___andreeaaa",1 177 | "from_user: ","__jadebrown",1 178 | "from_user: ","_allisonplaton",1 179 | "from_user: ","_bayleigh",1 180 | "from_user: ","_keenacolada",1 181 | "from_user: ","_linda27_",1 182 | "from_user: ","_the1975__",1 183 | "from_user: ","abbeyrifix",1 184 | "from_user: ","abbyrailsback",1 185 | "from_user: ","adrianaafariass",1 186 | "from_user: ","akaBaldGuy",1 187 | "from_user: ","aleena_aodisho7",1 188 | "from_user: ","alexgolkar2",1 189 | "from_user: ","aliatanasio",1 190 | "from_user: ","amoneill_xo",1 191 | "from_user: ","angieflyte99",1 192 | "from_user: ","annxpalacios",1 193 | "from_user: ","awkwardly_ashD",1 194 | "from_user: ","brendalyonsart",1 195 | "from_user: ","brendan_colford",1 196 | "from_user: ","briemagro",1 197 | "from_user: ","brooklyndowns99",1 198 | "from_user: ","burner92503",1 199 | "from_user: ","cabrideA2",1 200 | "from_user: ","callmejrich",1 201 | "from_user: ","cassidyrose144",1 202 | "from_user: ","castroalec",1 203 | "from_user: ","chawkins450",1 204 | "from_user: ","chaymojay",1 205 | "from_user: ","chelsey_rowden",1 206 | "from_user: ","chloeestilliee",1 207 | "from_user: ","chuchi_patootie",1 208 | "from_user: ","clarewrites",1 209 | "from_user: ","courtneyquatt",1 210 | "from_user: ","crybabykris_",1 211 | "from_user: ","crystalp103",1 212 | "from_user: ","curtmagurt_808",1 213 | "from_user: ","dannywarren14",1 214 | "from_user: ","darcynews",1 215 | "from_user: ","darwyss",1 216 | "from_user: ","daztheleo",1 217 | "from_user: ","deanyb71",1 218 | "from_user: ","demigodofasgard",1 219 | "from_user: ","devdollasign",1 220 | "from_user: ","dhepburn",1 221 | "from_user: ","die_cortes",1 222 | "from_user: ","divinegoten",1 223 | "from_user: ","domunique33",1 224 | "from_user: ","dopestanna",1 225 | "from_user: ","earth_to_kels",1 226 | "from_user: ","eccunionpatrick",1 227 | "from_user: ","emmmcast",1 228 | "from_user: ","ericasmaldone",1 229 | "from_user: ","faygolordx2k18",1 230 | "from_user: ","fcfortune",1 231 | "from_user: ","ffs_dianaaa",1 232 | "from_user: ","fivefootk__",1 233 | "from_user: ","frankhigiro",1 234 | "from_user: ","garrity_d",1 235 | "from_user: ","gatekeeper400",1 236 | "from_user: ","gjukatmedotcom",1 237 | "from_user: ","grillindavid",1 238 | "from_user: ","haleyyyann_",1 239 | "from_user: ","hannahsworld__",1 240 | "from_user: ","harrys_smile_94",1 241 | "from_user: ","heathpie",1 242 | "from_user: ","hector_mbatha",1 243 | "from_user: ","heiressnessa",1 244 | "from_user: ","helping_meowt",1 245 | "from_user: ","hime_765",1 246 | "from_user: ","iHateJamal",1 247 | "from_user: ","ifyouuseekamy_",1 248 | "from_user: ","injurylawyerny",1 249 | "from_user: ","its__stephaanie",1 250 | "from_user: ","itselishaaa",1 251 | "from_user: ","ivanfm_",1 252 | "from_user: ","jackywaffle",1 253 | "from_user: ","jasminOrjas",1 254 | "from_user: ","jasonteen",1 255 | "from_user: ","jbouie",1 256 | "from_user: ","jensbeautylife",1 257 | "from_user: ","jentin9",1 258 | "from_user: ","jessicarose_k",1 259 | "from_user: ","jlippsy",1 260 | "from_user: ","justin_kopacz",1 261 | "from_user: ","karinlizette",1 262 | "from_user: ","karlibra",1 263 | "from_user: 
","katiesnow_xox",1 264 | "from_user: ","keishafoxxx",1 265 | "from_user: ","keyohmee",1 266 | "from_user: ","kian_duhh",1 267 | "from_user: ","kiiimbra",1 268 | "from_user: ","kiniperry",1 269 | "from_user: ","kirbsterr__",1 270 | "from_user: ","kreativekat17",1 271 | "from_user: ","kwisarts",1 272 | "from_user: ","kwlem",1 273 | "from_user: ","kylegtrawr",1 274 | "from_user: ","lana__jk",1 275 | "from_user: ","levixkelly",1 276 | "from_user: ","lexidawnbean",1 277 | "from_user: ","lexmoreno15",1 278 | "from_user: ","lilwisehunnie",1 279 | "from_user: ","lindsguist",1 280 | "from_user: ","lizsavage",1 281 | "from_user: ","llizzy_yo",1 282 | "from_user: ","lmhealthc",1 283 | "from_user: ","lollhailz",1 284 | "from_user: ","lombardi_joanne",1 285 | "from_user: ","lporthouse",1 286 | "from_user: ","lyoshki",1 287 | "from_user: ","madelyn_streit",1 288 | "from_user: ","makkv3",1 289 | "from_user: ","marijuanadotorg",1 290 | "from_user: ","markgillam",1 291 | "from_user: ","maryferxsalas",1 292 | "from_user: ","megbooth67",1 293 | "from_user: ","missjermee",1 294 | "from_user: ","mona562lbc",1 295 | "from_user: ","moneymayaa_",1 296 | "from_user: ","moviehawk",1 297 | "from_user: ","mrs_wee",1 298 | "from_user: ","ms_marley_mar",1 299 | "from_user: ","msheathermagick",1 300 | "from_user: ","naethaniel987",1 301 | "from_user: ","nayaafernandez",1 302 | "from_user: ","nayan3216",1 303 | "from_user: ","nearsidejohn",1 304 | "from_user: ","neezydahl",1 305 | "from_user: ","nicole_khal",1 306 | "from_user: ","ninalove_me",1 307 | "from_user: ","nypost",1 308 | "from_user: ","oblivion1_",1 309 | "from_user: ","ohdamnitsmica_",1 310 | "from_user: ","paulbami",1 311 | "from_user: ","paultmadden",1 312 | "from_user: ","peaceOfSkind",1 313 | "from_user: ","phoooooop",1 314 | "from_user: ","plastic_fl0wers",1 315 | "from_user: ","quazo",1 316 | "from_user: ","rach_wills14",1 317 | "from_user: ","rachbarnhart",1 318 | "from_user: ","readbyrae",1 319 | "from_user: ","redt0mat0",1 320 | "from_user: ","rocksolidbadass",1 321 | "from_user: ","ronanodowd",1 322 | "from_user: ","roxxxy42084",1 323 | "from_user: ","rrosaliawarren",1 324 | "from_user: ","sIeepyri",1 325 | "from_user: ","sallll_xo",1 326 | "from_user: ","sehmaffa",1 327 | "from_user: ","shainakelll",1 328 | "from_user: ","shaneditullio03",1 329 | "from_user: ","sixxisgod74",1 330 | "from_user: ","soulfuhresh",1 331 | "from_user: ","spiraIarchitect",1 332 | "from_user: ","stedrea",1 333 | "from_user: ","stopBigDdrivers",1 334 | "from_user: ","sweet_den_sour",1 335 | "from_user: ","tabithaaaflynnn",1 336 | "from_user: ","taoofadam",1 337 | "from_user: ","taylor_devore",1 338 | "from_user: ","terilelikehke",1 339 | "from_user: ","thatkidsary",1 340 | "from_user: ","thechildlessmum",1 341 | "from_user: ","thecupcakegirl8",1 342 | "from_user: ","thelxrdgabe",1 343 | "from_user: ","thetruenet",1 344 | "from_user: ","tiffanyliberto",1 345 | "from_user: ","tinatiger00",1 346 | "from_user: ","toiletlyd",1 347 | "from_user: ","torrtorr24",1 348 | "from_user: ","tracirclayton",1 349 | "from_user: ","trippynessa",1 350 | "from_user: ","tristebeth",1 351 | "from_user: ","v3ktorious",1 352 | "from_user: ","vtothepowerof2",1 353 | "from_user: ","weeklystandard",1 354 | "from_user: ","wsvn",1 355 | "from_user: ","wwozzydo",1 356 | "from_user: ","xRajTheOne",1 357 | "from_user: ","xoMarleexox",1 358 | "from_user: ","ydanis",1 359 | "from_user: ","zee_bliss",1 360 | -------------------------------------------------------------------------------- 
/run-csvs-score.bat: -------------------------------------------------------------------------------- 1 | echo off 2 | 3 | IF %1.==. GOTO Error1 4 | IF %2.==. GOTO Error2 5 | 6 | SET DataPath=%1 7 | SET InputFilenameNoExt=%2 8 | 9 | SET TwitterCredentials=".\twitter_auth.json" 10 | SET NNModel=".\resources\model-is_good_or_bad_nnet.dat" 11 | SET NGramModel=".\resources\model-is_good_or_bad_user_desc_ngram_class.dat" 12 | SET FullInputPath="%DataPath%\%InputFilenameNoExt%.csv" 13 | SET ExtractOutputPath="%DataPath%\%InputFilenameNoExt%_userdat.csv" 14 | SET FinalOutputPath="%DataPath%\%InputFilenameNoExt%_scored.csv" 15 | 16 | python extract_users_from_csvs.py -i %FullInputPath% -o %ExtractOutputPath% -c %TwitterCredentials% 17 | python run_nnet.py -i %ExtractOutputPath% -o %FinalOutputPath% -n %NNModel% -m %NGramModel% 18 | 19 | GOTO EndScript 20 | 21 | :Error1 22 | ECHO Missing parameter 1 for path_to_data 23 | GOTO EndError 24 | 25 | :Error2 26 | ECHO Missing parameter 2 for data_filename_without_extension 27 | GOTO EndError 28 | 29 | :EndError 30 | ECHO ------ 31 | ECHO Command:: run-csvs-score path_to_data data_filename_without_extension 32 | ECHO ------ 33 | 34 | :EndScript 35 | -------------------------------------------------------------------------------- /run-csvs-score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # import nltk 4 | # nltk.download('punkt') 5 | 6 | DataPath=$1 7 | InputFilenameNoExt=$2 8 | 9 | display_usage() 10 | { 11 | echo "Usage: run-csvs-score.sh {data_path} {base_filename}" 12 | } 13 | 14 | if [ "$DataPath" == "" ]; then 15 | echo "Missing data path" 16 | $(display_usage) 17 | exit 1 18 | fi 19 | 20 | if [ "$InputFilenameNoExt" == "" ]; then 21 | echo "Missing base filename" 22 | $(display_usage) 23 | exit 1 24 | fi 25 | 26 | TwitterCredentials="./twitter_auth.json" 27 | NNModel="./resources/model-is_good_or_bad_nnet.dat" 28 | NGramModel="./resources/model-is_good_or_bad_user_desc_ngram_class.dat" 29 | FullInputPath="$DataPath/$InputFilenameNoExt.csv" 30 | ExtractOutputPath="$DataPath/$InputFilenameNoExt.userdat.csv" 31 | FinalOutputPath="$DataPath/$InputFilenameNoExt.scored.csv" 32 | 33 | python extract_users_from_csvs.py -i $FullInputPath -o $ExtractOutputPath -c $TwitterCredentials 34 | python run_nnet.py -i $ExtractOutputPath -o $FinalOutputPath -n $NNModel -m $NGramModel 35 | -------------------------------------------------------------------------------- /run-score-dtarchive.bat: -------------------------------------------------------------------------------- 1 | echo off 2 | 3 | IF %1.==. GOTO Error1 4 | IF %2.==. GOTO Error2 5 | IF %3.==. 
GOTO Error3 6 | 7 | SET DataPath=%1 8 | SET InputFilenameNoExt=%2 9 | SET ArchiveId=%3 10 | 11 | SET TwitterCredentials=".\twitter_auth.json" 12 | SET DTCredentials=".\dt_credentials.json" 13 | SET NNModel=".\resources\model-is_good_or_bad_nnet.dat" 14 | SET NGramModel=".\resources\model-is_good_or_bad_user_desc_ngram_class.dat" 15 | SET FullInputPath="%DataPath%\%InputFilenameNoExt%.csv" 16 | SET ExtractOutputPath="%DataPath%\%InputFilenameNoExt%_userdat.csv" 17 | SET FinalOutputPath="%DataPath%\%InputFilenameNoExt%_scored.csv" 18 | 19 | python extract_users_from_dt.py -i %DTCredentials% -a %ArchiveId% -o %FullInputPath% 20 | python extract_users_from_csvs.py -i %FullInputPath% -o %ExtractOutputPath% -c %TwitterCredentials% 21 | python run_nnet.py -i %ExtractOutputPath% -o %FinalOutputPath% -n %NNModel% -m %NGramModel% 22 | 23 | GOTO EndScript 24 | 25 | :Error3 26 | ECHO Missing parameter 3 for archive_id 27 | GOTO EndError 28 | 29 | :Error2 30 | ECHO Missing parameter 2 for data_filename_without_extension 31 | GOTO EndError 32 | 33 | :Error1 34 | ECHO Missing parameter 1 for data_path 35 | GOTO EndError 36 | 37 | :EndError 38 | ECHO ------ 39 | ECHO Command:: run-score-dtarchive data_path data_filename_without_extension archive_id 40 | ECHO ------ 41 | 42 | :EndScript 43 | -------------------------------------------------------------------------------- /run-score-dtarchive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # import nltk 4 | # nltk.download('punkt') 5 | 6 | DataPath=$1 7 | InputFilenameNoExt=$2 8 | ArchiveId=$3 9 | 10 | display_usage() 11 | { 12 | echo "Usage: run-score-dtarchive.sh {data_path} {base_filename} {archive_id} " 13 | } 14 | 15 | if [ "$DataPath" == "" ]; then 16 | echo "Missing data path" 17 | $(display_usage) 18 | exit 1 19 | fi 20 | 21 | if [ "$InputFilenameNoExt" == "" ]; then 22 | echo "Missing base filename" 23 | $(display_usage) 24 | exit 1 25 | fi 26 | 27 | if [ "$ArchiveId" == "" ]; then 28 | echo "Missing archive ID" 29 | $(display_usage) 30 | exit 1 31 | fi 32 | 33 | DTCredentials="./dt_credentials.json" 34 | TwitterCredentials="./twitter_auth.json" 35 | NNModel="./resources/model-is_good_or_bad_nnet.dat" 36 | NGramModel="./resources/model-is_good_or_bad_user_desc_ngram_class.dat" 37 | FullInputPath="$DataPath/$InputFilenameNoExt.csv" 38 | ExtractOutputPath="$DataPath/$InputFilenameNoExt.userdat.csv" 39 | FinalOutputPath="$DataPath/$InputFilenameNoExt.scored.csv" 40 | 41 | python extract_users_from_dt.py -i $DTCredentials -a $ArchiveId -o $FullInputPath 42 | python extract_users_from_csvs.py -i $FullInputPath -o $ExtractOutputPath -c $TwitterCredentials 43 | python run_nnet.py -i $ExtractOutputPath -o $FinalOutputPath -n $NNModel -m $NGramModel 44 | -------------------------------------------------------------------------------- /run_nnet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy 3 | import pandas as pd 4 | from keras import backend as K 5 | from keras.models import Sequential 6 | from keras.layers import Dense 7 | from keras.models import model_from_json 8 | from ngram_classifier import NGramClassifier 9 | from sklearn.metrics import precision_recall_fscore_support 10 | 11 | THRESHOLD = 0.80 12 | 13 | CLASS_WEIGHTS = [ 14 | ("num_days", 0.997821848), 15 | ("statuses_per_day", 1.065570851), 16 | ("followers_per_day", 1.021055002), 17 | ("following_per_day", 1.122703153), 18 | ("desc_len_terms", 1.171072307), 19 | 
("num_list_items", 1.017727903), 20 | ("num_hashtags", 0.889418197), 21 | ("url_count", 1.018365516) 22 | ] 23 | 24 | 25 | def get_input_vector(row, classifier): 26 | ''' 27 | (classifier): p_good 28 | (classifier): p_bot 29 | num_days 30 | statuses_per_day 31 | followers_per_day 32 | following_per_day 33 | desc_len_terms 34 | num_list_items 35 | num_hashtags 36 | url_count 37 | ''' 38 | class_probs = classifier.classify_text( 39 | str(row["user_profile_description"])) 40 | ret = [class_probs["good"], class_probs["bot"]] 41 | for label, weight in CLASS_WEIGHTS: 42 | try: 43 | ret.append(float(row[label]) * weight) 44 | except: 45 | ret.append(0.0) 46 | return ret 47 | 48 | 49 | def get_training_output(row): 50 | class_label = str(row["class_value"]) 51 | return 0.0 if class_label == "good" else 1.0 52 | 53 | 54 | def recall_m(y_true, y_pred): 55 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 56 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 57 | recall = true_positives / (possible_positives + K.epsilon()) 58 | return recall 59 | 60 | 61 | def precision_m(y_true, y_pred): 62 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 63 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 64 | precision = true_positives / (predicted_positives + K.epsilon()) 65 | return precision 66 | 67 | 68 | def f1_m(y_true, y_pred): 69 | precision = precision_m(y_true, y_pred) 70 | recall = recall_m(y_true, y_pred) 71 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 72 | 73 | 74 | def is_bot_value(is_verified, value): 75 | if is_verified: 76 | return "good" 77 | return "bot" if value > THRESHOLD else "good" 78 | 79 | 80 | if __name__ == "__main__": 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument("-i", "--input", help="test input csv file") 83 | parser.add_argument("-m", "--model", help="ngram model file") 84 | parser.add_argument("-n", "--nnetmodel", help="NNet model file") 85 | parser.add_argument("-o", "--output", help="output csv file") 86 | args = parser.parse_args() 87 | 88 | if not args.input: 89 | raise "missing input file" 90 | if not args.model: 91 | raise "missing ngram model file" 92 | if not args.nnetmodel: 93 | raise "missing nnet model file" 94 | if not args.output: 95 | raise "missing output file" 96 | 97 | classifier = NGramClassifier(model_path=args.model) 98 | 99 | with open(args.nnetmodel, 'r') as json_file: 100 | loaded_model_json = json_file.read() 101 | nnet = model_from_json(loaded_model_json) 102 | nnet.load_weights(f'{args.nnetmodel}.h5') 103 | nnet.compile(loss='binary_crossentropy', optimizer='adam', 104 | metrics=['acc', f1_m, precision_m, recall_m]) 105 | 106 | df_test = pd.read_csv(args.input, keep_default_na=False) 107 | 108 | df_test = df_test.drop(df_test[df_test.verified == True].index) 109 | indexes = [] 110 | targets_x = [] 111 | predictions = [] 112 | for index, row in df_test.iterrows(): 113 | try: 114 | input_vector = get_input_vector(row, classifier) 115 | except: 116 | print(f'(error parsing row {index}... 
skipping...)') 117 | continue 118 | indexes.append(index) 119 | targets_x.append(input_vector) 120 | predictions = nnet.predict(numpy.array(targets_x)) 121 | df_test["is_bot_belief"] = predictions 122 | df_test["is_bot"] = df_test.apply(lambda row: is_bot_value( 123 | row["verified"], row["is_bot_belief"]), axis=1) 124 | 125 | df_test.to_csv(args.output) 126 | -------------------------------------------------------------------------------- /split_training_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | 4 | full_data_path = "./resources/training.csv" 5 | class_label_field = "class_value" 6 | user_desc_field = "user_profile_description" 7 | 8 | output_paths = { 9 | "training": "./resources/test_trainingdata", 10 | "test": "./resources/test_testdata" 11 | } 12 | 13 | percent_test = 0.25 14 | 15 | 16 | def combine_files(file_list, output_file): 17 | frames = [] 18 | for this_file in file_list: 19 | df = pd.read_csv(this_file, keep_default_na=False) 20 | frames.append(df) 21 | out_df = pd.concat(frames) 22 | out_df = out_df.sample(frac=1).reset_index(drop=True) 23 | out_df.to_csv(output_file) 24 | 25 | 26 | df = pd.read_csv(full_data_path, keep_default_na=False) 27 | print(df.shape) 28 | df = df[(df[user_desc_field].notnull()) & 29 | (df[user_desc_field].str.len() > 0)] 30 | print(df.shape) 31 | class_counts = df[class_label_field].value_counts() 32 | classes = list(class_counts.keys()) 33 | 34 | print(f'class_counts: {class_counts}') 35 | 36 | min_value = 9999999999 37 | for this_class in classes: 38 | min_value = min_value if class_counts[this_class] > min_value else class_counts[this_class] 39 | 40 | print(f'min value: {min_value}') 41 | 42 | training_files = [] 43 | test_files = [] 44 | 45 | for this_class in classes: 46 | class_count = class_counts[this_class] 47 | num_samples = int(class_count if class_count <= min_value else min_value) 48 | print(f'sampling: {this_class}, {num_samples} items') 49 | values_sample_filter = df.loc[df[class_label_field] == this_class] 50 | print(f'... filtered...') 51 | values_sample = values_sample_filter.sample(num_samples) 52 | print(f'... sampled...') 53 | train, test = train_test_split(values_sample, test_size=percent_test) 54 | print(f'... 
split.') 55 | train_out_path = f'{output_paths["training"]}_{this_class}.csv' 56 | test_out_path = f'{output_paths["test"]}_{this_class}.csv' 57 | train.to_csv(train_out_path) 58 | test.to_csv(test_out_path) 59 | training_files.append(train_out_path) 60 | test_files.append(test_out_path) 61 | 62 | combine_files(training_files, f'{output_paths["training"]}.csv') 63 | combine_files(test_files, f'{output_paths["test"]}.csv') 64 | -------------------------------------------------------------------------------- /test_ngram_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | from ngram_classifier import NGramClassifier 4 | from sklearn.metrics import precision_recall_fscore_support 5 | 6 | 7 | def get_prediction(p): 8 | return "bot" if p["bot"] > p["good"] else "good" 9 | 10 | 11 | def print_metrics(metrics): # precision_recall_fscore_support orders columns by sorted label: ("bot", "good") 12 | print(f'precision: (bot) {metrics[0][0]}, (good) {metrics[0][1]}') 13 | print(f'recall: (bot) {metrics[1][0]}, (good) {metrics[1][1]}') 14 | print(f'fscore: (bot) {metrics[2][0]}, (good) {metrics[2][1]}') 15 | print(f'counts: (bot) {metrics[3][0]}, (good) {metrics[3][1]}') 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("-i", "--input", help="input test csv file") 21 | parser.add_argument("-m", "--model", help="model file") 22 | args = parser.parse_args() 23 | 24 | if not args.input: 25 | raise ValueError("missing input file") 26 | if not args.model: 27 | raise ValueError("missing model file") 28 | 29 | classifier = NGramClassifier(model_path=args.model) 30 | 31 | df = pd.read_csv(args.input, keep_default_na=False) 32 | x_values = [] 33 | y_values = [] 34 | for index, row in df.iterrows(): 35 | this_text = str(row["user_profile_description"]) 36 | this_class = str(row["class_value"]) 37 | if this_text and len(this_text) > 0 and this_class and len(this_class) > 0: 38 | x_values.append(this_text) 39 | y_values.append(this_class) 40 | 41 | predicted = classifier.classify_text_list(x_values) 42 | 43 | targets = [] 44 | predictions = [] 45 | for i in range(0, len(x_values)): 46 | predictions.append(get_prediction(predicted[i])) 47 | targets.append(y_values[i]) 48 | 49 | metrics = precision_recall_fscore_support(targets, predictions) 50 | print_metrics(metrics) 51 | -------------------------------------------------------------------------------- /test_nnet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy 3 | import pandas as pd 4 | from keras import backend as K 5 | from keras.models import Sequential 6 | from keras.layers import Dense 7 | from keras.models import model_from_json 8 | from ngram_classifier import NGramClassifier 9 | from sklearn.metrics import precision_recall_fscore_support 10 | 11 | CLASS_WEIGHTS = [ 12 | ("num_days", 0.997821848), 13 | ("statuses_per_day", 1.065570851), 14 | ("followers_per_day", 1.021055002), 15 | ("following_per_day", 1.122703153), 16 | ("desc_len_terms", 1.171072307), 17 | ("num_list_items", 1.017727903), 18 | ("num_hashtags", 0.889418197), 19 | ("url_count", 1.018365516) 20 | ] 21 | 22 | 23 | def get_input_vector(row, classifier): 24 | ''' 25 | (classifier): p_good 26 | (classifier): p_bot 27 | num_days 28 | statuses_per_day 29 | followers_per_day 30 | following_per_day 31 | desc_len_terms 32 | num_list_items 33 | num_hashtags 34 | url_count 35 | ''' 36 | class_probs = classifier.classify_text( 37 | str(row["user_profile_description"])) 38 | ret = 
[class_probs["good"], class_probs["bot"]] 39 | for label, weight in CLASS_WEIGHTS: 40 | ret.append(float(row[label]) * weight) 41 | return ret 42 | 43 | 44 | def get_training_output(row): 45 | class_label = str(row["class_value"]) 46 | return 0.0 if class_label == "good" else 1.0 47 | 48 | 49 | def recall_m(y_true, y_pred): 50 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 51 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 52 | recall = true_positives / (possible_positives + K.epsilon()) 53 | return recall 54 | 55 | 56 | def precision_m(y_true, y_pred): 57 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 58 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 59 | precision = true_positives / (predicted_positives + K.epsilon()) 60 | return precision 61 | 62 | 63 | def f1_m(y_true, y_pred): 64 | precision = precision_m(y_true, y_pred) 65 | recall = recall_m(y_true, y_pred) 66 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument("-i", "--input", help="test input csv file") 72 | parser.add_argument("-m", "--model", help="ngram model file") 73 | parser.add_argument("-n", "--nnetmodel", help="NNet model file") 74 | args = parser.parse_args() 75 | 76 | if not args.input: 77 | raise "missing input file" 78 | if not args.model: 79 | raise "missing ngram model file" 80 | if not args.nnetmodel: 81 | raise "missing nnet model file" 82 | 83 | classifier = NGramClassifier(model_path=args.model) 84 | 85 | with open(args.nnetmodel, 'r') as json_file: 86 | loaded_model_json = json_file.read() 87 | nnet = model_from_json(loaded_model_json) 88 | nnet.load_weights(f'{args.nnetmodel}.h5') 89 | nnet.compile(loss='binary_crossentropy', optimizer='adam', 90 | metrics=['acc', f1_m, precision_m, recall_m]) 91 | 92 | df_test = pd.read_csv(args.input, keep_default_na=False) 93 | targets_x = [] 94 | targets_y = [] 95 | predictions = [] 96 | for index, row in df_test.iterrows(): 97 | input_vector = get_input_vector(row, classifier) 98 | targets_x.append(input_vector) 99 | targets_y.append(get_training_output(row)) 100 | loss, accuracy, f1_score, precision, recall = nnet.evaluate( 101 | numpy.array(targets_x), numpy.array(targets_y), verbose=0) 102 | 103 | print( 104 | f'loss: {loss}, acc: {accuracy}, prec: {precision}, recall: {recall}, f1: {f1_score}') 105 | -------------------------------------------------------------------------------- /train_ngram_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | from ngram_classifier import NGramClassifier 4 | 5 | CLASSES = ["bot", "good"] 6 | TEXT_COLUMN = "user_profile_description" 7 | CLASS_COLUMN = "class_value" 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("-i", "--input", help="input csv file") 12 | parser.add_argument("-o", "--output", help="output file") 13 | args = parser.parse_args() 14 | 15 | if not args.input: 16 | raise "missing input file" 17 | if not args.output: 18 | raise "missing output file" 19 | 20 | classifier = NGramClassifier(classes=CLASSES, min_len=5, max_len=9) 21 | 22 | df = pd.read_csv(args.input, keep_default_na=False) 23 | x_values = [] 24 | y_values = [] 25 | total_rows = len(df.index) 26 | for index, row in df.iterrows(): 27 | this_text = str(row[TEXT_COLUMN]) 28 | this_class = str(row[CLASS_COLUMN]) 29 | if this_text and 
len(this_text) > 0 and this_class and len(this_class) > 0: 30 | x_values.append(this_text) 31 | y_values.append(this_class) 32 | 33 | if index % 1000 == 0: 34 | print(f'trained {index} of {total_rows}') 35 | 36 | classifier.train_text(x_values, y_values) 37 | classifier.serialize(args.output, max_to_save=100000) 38 | -------------------------------------------------------------------------------- /train_nnet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy 3 | from keras import backend as K 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | import pandas as pd 7 | from ngram_classifier import NGramClassifier 8 | from sklearn.metrics import precision_recall_fscore_support 9 | from timeit import default_timer as timer 10 | 11 | CLASS_WEIGHTS = [ 12 | ("num_days", 0.997821848), 13 | ("statuses_per_day", 1.065570851), 14 | ("followers_per_day", 1.021055002), 15 | ("following_per_day", 1.122703153), 16 | ("desc_len_terms", 1.171072307), 17 | ("num_list_items", 1.017727903), 18 | ("num_hashtags", 0.889418197), 19 | ("url_count", 1.018365516) 20 | ] 21 | 22 | 23 | def get_input_vector(row, classifier): 24 | ''' 25 | (classifier): p_good 26 | (classifier): p_bot 27 | num_days 28 | statuses_per_day 29 | followers_per_day 30 | following_per_day 31 | desc_len_terms 32 | num_list_items 33 | num_hashtags 34 | url_count 35 | ''' 36 | class_probs = classifier.classify_text( 37 | str(row["user_profile_description"])) 38 | ret = [class_probs["good"], class_probs["bot"]] 39 | for label, weight in CLASS_WEIGHTS: 40 | ret.append(float(row[label]) * weight) 41 | return ret 42 | 43 | 44 | def get_training_output(row): 45 | class_label = str(row["class_value"]) 46 | return 0.0 if class_label == "good" else 1.0 47 | 48 | 49 | def print_metrics(which_round, metrics): 50 | print( 51 | f'round: {which_round} : p() {metrics[0][0]:.4f}, {metrics[0][1]:.4f} : r() {metrics[1][0]:.4f}, {metrics[1][1]:.4f} : f() {metrics[2][0]:.4f}, {metrics[2][1]:.4f}') 52 | 53 | 54 | def recall_m(y_true, y_pred): 55 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 56 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 57 | recall = true_positives / (possible_positives + K.epsilon()) 58 | return recall 59 | 60 | 61 | def precision_m(y_true, y_pred): 62 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 63 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 64 | precision = true_positives / (predicted_positives + K.epsilon()) 65 | return precision 66 | 67 | 68 | def f1_m(y_true, y_pred): 69 | precision = precision_m(y_true, y_pred) 70 | recall = recall_m(y_true, y_pred) 71 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("-i", "--input", help="input csv file") 77 | parser.add_argument("-m", "--model", help="ngram model file") 78 | parser.add_argument("-o", "--output", help="output file") 79 | parser.add_argument("-n", "--numrounds", help="number rounds to train") 80 | parser.add_argument("-t", "--testfile", help="testing_file") 81 | args = parser.parse_args() 82 | 83 | if not args.input: 84 | raise "missing input file" 85 | if not args.model: 86 | raise "missing model file" 87 | if not args.output: 88 | raise "missing output file" 89 | if not args.numrounds: 90 | raise "missing number of training rounds" 91 | if not args.testfile: 92 | raise "missing test file" 93 | 94 | 
classifier = NGramClassifier(model_path=args.model) 95 | 96 | num_training_rounds = int(args.numrounds) 97 | 98 | df = pd.read_csv(args.input, keep_default_na=False) 99 | df_test = pd.read_csv(args.testfile, keep_default_na=False) 100 | n_rows = len(df.index) 101 | 102 | nnet = Sequential() 103 | nnet.add(Dense(22, input_dim=10, activation='relu')) 104 | nnet.add(Dense(1, activation='sigmoid')) 105 | nnet.compile(loss='binary_crossentropy', optimizer='adam', 106 | metrics=['acc', f1_m, precision_m, recall_m]) 107 | 108 | print('----------------') 109 | 110 | # for i in range(0,num_training_rounds): 111 | start = timer() 112 | df_train = df.sample(frac=1.0).reset_index(drop=True) 113 | x_values = [] 114 | y_values = [] 115 | for index, row in df_train.iterrows(): 116 | input_vector = get_input_vector(row, classifier) 117 | output_val = get_training_output(row) 118 | x_values.append(input_vector) 119 | y_values.append(output_val) 120 | nnet.fit(numpy.array(x_values), numpy.array(y_values), 121 | epochs=num_training_rounds, batch_size=25) 122 | 123 | targets_x = [] 124 | targets_y = [] 125 | predictions = [] 126 | for index, row in df_test.iterrows(): 127 | input_vector = get_input_vector(row, classifier) 128 | targets_x.append(input_vector) 129 | targets_y.append(get_training_output(row)) 130 | loss, accuracy, f1_score, precision, recall = nnet.evaluate( 131 | numpy.array(targets_x), numpy.array(targets_y), verbose=0) 132 | 133 | end = timer() 134 | run_time = (end - start) 135 | 136 | print("model trained.") 137 | model_json = nnet.to_json() 138 | with open(args.output, "w") as json_file: 139 | json_file.write(model_json) 140 | # serialize weights to HDF5 141 | nnet.save_weights(f'{args.output}.h5') 142 | -------------------------------------------------------------------------------- /train_test_ngram_classifier.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd 3 | from ngram_classifier import NGramClassifier 4 | from sklearn.metrics import precision_recall_fscore_support 5 | from timeit import default_timer as timer 6 | 7 | CLASSES = ["bot", "good"] 8 | TEXT_COLUMN = "user_profile_description" 9 | CLASS_COLUMN = "class_value" 10 | 11 | training_data = "./resources/training.csv" 12 | testing_data = "./resources/test.csv" 13 | metric_output = "./resources/test_ngram_metrics.csv" 14 | 15 | 16 | def get_xy_for_data(df): 17 | x_values = [] 18 | y_values = [] 19 | for index, row in df.iterrows(): 20 | this_text = str(row[TEXT_COLUMN]) 21 | this_class = str(row[CLASS_COLUMN]) 22 | if this_text and len(this_text) > 0 and this_class and len(this_class) > 0: 23 | x_values.append(this_text) 24 | y_values.append(this_class) 25 | return x_values, y_values 26 | 27 | 28 | df_train = pd.read_csv(training_data, keep_default_na=False) 29 | train_x, train_y = get_xy_for_data(df_train) 30 | 31 | df_test = pd.read_csv(testing_data, keep_default_na=False) 32 | test_x, test_y = get_xy_for_data(df_test) 33 | 34 | 35 | def print_metrics(metrics): 36 | print( 37 | f'precision: ({CLASSES[0]}) {metrics[0][0]}, ({CLASSES[1]}) {metrics[0][1]}') 38 | print( 39 | f'recall: ({CLASSES[0]}) {metrics[1][0]}, ({CLASSES[1]}) {metrics[1][1]}') 40 | print( 41 | f'fscore: ({CLASSES[0]}) {metrics[2][0]}, ({CLASSES[1]}) {metrics[2][1]}') 42 | print( 43 | f'counts: ({CLASSES[0]}) {metrics[3][0]}, ({CLASSES[1]}) {metrics[3][1]}') 44 | 45 | 46 | def get_prediction(p): 47 | return CLASSES[0] if p[CLASSES[0]] > p[CLASSES[1]] else CLASSES[1] 48 | 49 | 50 | 
def run_class_and_test(ngram_min, ngram_max): 51 | classifier = NGramClassifier( 52 | classes=CLASSES, min_len=ngram_min, max_len=ngram_max) 53 | classifier.train_text(train_x, train_y) 54 | classifier.update_counts() 55 | 56 | predicted = classifier.classify_text_list(test_x) 57 | 58 | predictions = [] 59 | for i in range(0, len(test_x)): 60 | predictions.append(get_prediction(predicted[i])) 61 | 62 | return precision_recall_fscore_support(test_y, predictions) 63 | 64 | 65 | out_cols = ["range", f"p({CLASSES[0]})", f"p({CLASSES[1]})", f"r({CLASSES[0]})", 66 | f"r({CLASSES[1]})", f"f({CLASSES[0]})", f"f({CLASSES[1]})", "time"] 67 | 68 | with open(metric_output, 'w') as metric_out: 69 | csv_writer = csv.writer(metric_out, delimiter=',', 70 | quotechar='"', quoting=csv.QUOTE_MINIMAL) 71 | csv_writer.writerow(out_cols) 72 | for ngram_min in range(5, 10): 73 | for ngram_max in range(ngram_min, 10): 74 | print('----------------') 75 | print(f'testing ngrams({ngram_min},{ngram_max})') 76 | 77 | start = timer() 78 | metrics = run_class_and_test(ngram_min, ngram_max) 79 | end = timer() 80 | run_time = (end - start) 81 | print_metrics(metrics) 82 | 83 | metrics_row = [f'ngrams({ngram_min},{ngram_max})', 84 | metrics[0][0], metrics[0][1], 85 | metrics[1][0], metrics[1][1], 86 | metrics[2][0], metrics[2][1], 87 | run_time 88 | ] 89 | csv_writer.writerow(metrics_row) 90 | 91 | print('---- DONE ----') 92 | -------------------------------------------------------------------------------- /twitter_auth.json: -------------------------------------------------------------------------------- 1 | { 2 | "consumer_key": "", 3 | "consumer_secret": "", 4 | "access_token": "", 5 | "access_secret": "" 6 | } 7 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .app_utils import batch_list, get_twitter_auth 2 | from .text_utils import get_ngrams, count_ngrams, count_repeating_ngrams, clean_text, tokenize, get_list_item_count, get_hashtag_count, get_url_count, get_urls, get_hashtags 3 | -------------------------------------------------------------------------------- /utils/app_utils.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import json 3 | 4 | def batch_list(input_list, batch_size): 5 | def reducer(cumulator, item): 6 | if len(cumulator[-1]) < batch_size: 7 | cumulator[-1].append(item) 8 | return cumulator 9 | else: 10 | cumulator.append([item]) 11 | return cumulator 12 | return reduce(reducer, input_list, [[]]) 13 | 14 | 15 | def get_twitter_auth(auth_file): 16 | with open(auth_file) as auth_file_handle: 17 | credentials = json.load(auth_file_handle) 18 | return { 19 | "consumer_key": credentials["consumer_key"], 20 | "consumer_secret": credentials["consumer_secret"], 21 | "access_token": credentials["access_token"], 22 | "access_secret": credentials["access_secret"] 23 | } 24 | -------------------------------------------------------------------------------- /utils/text_utils.py: -------------------------------------------------------------------------------- 1 | import html 2 | import re 3 | import string 4 | import sys 5 | import unicodedata 6 | from itertools import islice, tee, groupby 7 | from nltk.tokenize import WordPunctTokenizer 8 | 9 | regex_clean_newlines = re.compile(r"[\r|\n|\r\n]") 10 | regex_strip_urls = re.compile(r"\[http[s]?://.*?\s(.*?)\]") 11 | regex_strip_punct = 
re.compile(r'[%s]' % re.escape(string.punctuation)) 12 | 13 | punct_unicode_tbl = dict.fromkeys(i for i in range(sys.maxunicode) 14 | if unicodedata.category(chr(i)).startswith('P')) 15 | 16 | 17 | def get_ngrams(str_, n_grams=2): 18 | ''' 19 | gets all the ngrams for a string 20 | ''' 21 | tuples = zip(*(islice(seq, index, None) 22 | for index, seq in enumerate(tee("".join(str_.split()), n_grams)))) 23 | return list("".join(tup) for tup in tuples) 24 | 25 | 26 | def count_ngrams(str_, n_grams=2): 27 | ''' 28 | returns a list of the counts of each ngram in the input 29 | sorted by greatest to least count 30 | ''' 31 | ngrams = get_ngrams(str_, n_grams) 32 | ret_list = [(key, len(list(group))) 33 | for key, group in groupby(sorted(ngrams))] 34 | ret_list = sorted(ret_list, key=lambda x: x[1]) 35 | ret_list.reverse() 36 | return ret_list 37 | 38 | 39 | def count_repeating_ngrams(str_, ngrams=(2, 4), n_repeats_min=2): 40 | ''' 41 | Sums the count of number of ngrams that repeat equal to or 42 | more than 'n_repeats_min' times 43 | Example: 44 | count_repeating_ngrams('aaaa', ngrams(2,2)) -> [3] 45 | count_repeating_ngrams('aaaa', ngrams(2,3)) -> [3,2] 46 | count_repeating_ngrams('abab', ngrams(2,2)) -> [2] 47 | ''' 48 | output = list() 49 | for ngram in range(ngrams[0], ngrams[1] + 1): 50 | ngrams = count_ngrams(str_, ngram) 51 | this_count = sum(this_ngram[1] if this_ngram[1] >= 52 | n_repeats_min else 0 for this_ngram in ngrams) 53 | output.append(this_count) 54 | return output 55 | 56 | 57 | def remove_punctuation(input_string): 58 | working = regex_strip_punct.sub(" ", input_string) 59 | return working.translate(punct_unicode_tbl) 60 | 61 | 62 | def clean_newlines(input_text): 63 | return regex_clean_newlines.sub(" ", input_text) 64 | 65 | 66 | def clean_text(str_, remove_urls=False): 67 | if not str_: 68 | return "" 69 | 70 | cleaned = clean_newlines(str(str_)) 71 | cleaned = html.unescape(cleaned) 72 | if remove_urls: 73 | cleaned = regex_strip_urls.sub(" ", cleaned) 74 | cleaned = remove_punctuation(cleaned) 75 | cleaned = re.sub(r"=", " ", cleaned) 76 | cleaned = re.sub(r"\s\s+", " ", cleaned) 77 | 78 | return cleaned.strip().lower() 79 | 80 | 81 | def tokenize(str_): 82 | return WordPunctTokenizer().tokenize(str_) 83 | 84 | 85 | def get_list_item_count(raw_text): 86 | splitted = re.split(r'[\.;\:]', raw_text) 87 | return len(splitted) 88 | 89 | 90 | def get_list_item_count(raw_text): 91 | splitted = re.split(r'[\.;\:]', raw_text) 92 | return len(splitted) 93 | 94 | 95 | def get_hashtags(input_text): 96 | return re.findall(r'#(\w+)', input_text) 97 | 98 | 99 | def get_hashtag_count(tokens): 100 | counter = 0 101 | prev_was_hash = False 102 | for this_token in tokens: 103 | if this_token == "#": 104 | prev_was_hash = True 105 | continue 106 | if this_token.isalpha() and prev_was_hash: 107 | counter += 1 108 | prev_was_hash = False 109 | return counter 110 | 111 | 112 | def get_urls(raw_text): 113 | return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', raw_text) 114 | 115 | 116 | def get_url_count(raw_text): 117 | return len(get_urls(raw_text)) 118 | --------------------------------------------------------------------------------