├── code ├── 1. preprocess_and_tokenize_tweets.ipynb ├── 2. encode_all_vectors.ipynb ├── 3. tweet_graph_analysis.ipynb ├── 4. analyze_clustering_results.ipynb ├── 5. label_nodes_and_clusters.ipynb ├── config │ ├── gexf_header.txt │ └── stopwords.json ├── helper_functions.py └── readme.md ├── example_output ├── tweet_graph_analysis_dems.txt ├── tweet_graph_analysis_output_ge2019.txt ├── tweet_graph_analysis_output_uktory_activists.txt └── tweet_graph_analysis_trump.txt ├── readme.md ├── readme.pdf └── readme └── media ├── image1.jpg ├── image10.png ├── image11.png ├── image13.png ├── image14.png ├── image15.png ├── image18.png ├── image19.png ├── image2.jpeg ├── image20.png ├── image21.png ├── image22.png ├── image23.png ├── image24.png ├── image25.png ├── image26.png ├── image27.png ├── image28.png ├── image29.png ├── image3.png ├── image30.png ├── image31.png ├── image32.png ├── image33.png ├── image36.png ├── image37.png ├── image38.png ├── image39.png ├── image4.jpeg ├── image40.png ├── image41.png ├── image42.png ├── image43.png ├── image44.png ├── image45.png ├── image46.png ├── image47.png ├── image48.png ├── image49.png ├── image5.png ├── image6.jpeg └── image9.png /code/1. preprocess_and_tokenize_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from helper_functions import *" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "scrolled": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "# Read from saved Twitter data, preprocess, and extract metadata\n", 21 | "# The data is created by twitter_gatherer.py\n", 22 | "# see https://github.com/r0zetta/twitter_gather for details\n", 23 | "texts = []\n", 24 | "sns = []\n", 25 | "ids = []\n", 26 | "count = 0\n", 27 | "print(\"Reading raw twitter objects...\")\n", 28 | "filename = \"data/raw.json\"\n", 29 | "with open(filename, \"r\") as f:\n", 30 | " for line in f:\n", 31 | " count += 1\n", 32 | " if count % 1000000 == 0:\n", 33 | " print(\"Seen: \" + str(count) + \" captured: \" + str(len(texts)))\n", 34 | " status = json.loads(line.strip())\n", 35 | " text = status[\"text\"]\n", 36 | " sn = status[\"user\"][\"screen_name\"]\n", 37 | " id_str = status[\"id_str\"]\n", 38 | " lang = status[\"lang\"]\n", 39 | " # Omit non-EN tweets\n", 40 | " if lang != \"en\":\n", 41 | " continue\n", 42 | " # Omit retweets\n", 43 | " if \"retweeted_status\" in status and status[\"retweeted_status\"] is not None:\n", 44 | " continue\n", 45 | " if len(text) > 20 and \"…\" not in text:\n", 46 | " text = text.lower()\n", 47 | " text = re.sub(r'(@\\S+)', '', text)\n", 48 | " text = re.sub(r'(http\\S+)', '', text)\n", 49 | " text = text.strip()\n", 50 | " texts.append(text)\n", 51 | " sns.append(sn)\n", 52 | " ids.append(id_str)\n", 53 | "print(\"Found \" + str(len(texts)) + \" unique texts.\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Tokenize extracted data and save all files for use later\n", 63 | "all_sns = []\n", 64 | "all_ids = []\n", 65 | "all_tokens = []\n", 66 | "all_tokens_sw = []\n", 67 | "all_texts = []\n", 68 | "seen = set()\n", 69 | "min_tokens = 5\n", 70 | "for i, text in enumerate(texts):\n", 71 | " if text in seen:\n", 72 | " continue\n", 73 | " seen.add(text)\n", 74 | " sn = sns[i]\n", 75 | " id_str = 
ids[i]\n", 76 | " if i % 100000 == 0:\n", 77 | " print(str(i) + \"/\" + str(len(texts)) + \"/\" + str(len(all_tokens)))\n", 78 | " clean_tokens, clean_tokens_with_sw = custom_tokenize(text)\n", 79 | " # Keep tweets containing a minimum number of tokens after\n", 80 | " # preprocessing and tokenization\n", 81 | " if len(clean_tokens_with_sw) >= min_tokens:\n", 82 | " all_sns.append(sn)\n", 83 | " all_ids.append(id_str)\n", 84 | " all_tokens.append(clean_tokens)\n", 85 | " all_tokens_sw.append(clean_tokens_with_sw)\n", 86 | " all_texts.append(text)\n", 87 | "print(len(all_texts))\n", 88 | "if not os.path.exists(\"preprocessed\"):\n", 89 | " os.makedirs(\"preprocessed\")\n", 90 | "save_json(all_sns, \"preprocessed/sns.json\")\n", 91 | "save_json(all_ids, \"preprocessed/ids.json\")\n", 92 | "save_json(all_tokens, \"preprocessed/tokens.json\")\n", 93 | "save_json(all_tokens_sw, \"preprocessed/tokens_sw.json\")\n", 94 | "save_json(all_texts, \"preprocessed/texts.json\")\n", 95 | "print(\"Done. Now execute encode_all_vectors.ipynb\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.7.3" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | -------------------------------------------------------------------------------- /code/2. 
encode_all_vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from helper_functions import *" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "scrolled": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import multiprocessing\n", 21 | "import gensim.models.word2vec as w2v\n", 22 | "from gensim import corpora, models, similarities\n", 23 | "\n", 24 | "def make_gram_vec(vec_slice):\n", 25 | " gv = []\n", 26 | " divisor = 0\n", 27 | " for i, v in enumerate(vec_slice):\n", 28 | " gv.append(v * (i+1))\n", 29 | " divisor += (i+1)\n", 30 | " gram_vec = np.sum(np.array(gv), axis=0)\n", 31 | " gram_vec = gram_vec/divisor\n", 32 | " return gram_vec\n", 33 | "\n", 34 | "def sentence_vector_ngram(vecs, num_grams):\n", 35 | " if num_grams == 1:\n", 36 | " return np.sum(np.array(vecs), axis=0)\n", 37 | " gram_vecs = []\n", 38 | " for i in range(0, len(vecs) - (num_grams-1)):\n", 39 | " gram_vec = make_gram_vec(vecs[i:i+num_grams])\n", 40 | " gram_vecs.append(gram_vec) \n", 41 | " final = np.sum(np.array(gram_vecs), axis=0)\n", 42 | " return final\n", 43 | "\n", 44 | "def calculate_sentence_vector(vecs):\n", 45 | " num_grams = 4\n", 46 | " return sentence_vector_ngram(vecs, num_grams)\n", 47 | "\n", 48 | "def get_sentence_vectors(tokens, model):\n", 49 | " vocab_set = set(list(model.wv.vocab.keys()))\n", 50 | " vecsindexed = []\n", 51 | " for clean_tokens in tokens:\n", 52 | " vecs = []\n", 53 | " for token in clean_tokens:\n", 54 | " if token in vocab_set:\n", 55 | " vecs.append(model.wv[token])\n", 56 | " final = calculate_sentence_vector(vecs)\n", 57 | " vecsindexed.append(final)\n", 58 | " return vecsindexed\n", 59 | "\n", 60 | "def make_word2vec_model(sentences):\n", 61 | " params = {'min_count': 3, 'window': 5, 'sample': 0.001, 'sg': 0, 'negative': 5, 'num_features': 768}\n", 62 | " print(\"w2v training data contained: \" + str(len(sentences)) + \" sentences.\")\n", 63 | " num_workers = multiprocessing.cpu_count()\n", 64 | " sentence_count = len(sentences)\n", 65 | "\n", 66 | " word2vec = w2v.Word2Vec(sg=params[\"sg\"],\n", 67 | " seed=1,\n", 68 | " workers=num_workers,\n", 69 | " size=params[\"num_features\"],\n", 70 | " min_count=params[\"min_count\"],\n", 71 | " window=params[\"window\"],\n", 72 | " sample=params[\"sample\"])\n", 73 | "\n", 74 | "\n", 75 | " word2vec.build_vocab(sentences)\n", 76 | " print(\"Training model with vocabulary length:\", len(word2vec.wv.vocab))\n", 77 | " epoch_count = 10\n", 78 | " word2vec.train(sentences, total_examples=sentence_count, epochs=epoch_count)\n", 79 | " return word2vec" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from gensim.models import Doc2Vec\n", 89 | "from gensim.models.doc2vec import TaggedDocument\n", 90 | "import multiprocessing\n", 91 | "\n", 92 | "def make_doc2vec_model(sentences):\n", 93 | " num_workers = multiprocessing.cpu_count()\n", 94 | " sentence_count = len(sentences)\n", 95 | "\n", 96 | " documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]\n", 97 | " alpha = 0.025\n", 98 | " doc2vec = Doc2Vec(documents,\n", 99 | " dm=1,\n", 100 | " alpha=alpha,\n", 101 | " min_alpha=0.00025, \n", 102 | " window=2, \n", 103 | " min_count=1,\n", 104 | " vector_size=768,\n", 105 | " 
workers=num_workers)\n", 106 | " print(\"Training model with vocabulary length:\", len(doc2vec.wv.vocab))\n", 107 | " \n", 108 | " epoch_count = 10\n", 109 | " print(\"Training doc2vec\")\n", 110 | " doc2vec.train(documents, total_examples=sentence_count, epochs=epoch_count)\n", 111 | " return doc2vec" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# Load texts and tokens created by preprocess_and_tokenize_tweets.ipynb\n", 121 | "prefix = \"\"\n", 122 | "print(\"Loading tokens\")\n", 123 | "tokens = load_json(\"preprocessed/\" + prefix + \"tokens.json\")\n", 124 | "print(len(tokens))\n", 125 | "print(\"Loading texts\")\n", 126 | "texts = load_json(\"preprocessed/\" + prefix + \"texts.json\")\n", 127 | "print(len(texts))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# Make w2v vectors\n", 137 | "print(\"Building w2v model\")\n", 138 | "w2v_model = make_word2vec_model(tokens)\n", 139 | "print(\"Built\")\n", 140 | "print(\"Converting texts to vectors\")\n", 141 | "w2v_vectors = get_sentence_vectors(tokens, w2v_model)\n", 142 | "print(len(w2v_vectors))\n", 143 | "w2v_text_vec = {}\n", 144 | "for i, t in enumerate(texts):\n", 145 | " w2v_text_vec[t] = w2v_vectors[i]\n", 146 | "print(\"Saving\")\n", 147 | "save_bin(w2v_text_vec, \"preprocessed/\" + prefix + \"word2vec_text_vec.pkl\")\n", 148 | "print(\"Done\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# Make d2v vectors\n", 158 | "print(\"Building d2v model\")\n", 159 | "d2v_model = make_doc2vec_model(tokens)\n", 160 | "print(\"Model built\")\n", 161 | "d2v_text_vec = {}\n", 162 | "for i, text in enumerate(texts):\n", 163 | " vec = d2v_model.docvecs[i]\n", 164 | " d2v_text_vec[text] = vec\n", 165 | "print(\"Saving\")\n", 166 | "save_bin(d2v_text_vec, \"preprocessed/\" + prefix + \"doc2vec_text_vec.pkl\")\n", 167 | "print(\"Done\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from sentence_transformers import SentenceTransformer\n", 177 | "print(\"Instantiating BERT model\")\n", 178 | "bert_model = SentenceTransformer('bert-base-nli-mean-tokens')\n", 179 | "\n", 180 | "# Since converting sentences to bert vectors is time-consuming\n", 181 | "# we load the existing vectors and obtain a list of those sentences\n", 182 | "# that haven't yet been converted, so that we don't have to convert\n", 183 | "# all every time we run this.\n", 184 | "# Converting 100,000 sentences takes about an hour\n", 185 | "old_bert_text_vec = {}\n", 186 | "print(\"Loading existing vectors\")\n", 187 | "if os.path.exists(\"preprocessed/bert_text_vec.pkl\"):\n", 188 | " old_bert_text_vec = load_bin(\"preprocessed/\" + prefix + \"bert_text_vec.pkl\")\n", 189 | " print(len(old_bert_text_vec))\n", 190 | "already_processed = set([x for x, v in old_bert_text_vec.items()])\n", 191 | "not_processed = list(set(texts).difference(already_processed))\n", 192 | "print(\"Not processed: \" + str(len(not_processed)))\n", 193 | "\n", 194 | "# This encodes the vectors. 
There is no output during the process\n", 195 | "# so just be patient\n", 196 | "bert_vectors = bert_model.encode(not_processed)\n", 197 | "print(\"Vectors encoded\")\n", 198 | "\n", 199 | "# Combine newly created bert vectors with those that were already saved\n", 200 | "for i, t in enumerate(not_processed):\n", 201 | " old_bert_text_vec[t] = bert_vectors[i]\n", 202 | "bert_text_vec = {}\n", 203 | "for t in texts:\n", 204 | " bert_text_vec[t] = old_bert_text_vec[t]\n", 205 | "\n", 206 | "# Save the new set\n", 207 | "print(\"Saving vectors\")\n", 208 | "save_bin(bert_text_vec, \"preprocessed/\" + prefix + \"bert_text_vec.pkl\")\n", 209 | "print(\"Done\")" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# Combine the three sentence vectors into meta embeddings and save for next step\n", 219 | "combined_vecs = []\n", 220 | "for i, text in enumerate(texts):\n", 221 | " if i % 100000 == 0:\n", 222 | " print(i)\n", 223 | " bert_vec = bert_text_vec[text]\n", 224 | " d2v_vec = d2v_text_vec[text]\n", 225 | " w2v_vec = w2v_text_vec[text]\n", 226 | " combined = np.sum([bert_vec, d2v_vec, w2v_vec], axis=0)\n", 227 | " combined_vecs.append(combined)\n", 228 | "print(\"Saving\")\n", 229 | "save_bin(combined_vecs, \"preprocessed/\" + prefix + \"combined_vecs.pkl\")\n", 230 | "print(\"Done. Now execute tweet_graph_analysis.ipynb\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 3", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.7.3" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /code/3. 
tweet_graph_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from helper_functions import *\n", 10 | "\n", 11 | "num_grams = 3" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Load saved data from previous step\n", 21 | "print(\"Loading combined vectors\")\n", 22 | "combined_vecs = load_bin(\"preprocessed/combined_vecs.pkl\")\n", 23 | "print(len(combined_vecs))\n", 24 | "print(\"Loading texts\")\n", 25 | "base_texts = load_json(\"preprocessed/texts.json\")\n", 26 | "print(len(base_texts))\n", 27 | "print(\"Loading sns\")\n", 28 | "screen_names = load_json(\"preprocessed/sns.json\")\n", 29 | "print(len(screen_names))\n", 30 | "print(\"Loading ids\")\n", 31 | "id_strs = load_json(\"preprocessed/ids.json\")\n", 32 | "print(len(id_strs))\n", 33 | "print(\"Done\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# This cell performs the clustering\n", 45 | "\n", 46 | "# Parameters used by the algorithm\n", 47 | "# See the blog post for details about these\n", 48 | "batch_size = 10000\n", 49 | "total_samples = 10000\n", 50 | "edge_ratio = 3\n", 51 | "min_cluster_size = 50\n", 52 | "merge_similarity = 0.98\n", 53 | "similarity_threshold = 0\n", 54 | "\n", 55 | "msg = \"TARGET:\" + str(total_samples) \n", 56 | "msg += \" BS:\" + str(batch_size)\n", 57 | "msg += \" ER:\" + str(edge_ratio)\n", 58 | "msg += \" STH:\" + str(similarity_threshold)\n", 59 | "msg += \" MSIM:\" + str(merge_similarity)\n", 60 | "msg += \" MCS:\" + str(min_cluster_size)\n", 61 | "print(msg)\n", 62 | "\n", 63 | "# State information is saved in these variables\n", 64 | "# and used in later cells to generate output\n", 65 | "centers = []\n", 66 | "center_labels = []\n", 67 | "center_words = []\n", 68 | "center_ngrams = []\n", 69 | "center_svo = []\n", 70 | "center_sentiment = []\n", 71 | "center_sns = []\n", 72 | "center_sizes = []\n", 73 | "center_urls = []\n", 74 | "center_tweets = []\n", 75 | "\n", 76 | "vec_label = []\n", 77 | "item_mod = {}\n", 78 | "\n", 79 | "ncindices = []\n", 80 | "final_mapping = []\n", 81 | "\n", 82 | "\n", 83 | "used = set()\n", 84 | "mod_num = 0\n", 85 | "passes = 0\n", 86 | "merges = 0\n", 87 | "total_processed = 0\n", 88 | "finished = False\n", 89 | "\n", 90 | "# If this is true, samples will be randomly selected from the whole set\n", 91 | "# If this is false, samples will be selected sequentially starting from current_index\n", 92 | "get_random = True\n", 93 | "\n", 94 | "# If this is true, the start index for sequential sampling will be randomly selected\n", 95 | "start_random = True\n", 96 | "current_index = 0\n", 97 | "if start_random == True:\n", 98 | " end_ind = max(0, (len(base_texts) - total_samples+1))\n", 99 | " if end_ind > 0:\n", 100 | " current_index = random.randint(0, end_ind)\n", 101 | "\n", 102 | "if get_random == True:\n", 103 | " print(\"Using random sampling.\")\n", 104 | "else:\n", 105 | " print(\"Using sequential sampling, starting at index: \" + str(current_index))\n", 106 | "\n", 107 | "while finished == False: \n", 108 | " if total_processed < total_samples:\n", 109 | " num_to_add = batch_size - len(ncindices)\n", 110 | " \n", 111 | " # If this got stuck and didn't 
create any new clusters, truncate the list and add new stuff\n", 112 | " if num_to_add == 0 and passes > 0:\n", 113 | " samples_left = len(base_texts) - current_index\n", 114 | " num_to_remove = min(samples_left, round(batch_size/10))\n", 115 | " ncindices = ncindices[num_to_remove:]\n", 116 | " num_to_add = num_to_remove\n", 117 | " \n", 118 | " # Add new samples to batch\n", 119 | " count = 0\n", 120 | " if get_random == True:\n", 121 | " while count < num_to_add:\n", 122 | " rindex = random.randint(0,len(base_texts)-1)\n", 123 | " if rindex not in used:\n", 124 | " used.add(rindex)\n", 125 | " ncindices.append(rindex)\n", 126 | " count += 1\n", 127 | " else:\n", 128 | " while count < num_to_add:\n", 129 | " ncindices.append(current_index)\n", 130 | " current_index += 1\n", 131 | " if current_index > len(base_texts):\n", 132 | " finished = True\n", 133 | " break\n", 134 | " count += 1\n", 135 | "\n", 136 | " ncvectors = [combined_vecs[i] for i in ncindices]\n", 137 | " \n", 138 | " clusters, mapping = make_text_clusters(ncvectors, \n", 139 | " edge_ratio=edge_ratio, \n", 140 | " threshold=similarity_threshold)\n", 141 | "\n", 142 | " # Build mapping for gephi visualization\n", 143 | " nodes_to_omit = set()\n", 144 | " for mod, idl in clusters.items():\n", 145 | " if len(idl) <= min_cluster_size:\n", 146 | " for node in idl:\n", 147 | " nodes_to_omit.add(node)\n", 148 | " for m in mapping:\n", 149 | " x, y, c = m\n", 150 | " if x not in nodes_to_omit and y not in nodes_to_omit:\n", 151 | " final_mapping.append([ncindices[x], ncindices[y], c])\n", 152 | " \n", 153 | " # Renumber clusters to include actual data indices\n", 154 | " clustered = set()\n", 155 | " rclusters = {}\n", 156 | " for mod, idl in clusters.items():\n", 157 | " rclusters[mod] = []\n", 158 | " for i in idl:\n", 159 | " orig_index = ncindices[i]\n", 160 | " rclusters[mod].append(orig_index)\n", 161 | " clustered.add(orig_index)\n", 162 | "\n", 163 | " # Create a list of unclustered samples\n", 164 | " not_clustered = set(ncindices).difference(clustered)\n", 165 | " new_ncindices = list(not_clustered)\n", 166 | " \n", 167 | " # Iterate through identified clusters\n", 168 | " for mod, idl in sorted(rclusters.items()):\n", 169 | " # Check if cluster matches min_cluster_size\n", 170 | " if len(idl) >= min_cluster_size:\n", 171 | " texts = [base_texts[index] for index in idl]\n", 172 | " sns = [screen_names[index] for index in idl]\n", 173 | " ids = [id_strs[index] for index in idl]\n", 174 | " vectors = [combined_vecs[index] for index in idl]\n", 175 | " center = get_cluster_center(vectors)\n", 176 | " \n", 177 | " # Check if this cluster has a similar center to any other already found\n", 178 | " cluster_mod = None\n", 179 | " new_cluster = False\n", 180 | " sc = 0\n", 181 | " if len(centers) > 1:\n", 182 | " scores = fast_cosine_matrix(np.array(center), np.array(centers))\n", 183 | " sc = np.max(scores)\n", 184 | "\n", 185 | " if sc >= merge_similarity:\n", 186 | " cluster_mod = np.argmax(scores)\n", 187 | " merges += 1\n", 188 | " else:\n", 189 | " cluster_mod = mod_num\n", 190 | " new_cluster = True\n", 191 | " mod_num += 1\n", 192 | "\n", 193 | " # This is used for gephi visualization\n", 194 | " for c, item in enumerate(idl):\n", 195 | " item_mod[item] = [cluster_mod]\n", 196 | "\n", 197 | " # Measure the distance of items to the cluster center\n", 198 | " indices, rtweets, rurls = get_cluster_relevance(texts, vectors, sns, ids)\n", 199 | " \n", 200 | " # Save or update cluster size\n", 201 | " center_size = 
len(idl)\n", 202 | " if new_cluster == True:\n", 203 | " center_sizes.append(center_size)\n", 204 | " else:\n", 205 | " center_sizes[cluster_mod] += center_size\n", 206 | "\n", 207 | " # Get combined sentiment from texts in this cluster\n", 208 | " sentiment = get_sentiment(texts)\n", 209 | " if new_cluster == True:\n", 210 | " center_sentiment.append(sentiment)\n", 211 | " else:\n", 212 | " center_sentiment[cluster_mod] += sentiment\n", 213 | "\n", 214 | " # Get word frequencies from texts in this cluster\n", 215 | " wfreq = get_word_frequencies(texts)\n", 216 | " if new_cluster == True:\n", 217 | " center_words.append(wfreq)\n", 218 | " else:\n", 219 | " for x, c in wfreq.most_common():\n", 220 | " center_words[cluster_mod][x] += c\n", 221 | " \n", 222 | " # Get subject, verb, object triples\n", 223 | " svo_triples = get_subject_verb_object_triples(texts)\n", 224 | " if new_cluster == True:\n", 225 | " center_svo.append(svo_triples)\n", 226 | " else:\n", 227 | " for x, c in svo_triples.most_common():\n", 228 | " center_svo[cluster_mod][x] += c\n", 229 | "\n", 230 | " # Get common ngrams\n", 231 | " ngrams = get_ngram_frequencies(texts, num_grams)\n", 232 | " if new_cluster == True:\n", 233 | " center_ngrams.append(ngrams)\n", 234 | " else:\n", 235 | " for x, c in ngrams.most_common():\n", 236 | " center_ngrams[cluster_mod][x] += c\n", 237 | "\n", 238 | " # Get label text (to be shown on graphviz)\n", 239 | " if new_cluster == True:\n", 240 | " center_label = get_label_text(texts, vectors)\n", 241 | " center_labels.append(center_label)\n", 242 | "\n", 243 | " # Add center to the list if it is new (else keep the existing one)\n", 244 | " if new_cluster == True:\n", 245 | " centers.append(center)\n", 246 | "\n", 247 | " # Add vectors to the list\n", 248 | " for v in vectors:\n", 249 | " vec_label.append([v, cluster_mod])\n", 250 | "\n", 251 | " # Add or update sn list\n", 252 | " snc = Counter(sns)\n", 253 | " if new_cluster == True:\n", 254 | " center_sns.append(snc)\n", 255 | " else:\n", 256 | " for x, c in snc.most_common():\n", 257 | " center_sns[cluster_mod][x] += c\n", 258 | "\n", 259 | " # Update or add tweets by relevance\n", 260 | " if new_cluster == True:\n", 261 | " center_tweets.append(rtweets)\n", 262 | " else:\n", 263 | " for x, c in rtweets.most_common():\n", 264 | " center_tweets[cluster_mod][x] = c\n", 265 | "\n", 266 | " # Update or add tweet urls by relevance\n", 267 | " if new_cluster == True:\n", 268 | " center_urls.append(rurls)\n", 269 | " else:\n", 270 | " for x, c in rurls.most_common():\n", 271 | " center_urls[cluster_mod][x] = c\n", 272 | " else:\n", 273 | " # All stuff that was thrown away is added to the unclustered list\n", 274 | " new_ncindices.extend(idl)\n", 275 | "\n", 276 | " # Print some status output\n", 277 | " total_processed += (len(ncindices) - len(new_ncindices))\n", 278 | " if len(center_sizes) > 0:\n", 279 | " mean_size = np.mean(center_sizes)\n", 280 | " smallest = min(center_sizes)\n", 281 | " largest = max(center_sizes)\n", 282 | " msg = \"Pass: \" + str(passes)\n", 283 | " if get_random == False:\n", 284 | " msg += \" I:\" + str(current_index)\n", 285 | " msg += \" N:\" + str(total_processed) \n", 286 | " msg += \" C:\" + str(mod_num)\n", 287 | " msg += \" Mean : \" + \"%.2f\"%mean_size\n", 288 | " msg += \" Min: \" + str(smallest)\n", 289 | " msg += \" Max: \" + str(largest)\n", 290 | " msg += \" Merges: \" + str(merges)\n", 291 | " print(msg)\n", 292 | " ncindices = list(new_ncindices)\n", 293 | " passes += 1\n", 294 | " if total_processed > 
total_samples:\n", 295 | " finished = True\n", 296 | "print(\"Done\")" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# This cell writes a readable text file containing a summary of the clustering process \n", 306 | "print(\"Writing summary. You can read it by opening tweet_graph_analysis_output.txt\")\n", 307 | "with open(\"tweet_graph_analysis_output.txt\", \"w\") as f:\n", 308 | " for index in range(len(centers)):\n", 309 | " ngrams = center_ngrams[index]\n", 310 | " wfreq = center_words[index]\n", 311 | " svo = center_svo[index]\n", 312 | " sentiment = center_sentiment[index]\n", 313 | " tweeted = center_sns[index]\n", 314 | " size = center_sizes[index]\n", 315 | " tweets = center_tweets[index]\n", 316 | " f.write(\"\\n\")\n", 317 | " f.write(\"Cluster: \" + str(index) + \" contains: \" + str(size) + \" tweets.\\n\")\n", 318 | " f.write(\"Sentiment: \" + \"%.2f\"%sentiment+\"\\n\")\n", 319 | " wft = get_wft(wfreq)\n", 320 | " f.write(\"Words: \" + wft + \"\\n\")\n", 321 | " svoft = get_wft(svo)\n", 322 | " f.write(\"svo: \"+svoft+\"\\n\")\n", 323 | " nft = get_wft(ngrams)\n", 324 | " f.write(\"ngrams: \" + nft+\"\\n\")\n", 325 | " snt = print_counter_summary(tweeted)\n", 326 | " f.write(\"tweeted: \" + snt +\"\\n\")\n", 327 | " f.write(\"==================\\n\")\n", 328 | " tt = []\n", 329 | " for x, c in tweets.most_common():\n", 330 | " tt.append([c, x])\n", 331 | " for t in tt[:20]:\n", 332 | " f.write(\"%.3f\"%t[0] + \" \" + t[1] +\"\\n\")\n", 333 | " if len(tt) > 20:\n", 334 | " f.write(\"...\\n\")\n", 335 | " for t in tt[-5:]:\n", 336 | " f.write(\"%.3f\"%t[0] + \" \" + t[1] +\"\\n\")\n", 337 | " f.write(\"\\n\")\n", 338 | "print(\"Done\")" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# This writes out a file that can be read by gephi\n", 348 | "# WARNING! 
Consider commenting this out if you're planning on clustering a huge amount of tweets!\n", 349 | "write_gexf(final_mapping, \"tweet_mapping.gexf\", item_mod, [\"community\"])" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "# Save data for subsequent analysis\n", 359 | "full = {}\n", 360 | "full[\"ngrams\"] = center_ngrams\n", 361 | "full[\"words\"] = center_words\n", 362 | "full[\"svo\"] = center_svo\n", 363 | "full[\"sentiment\"] = center_sentiment\n", 364 | "full[\"sns\"] = center_sns\n", 365 | "full[\"sizes\"] = center_sizes\n", 366 | "full[\"tweets\"] = center_tweets\n", 367 | "full[\"urls\"] = center_urls\n", 368 | "full[\"centers\"] = centers\n", 369 | "save_bin(full, \"clustering_data.pkl\")\n", 370 | "print(\"Done\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "scrolled": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "# This function extracts cluster ids containing search terms\n", 382 | "# and displays details about the top 5 most relevant clusters\n", 383 | "terms = [\"liar\", \"criminal\", \"idiot\", \"fool\", \"ignorant\", \"delusional\"]\n", 384 | "found = Counter()\n", 385 | "for index in range(len(full[\"tweets\"])):\n", 386 | " for x, c in full[\"tweets\"][index].most_common():\n", 387 | " for term in terms:\n", 388 | " if term in x:\n", 389 | " found[index] += 1\n", 390 | "print(\"Found \" + str(len(found)) + \" clusters contained the terms: \\\"\" + \", \".join(terms) + \"\\\".\")\n", 391 | "\n", 392 | "cluster_per = Counter()\n", 393 | "cluster_matches = Counter()\n", 394 | "for x, c in found.most_common():\n", 395 | " size = full[\"sizes\"][x]\n", 396 | " matches = c\n", 397 | " per = (matches/size) * 100\n", 398 | " cluster_per[x] = per\n", 399 | " cluster_matches[x] = matches\n", 400 | "\n", 401 | "targets = [x for x, c in cluster_per.most_common(5)]\n", 402 | "print()\n", 403 | "for t in targets:\n", 404 | " msg = \"Cluster \" + str(t) + \" (size \" + str(full[\"sizes\"][t]) + \") contained \" \n", 405 | " msg += str(cluster_matches[t]) + \" tweets (\" + \"%.2f\"%cluster_per[t] + \"%) that included the terms: \\\"\" \n", 406 | " msg += \", \".join(terms) + \"\\\".\"\n", 407 | " print(msg)\n", 408 | " tm = \"\"\n", 409 | " tc = 0\n", 410 | " for x, c in full[\"words\"][t].most_common():\n", 411 | " if x not in stopwords:\n", 412 | " tm += x + \"(\" + str(c) + \") \"\n", 413 | " tc += 1\n", 414 | " if tc >= 10:\n", 415 | " break\n", 416 | " print(tm)\n", 417 | " print()\n", 418 | " for x, c in full[\"tweets\"][t].most_common(10):\n", 419 | " print(\"%.3f\"%c + \": \" + x)\n", 420 | " print()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [] 429 | } 430 | ], 431 | "metadata": { 432 | "kernelspec": { 433 | "display_name": "Python 3", 434 | "language": "python", 435 | "name": "python3" 436 | }, 437 | "language_info": { 438 | "codemirror_mode": { 439 | "name": "ipython", 440 | "version": 3 441 | }, 442 | "file_extension": ".py", 443 | "mimetype": "text/x-python", 444 | "name": "python", 445 | "nbconvert_exporter": "python", 446 | "pygments_lexer": "ipython3", 447 | "version": "3.7.3" 448 | } 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 2 452 | } 453 | -------------------------------------------------------------------------------- /code/4. 
analyze_clustering_results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook contains some example analyses on the resulting cluster output data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from helper_functions import *\n", 17 | "import seaborn as sns\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "\n", 20 | "sns.set(palette=\"husl\")\n", 21 | "sns.set(style=\"whitegrid\")" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Load saved data\n", 31 | "full = load_bin(\"clustering_data.pkl\")\n", 32 | "print(\"Done\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# Show a histogram of cluster sizes\n", 42 | "counts = []\n", 43 | "indices = []\n", 44 | "labels = []\n", 45 | "max_size = 1\n", 46 | "\n", 47 | "for i, x in enumerate(sorted(full[\"sizes\"], reverse=True)):\n", 48 | " if i % 10 == 0:\n", 49 | " labels.append(x)\n", 50 | " else:\n", 51 | " labels.append(\"\")\n", 52 | " if x > max_size:\n", 53 | " indices.append(i)\n", 54 | " counts.append(x)\n", 55 | "fig = plt.figure(figsize=(18,10))\n", 56 | "ax = sns.barplot(y=counts, x=indices, palette=\"husl\", ci=None)\n", 57 | "ax.set_title(\"Cluster sizes (above \"+str(max_size)+\" in size)\")\n", 58 | "y = ax.set(xticklabels=labels)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "scrolled": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# Show counts of clusters which have similar sizes\n", 70 | "d = Counter()\n", 71 | "for i, x in enumerate(full[\"sizes\"]):\n", 72 | " d[x] += 1\n", 73 | "\n", 74 | "ordered = []\n", 75 | "for w in sorted(d):\n", 76 | " ordered.append([w, d[w]])\n", 77 | "counts = []\n", 78 | "indices = []\n", 79 | "labels = []\n", 80 | "for item in ordered:\n", 81 | " if item[1] > 1:\n", 82 | " counts.append(item[1])\n", 83 | " indices.append(item[0])\n", 84 | " labels.append(item[0])\n", 85 | "fig = plt.figure(figsize=(18,10))\n", 86 | "ax = sns.barplot(y=counts, x=indices, palette=\"husl\", ci=None)\n", 87 | "ax.set_title(\"Counts of clusters with similar sizes.\")\n", 88 | "y = ax.set(xticklabels=labels)\n", 89 | "y = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Show words common to clusters larger than max_size\n", 99 | "max_size = 100\n", 100 | "word_counts = Counter()\n", 101 | "for index, size in enumerate(full[\"sizes\"]):\n", 102 | " if size >= max_size:\n", 103 | " for x, c in Counter(full[\"words\"][index]).most_common():\n", 104 | " word_counts[x] += c\n", 105 | "counts = []\n", 106 | "sizes = []\n", 107 | "added = 0\n", 108 | "for x, c in word_counts.most_common():\n", 109 | " if x not in stopwords:\n", 110 | " sizes.append(c)\n", 111 | " counts.append(str(x) + \" (\" + str(c) + \")\")\n", 112 | " added += 1\n", 113 | " if added > 20:\n", 114 | " break\n", 115 | "\n", 116 | "fig = plt.figure(figsize=(18,10))\n", 117 | "ax = fig.add_axes((0,0,.5,1))\n", 118 | "\n", 119 | "ax.set_title(\"Most common words in clusters smaller than 100\")\n", 120 | "plt.pie(sizes, 
labels=counts, startangle=0)\n", 121 | "plt.axis('equal')\n", 122 | "plt.show()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# Show words common to clusters smaller than max_size\n", 132 | "word_counts = Counter()\n", 133 | "for index, size in enumerate(full[\"sizes\"]):\n", 134 | " if size < max_size:\n", 135 | " for x, c in Counter(full[\"words\"][index]).most_common():\n", 136 | " word_counts[x] += c\n", 137 | "counts = []\n", 138 | "sizes = []\n", 139 | "added = 0\n", 140 | "for x, c in word_counts.most_common():\n", 141 | " if x not in stopwords:\n", 142 | " sizes.append(c)\n", 143 | " counts.append(str(x) + \" (\" + str(c) + \")\")\n", 144 | " added += 1\n", 145 | " if added > 20:\n", 146 | " break\n", 147 | "\n", 148 | "fig = plt.figure(figsize=(18,8))\n", 149 | "ax = fig.add_axes((0,0,.5,1))\n", 150 | "\n", 151 | "ax.set_title(\"Most common words in clusters smaller than 100\")\n", 152 | "plt.pie(sizes, labels=counts, startangle=0)\n", 153 | "plt.axis('equal')\n", 154 | "plt.show()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# Print common words and counts for larger clusters as pie chart\n", 164 | "sizes = []\n", 165 | "labels = []\n", 166 | "max_size = 300\n", 167 | "\n", 168 | "for i, s in enumerate(full[\"sizes\"]):\n", 169 | " if s > max_size:\n", 170 | " count = 0\n", 171 | " label = \"\"\n", 172 | " for x, c in full[\"words\"][i].most_common():\n", 173 | " if x not in stopwords:\n", 174 | " label += x + \"(\" + str(c) + \") \"\n", 175 | " count += 1\n", 176 | " if count % 5 == 0:\n", 177 | " label += \"\\n\"\n", 178 | " if count >= 15:\n", 179 | " break\n", 180 | " labels.append(label)\n", 181 | " sizes.append(s)\n", 182 | "\n", 183 | "fig = plt.figure(figsize=(20,10))\n", 184 | "ax = fig.add_axes((0,0,.5,1))\n", 185 | "\n", 186 | "plt.pie(sizes, labels=labels, startangle=20)\n", 187 | "plt.axis('equal')\n", 188 | "plt.show()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Print details about larger clusters as a horizontal bar chart\n", 198 | "sizes = []\n", 199 | "labels = []\n", 200 | "sthreshold = 0.1\n", 201 | "max_size = 200\n", 202 | "\n", 203 | "count = 0\n", 204 | "for i, s in enumerate(full[\"sizes\"]):\n", 205 | " if s > max_size:\n", 206 | " count += 1\n", 207 | " svo = \"\"\n", 208 | " if len(full[\"svo\"][i]) > 0:\n", 209 | " svo = \" \".join([x for x, c in Counter(full[\"svo\"][i]).most_common(3)])\n", 210 | " top_words = [x for x, c in full[\"words\"][i].most_common() if x not in stopwords]\n", 211 | "\n", 212 | " sent = \" -POS-\"\n", 213 | " if full[\"sentiment\"][i]/s < sthreshold:\n", 214 | " sent = \" -NEG-\"\n", 215 | " label = \"\"\n", 216 | " label += \"[\" + str(i) + \"] \" \n", 217 | " label += svo \n", 218 | " #label += sent \n", 219 | " label += \"\\n\"\n", 220 | " label += \" / \".join([x for x, c in Counter(full[\"ngrams\"][i]).most_common(5)])\n", 221 | " label += \"\\n\"\n", 222 | " label += \", \".join(top_words[:10]) + \"\\n\"\n", 223 | " labels.append(label)\n", 224 | " sizes.append(s)\n", 225 | "\n", 226 | "jsondata = []\n", 227 | "for i, s in enumerate(sizes):\n", 228 | " label = labels[i]\n", 229 | " jsondata.append([s, label])\n", 230 | "\n", 231 | "plot_data = {}\n", 232 | "plot_data[\"labels\"] = []\n", 233 | 
"plot_data[\"counts\"] = []\n", 234 | "for item in jsondata:\n", 235 | " plot_data[\"counts\"].append(item[0])\n", 236 | " plot_data[\"labels\"].append(item[1])\n", 237 | "height = len(plot_data[\"counts\"])*2\n", 238 | "sns.set_style(\"white\", {'axes.spines.bottom': False,\n", 239 | " 'axes.spines.left': False,\n", 240 | " 'axes.spines.right': False,\n", 241 | " 'axes.spines.top': False})\n", 242 | "fig = plt.figure(figsize=(10,25))\n", 243 | "ax = sns.barplot(x=\"counts\", y=\"labels\", palette=\"husl\", data=plot_data)\n", 244 | "y = ax.set(yticklabels=[])\n", 245 | "y = ax.set(xticklabels=[])\n", 246 | "for i, v in enumerate(plot_data[\"counts\"]):\n", 247 | " pad = min(25.0, v/100)\n", 248 | " ax.text(v+pad, i+0.55, str(plot_data[\"labels\"][i]))\n", 249 | " ax.text(20, i, str(v), fontweight='bold')" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Print details about negative and positive clusters\n", 259 | "def print_fc(cvar, num):\n", 260 | " outs = \"\"\n", 261 | " col = 0\n", 262 | " for x, c in cvar.most_common(num):\n", 263 | " if col > 90:\n", 264 | " outs += \"\\n\"\n", 265 | " col = 0\n", 266 | " outs += str(x) + \" / \"\n", 267 | " col += len(str(x)) + 3\n", 268 | " print(outs)\n", 269 | "\n", 270 | "pos = []\n", 271 | "postot = 0\n", 272 | "pos_svo = Counter()\n", 273 | "pos_ngram = Counter()\n", 274 | "pos_word = Counter()\n", 275 | "neg = []\n", 276 | "negtot = 0\n", 277 | "neg_svo = Counter()\n", 278 | "neg_ngram = Counter()\n", 279 | "neg_word = Counter()\n", 280 | "for i, s in enumerate(full[\"sizes\"]):\n", 281 | " sent = full[\"sentiment\"][i]/s\n", 282 | " sv = [x for x, c in full[\"svo\"][i].most_common()]\n", 283 | " ng = [x for x, c in full[\"ngrams\"][i].most_common()]\n", 284 | " wd = [x for x, c in full[\"words\"][i].most_common()]\n", 285 | " if sent > sthreshold:\n", 286 | " pos.append(s)\n", 287 | " if len(sv) > 0:\n", 288 | " pos_svo[sv[0]] += 1\n", 289 | " for n in ng[:5]:\n", 290 | " pos_ngram[n] += 1\n", 291 | " cnt = 0\n", 292 | " for w in wd:\n", 293 | " if w not in stopwords:\n", 294 | " pos_word[w] += 1\n", 295 | " cnt += 1\n", 296 | " if cnt > 5:\n", 297 | " break\n", 298 | " else:\n", 299 | " neg.append(s)\n", 300 | " if len(sv) > 0:\n", 301 | " neg_svo[sv[0]] += 1\n", 302 | " for n in ng[:5]:\n", 303 | " neg_ngram[n] += 1\n", 304 | " cnt = 0\n", 305 | " for w in wd:\n", 306 | " if w not in stopwords:\n", 307 | " neg_word[w] += 1\n", 308 | " cnt += 1\n", 309 | " if cnt > 5:\n", 310 | " break\n", 311 | "itemc = 15\n", 312 | "print(\"Positive clusters: \" + str(len(pos)) + \" - \" + str(sum(pos)) + \" tweets.\")\n", 313 | "print(\"svo\")\n", 314 | "print_fc(pos_svo, itemc)\n", 315 | "print()\n", 316 | "print(\"ngram\")\n", 317 | "print_fc(pos_ngram, itemc)\n", 318 | "print()\n", 319 | "print(\"word\")\n", 320 | "print_fc(pos_word, itemc)\n", 321 | "\n", 322 | "print()\n", 323 | "print(\"Negative clusters: \" + str(len(neg)) + \" - \" + str(sum(neg)) + \" tweets.\")\n", 324 | "print(\"svo\")\n", 325 | "print_fc(neg_svo, itemc)\n", 326 | "print()\n", 327 | "print(\"ngram\")\n", 328 | "print_fc(neg_ngram, itemc)\n", 329 | "print()\n", 330 | "print(\"word\")\n", 331 | "print_fc(neg_word, itemc)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": { 338 | "scrolled": false 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "# This function extracts cluster ids containing search terms\n", 343 | "# 
and displays details about the top 5 most relevant clusters\n", 344 | "terms = [\"liar\", \"criminal\", \"idiot\", \"fool\", \"ignorant\", \"delusional\"]\n", 345 | "found = Counter()\n", 346 | "for index in range(len(full[\"tweets\"])):\n", 347 | " for x, c in full[\"tweets\"][index].most_common():\n", 348 | " for term in terms:\n", 349 | " if term in x:\n", 350 | " found[index] += 1\n", 351 | "print(\"Found \" + str(len(found)) + \" clusters contained the terms: \\\"\" + \", \".join(terms) + \"\\\".\")\n", 352 | "\n", 353 | "cluster_per = Counter()\n", 354 | "cluster_matches = Counter()\n", 355 | "for x, c in found.most_common():\n", 356 | " size = full[\"sizes\"][x]\n", 357 | " matches = c\n", 358 | " per = (matches/size) * 100\n", 359 | " cluster_per[x] = per\n", 360 | " cluster_matches[x] = matches\n", 361 | "\n", 362 | "targets = [x for x, c in cluster_per.most_common(5)]\n", 363 | "print()\n", 364 | "for t in targets:\n", 365 | " msg = \"Cluster \" + str(t) + \" (size \" + str(full[\"sizes\"][t]) + \") contained \" \n", 366 | " msg += str(cluster_matches[t]) + \" tweets (\" + \"%.2f\"%cluster_per[t] + \"%) that included the terms: \\\"\" \n", 367 | " msg += \", \".join(terms) + \"\\\".\"\n", 368 | " print(msg)\n", 369 | " tm = \"\"\n", 370 | " tc = 0\n", 371 | " for x, c in full[\"words\"][t].most_common():\n", 372 | " if x not in stopwords:\n", 373 | " tm += x + \"(\" + str(c) + \") \"\n", 374 | " tc += 1\n", 375 | " if tc >= 10:\n", 376 | " break\n", 377 | " print(tm)\n", 378 | " print()\n", 379 | " for x, c in full[\"tweets\"][t].most_common(10):\n", 380 | " print(\"%.3f\"%c + \": \" + x)\n", 381 | " print()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.7.3" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /code/5. 
label_nodes_and_clusters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from helper_functions import *" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "cluster_data = load_bin(\"clustering_data.pkl\")" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "def get_sent(text):\n", 28 | " sents = []\n", 29 | " blob = TextBlob(text)\n", 30 | " for sentence in blob.sentences:\n", 31 | " sents.append(sentence.sentiment.polarity)\n", 32 | " return np.sum(sents)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "scrolled": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "# Attempt to label each center\n", 44 | "# Also, calculate sentiment of the derived label\n", 45 | "# and compare it to sentiment derived from all tweets in the cluster\n", 46 | "# This is then used to calculate positive, negative, and toxic categories\n", 47 | "num_grams = 20\n", 48 | "num_words = 10\n", 49 | "\n", 50 | "node_label = {}\n", 51 | "node_sent = {}\n", 52 | "toxic = 0\n", 53 | "negative = 0\n", 54 | "positive = 0\n", 55 | "for index in range(len(cluster_data[\"centers\"])):\n", 56 | " words = Counter()\n", 57 | " # Get a list of important non-stop words from tweets\n", 58 | " while len(words) < num_words:\n", 59 | " for x, c in cluster_data[\"words\"][index].most_common():\n", 60 | " if x not in stopwords:\n", 61 | " words[x] = c\n", 62 | " # Get a list of the most common ngrams and svo triplets\n", 63 | " # reformat svo triplets to be same as ngrams\n", 64 | " summaries = Counter()\n", 65 | " for x, c in cluster_data[\"svo\"][index].most_common(num_grams):\n", 66 | " x = \" \".join(x[1:-1].split(\", \"))\n", 67 | " summaries[x] += c\n", 68 | " for x, c in cluster_data[\"ngrams\"][index].most_common(num_grams):\n", 69 | " summaries[x] += c\n", 70 | " # If a word is found in a summary, assign it to potential labels counter\n", 71 | " # with count equal to the frequency of the matched word\n", 72 | " labels = Counter()\n", 73 | " for word, count in words.most_common(num_words):\n", 74 | " for x, c in summaries.most_common(num_grams):\n", 75 | " if word in x:\n", 76 | " labels[x] += count\n", 77 | " # Create bigrams of commonly seen words\n", 78 | " # If a bigram is seen in a summary, assign it to potential labels counter\n", 79 | " # with count equal to the value of the word in the bigram with the highest frequenct\n", 80 | " word_combs = combinations([x for x, c in words.most_common(num_words)], 2)\n", 81 | " for comb in word_combs:\n", 82 | " fc = max([words[x] for x in comb])\n", 83 | " ws = \" \".join(comb)\n", 84 | " for x, c in summaries.most_common(num_grams):\n", 85 | " if ws in x:\n", 86 | " labels[x] += fc\n", 87 | " # The top item found in labels is the summary\n", 88 | " # Note how we add the node index to the label\n", 89 | " # This is because some labels are identical\n", 90 | " # and thus cause the gephi creation step in a future cell\n", 91 | " # to miss nodes\n", 92 | " top_label = \"\"\n", 93 | " for x, c in labels.most_common(1):\n", 94 | " top_label = \"[\" + str(index) + \"] \" + x\n", 95 | " # This is used to label the item in gephi or other visualizations\n", 96 | " 
node_label[index] = top_label\n", 97 | " # Get the sentiment score of the label\n", 98 | " sent = get_sent(top_label)\n", 99 | " # Get the average sentiment score of the tweets in this cluster\n", 100 | " size = len(cluster_data[\"tweets\"][index])\n", 101 | " tsent = cluster_data[\"sentiment\"][index]/size\n", 102 | " # Assign verdicts based on sentiment analysis\n", 103 | " # the values were hand-adjusted based on manual inspection\n", 104 | " # of tweets in each cluster\n", 105 | " verd = \"\"\n", 106 | " if tsent < -0.1:\n", 107 | " node_sent[index] = 0\n", 108 | " verd = \"TOXIC\\t\"\n", 109 | " toxic += size\n", 110 | " elif tsent < 0.1:\n", 111 | " node_sent[index] = 1\n", 112 | " verd = \"NEGATIVE\"\n", 113 | " negative += size\n", 114 | " elif tsent > 0.1:\n", 115 | " node_sent[index] = 2\n", 116 | " verd = \"POSITIVE\"\n", 117 | " positive += size\n", 118 | "\n", 119 | " # Print the results\n", 120 | " print(\"(\" + str(size) + \")\\t[\" + \"%.2f\"%sent + \"]\\t\" + \"%.2f\"%tsent + \"\\t\" + verd + \"\\t\" + top_label )" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Print a breakdown of categories\n", 130 | "tot = positive+negative+toxic\n", 131 | "posper = positive/tot*100\n", 132 | "negper = negative/tot*100\n", 133 | "toxper = toxic/tot*100\n", 134 | "msg = \"Positive: \" + str(positive) + \" (\" + \"%.2f\"%posper + \"%)\"\n", 135 | "msg += \" Negative: \" + str(negative) + \" (\" + \"%.2f\"%negper + \"%)\"\n", 136 | "msg += \" Toxic: \" + str(toxic) + \" (\" + \"%.2f\"%toxper + \"%)\"\n", 137 | "print(msg)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# Examine a cluster defined by target variable\n", 147 | "# This cell prints tweets that don't contain terms identified\n", 148 | "# during the previous labeling operation\n", 149 | "# This is useful for manually inspecting the cluster to determine\n", 150 | "# whether the rest of the tweets are similar in topic or context\n", 151 | "target = 7\n", 152 | "print(\"Cluster: \" + node_label[target] + \" contains \" + str(len(cluster_data[\"tweets\"][target])) + \" tweets.\")\n", 153 | "terms = []\n", 154 | "for word in node_label[target][4:].split():\n", 155 | " if word not in stopwords:\n", 156 | " terms.append(word)\n", 157 | "print(terms)\n", 158 | "print()\n", 159 | "found = 0\n", 160 | "for x, c in cluster_data[\"tweets\"][target].most_common():\n", 161 | " matches = 0\n", 162 | " for t in terms:\n", 163 | " if t in x:\n", 164 | " matches += 1\n", 165 | " if matches == 0 :\n", 166 | " print(\"%.2f\"%c + \"\\t\" + x)\n", 167 | " found += 1\n", 168 | "print(\"Found: \" + str(found))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Cluster the clusters!\n", 178 | "# This allows us to visualize the resulting data in gephi\n", 179 | "# This is also what was used to create the interactive demo\n", 180 | "# https://twitter-clustering.web.app/\n", 181 | "centers = cluster_data[\"centers\"]\n", 182 | "center_clusters, center_mapping = make_text_clusters(centers, edge_ratio=20)\n", 183 | "com_counts = [len(c) for x, c in center_clusters.items()]\n", 184 | "print(\"Communities: \" + str(len(center_clusters)) + \": \" + str(com_counts))\n", 185 | "nodes = set()\n", 186 | "for m, x in center_clusters.items():\n", 187 | " 
nodes.update(x)\n", 188 | "print(\"Nodes: \" + str(len(nodes)))\n", 189 | "print(\"Edges: \" + str(len(center_mapping)))\n", 190 | "\n", 191 | "nodes_json = {}\n", 192 | "center_node_attr = {}\n", 193 | "for mod, nodes in center_clusters.items():\n", 194 | " for n in nodes:\n", 195 | " label = \"n\" + str(n)\n", 196 | " summary = node_label[n]\n", 197 | " sent = node_sent[n]\n", 198 | " size = cluster_data[\"sizes\"][n]\n", 199 | " center_node_attr[summary] = [mod, size, sent]\n", 200 | " nodes_json[label] = {}\n", 201 | " nodes_json[label][\"label\"] = node_label[n]\n", 202 | " nodes_json[label][\"community\"] = mod\n", 203 | " nodes_json[label][\"sentiment\"] = cluster_data[\"sentiment\"][n]\n", 204 | " nodes_json[label][\"wfreq\"] = cluster_data[\"words\"][n]\n", 205 | " nodes_json[label][\"words\"] = get_wft(cluster_data[\"words\"][n])\n", 206 | " nodes_json[label][\"ngrams\"] = get_wft(cluster_data[\"ngrams\"][n])\n", 207 | " nodes_json[label][\"svo\"] = get_wft(cluster_data[\"svo\"][n])\n", 208 | " nodes_json[label][\"tweeted\"] = print_counter_summary(cluster_data[\"sns\"][n])\n", 209 | " nodes_json[label][\"size\"] = cluster_data[\"sizes\"][n]\n", 210 | " nodes_json[label][\"tweets\"] = [x for x, c in cluster_data[\"tweets\"][n].most_common(10)]\n", 211 | " nodes_json[label][\"urls\"] = [x for x, c in cluster_data[\"urls\"][n].most_common(10)]\n", 212 | "\n", 213 | "with open(\"edges.csv\", \"w\") as f:\n", 214 | " f.write(\"Sourceid,Targetid,Weight\\n\")\n", 215 | " for m in center_mapping:\n", 216 | " s, t, w = m\n", 217 | " f.write(\"n\"+str(s)+\",n\"+str(t)+\",\"+str(w)+\"\\n\")\n", 218 | "\n", 219 | "geph_mapping = []\n", 220 | "for item in center_mapping:\n", 221 | " s, t, w = item\n", 222 | " s1 = node_label[s]\n", 223 | " t1 = node_label[t]\n", 224 | " geph_mapping.append([s1, t1, w])\n", 225 | "\n", 226 | "save_json(nodes_json, \"nodes.json\")\n", 227 | "write_gexf(geph_mapping, \"center_mapping.gexf\", center_node_attr, [\"community\", \"size\", \"sentiment\"])\n", 228 | "print(\"Done\")" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "scrolled": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "# Print out some information about each cluster\n", 240 | "# Most common words, ngrams, svo triplets\n", 241 | "cluster_words = {}\n", 242 | "cluster_ngrams = {}\n", 243 | "cluster_svo = {}\n", 244 | "for mod, nodes in center_clusters.items():\n", 245 | " cluster_words[mod] = Counter()\n", 246 | " cluster_ngrams[mod] = Counter()\n", 247 | " cluster_svo[mod] = Counter()\n", 248 | " for n in nodes:\n", 249 | " words = cluster_data[\"words\"][n]\n", 250 | " ngrams = cluster_data[\"ngrams\"][n]\n", 251 | " svo = cluster_data[\"svo\"][n]\n", 252 | " for x, c in words.items():\n", 253 | " if x not in stopwords:\n", 254 | " cluster_words[mod][x] += c\n", 255 | " for x, c in ngrams.items():\n", 256 | " cluster_ngrams[mod][x] += c\n", 257 | " for x, c in svo.items():\n", 258 | " cluster_svo[mod][x] += c\n", 259 | "# Print top words\n", 260 | "print(\"Words\")\n", 261 | "for mod, words in cluster_words.items():\n", 262 | " top_words = \" \"\n", 263 | " for x, c in words.most_common(10):\n", 264 | " top_words += x + \"(\" + str(c) + \") \"\n", 265 | " \n", 266 | " print(\"Community \" + str(mod) + top_words)\n", 267 | "print()\n", 268 | "# Print top ngrams\n", 269 | "print(\"ngrams\")\n", 270 | "for mod, ngrams in cluster_ngrams.items():\n", 271 | " top_ngrams = \" \"\n", 272 | " for x, c in ngrams.most_common(5):\n", 
273 | " top_ngrams += x + \"(\" + str(c) + \") \"\n", 274 | " \n", 275 | " print(\"Community \" + str(mod) + top_ngrams)\n", 276 | "print()\n", 277 | "# Print top svo\n", 278 | "print(\"svo\")\n", 279 | "for mod, svo in cluster_svo.items():\n", 280 | " top_svo = \" \"\n", 281 | " for x, c in svo.most_common(5):\n", 282 | " top_svo += x + \"(\" + str(c) + \") \"\n", 283 | " \n", 284 | " print(\"Community \" + str(mod) + top_svo)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "# Attempt to find best label for each community\n", 294 | "# This uses a similar method to one above, but with different parameters\n", 295 | "num_grams = 10\n", 296 | "num_words = 10\n", 297 | "for mod, svo in cluster_svo.items():\n", 298 | " words = cluster_words[mod]\n", 299 | " ngrams = cluster_ngrams[mod]\n", 300 | " relevant_svo = Counter()\n", 301 | " for word, count in words.most_common(num_words):\n", 302 | " for x, c in svo.most_common(num_grams):\n", 303 | " x = \" \".join(x[1:-1].split(\", \"))\n", 304 | " if word in x:\n", 305 | " relevant_svo[x] += count\n", 306 | " for x, c in ngrams.most_common(num_grams):\n", 307 | " if word in x:\n", 308 | " relevant_svo[x] += count\n", 309 | " word_combs = combinations([x for x, c in words.most_common(num_words)], 2)\n", 310 | " for comb in word_combs:\n", 311 | " fc = max([words[x] for x in comb])\n", 312 | " ws = \" \".join(comb)\n", 313 | " for x, c in svo.most_common(num_grams):\n", 314 | " x = \" \".join(x[1:-1].split(\", \"))\n", 315 | " if ws in x:\n", 316 | " relevant_svo[x] += fc\n", 317 | " for x, c in ngrams.most_common(num_grams):\n", 318 | " if ws in x:\n", 319 | " relevant_svo[x] += fc\n", 320 | " \n", 321 | " top_svo = \"\"\n", 322 | " for x, c in relevant_svo.most_common(3):\n", 323 | " top_svo += x + \" (\" + str(c) + \") \"\n", 324 | " top_svo = [x for x, c in relevant_svo.most_common(1)][0]\n", 325 | " \n", 326 | " print(\"Community \" + str(mod) + \": \" + top_svo)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.7.3" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 2 358 | } 359 | -------------------------------------------------------------------------------- /code/config/gexf_header.txt: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | -------------------------------------------------------------------------------- /code/config/stopwords.json: -------------------------------------------------------------------------------- 1 | { 2 | "ru": [ 3 | "и", 4 | "в", 5 | "во", 6 | "не", 7 | "что", 8 | "он", 9 | "на", 10 | "я", 11 | "с", 12 | "со", 13 | "как", 14 | "а", 15 | "то", 16 | "все", 17 | "она", 18 | "так", 19 | "его", 20 | "но", 21 | "да", 22 | "ты", 23 | "к", 24 | "у", 25 | "же", 26 | "вы", 27 | "за", 28 | "бы", 29 | "по", 30 | "только", 31 | "ее", 32 | "мне", 33 | "было", 34 | "вот", 35 | "от", 36 | "меня", 37 | "еще", 38 | "нет", 39 | "о", 40 | "из", 
41 | "ему", 42 | "теперь", 43 | "когда", 44 | "даже", 45 | "ну", 46 | "вдруг", 47 | "ли", 48 | "если", 49 | "уже", 50 | "или", 51 | "ни", 52 | "быть", 53 | "был", 54 | "него", 55 | "до", 56 | "вас", 57 | "нибудь", 58 | "опять", 59 | "уж", 60 | "вам", 61 | "ведь", 62 | "там", 63 | "потом", 64 | "себя", 65 | "ничего", 66 | "ей", 67 | "может", 68 | "они", 69 | "тут", 70 | "где", 71 | "есть", 72 | "надо", 73 | "ней", 74 | "для", 75 | "мы", 76 | "тебя", 77 | "их", 78 | "чем", 79 | "была", 80 | "сам", 81 | "чтоб", 82 | "без", 83 | "будто", 84 | "чего", 85 | "раз", 86 | "тоже", 87 | "себе", 88 | "под", 89 | "будет", 90 | "ж", 91 | "тогда", 92 | "кто", 93 | "этот", 94 | "того", 95 | "потому", 96 | "этого", 97 | "какой", 98 | "совсем", 99 | "ним", 100 | "здесь", 101 | "этом", 102 | "один", 103 | "почти", 104 | "мой", 105 | "тем", 106 | "чтобы", 107 | "нее", 108 | "сейчас", 109 | "были", 110 | "куда", 111 | "зачем", 112 | "всех", 113 | "никогда", 114 | "можно", 115 | "при", 116 | "наконец", 117 | "два", 118 | "об", 119 | "другой", 120 | "хоть", 121 | "после", 122 | "над", 123 | "больше", 124 | "тот", 125 | "через", 126 | "эти", 127 | "нас", 128 | "про", 129 | "всего", 130 | "них", 131 | "какая", 132 | "много", 133 | "разве", 134 | "три", 135 | "эту", 136 | "моя", 137 | "впрочем", 138 | "хорошо", 139 | "свою", 140 | "этой", 141 | "перед", 142 | "иногда", 143 | "лучше", 144 | "чуть", 145 | "том", 146 | "нельзя", 147 | "такой", 148 | "им", 149 | "более", 150 | "всегда", 151 | "конечно", 152 | "всю", 153 | "между" 154 | ], 155 | "fr": [ 156 | "au", 157 | "aux", 158 | "avec", 159 | "ce", 160 | "ces", 161 | "dans", 162 | "de", 163 | "des", 164 | "du", 165 | "elle", 166 | "en", 167 | "et", 168 | "eux", 169 | "il", 170 | "je", 171 | "la", 172 | "le", 173 | "leur", 174 | "lui", 175 | "ma", 176 | "mais", 177 | "me", 178 | "même", 179 | "mes", 180 | "moi", 181 | "mon", 182 | "ne", 183 | "nos", 184 | "notre", 185 | "nous", 186 | "on", 187 | "ou", 188 | "par", 189 | "pas", 190 | "pour", 191 | "qu", 192 | "que", 193 | "qui", 194 | "sa", 195 | "se", 196 | "ses", 197 | "son", 198 | "sur", 199 | "ta", 200 | "te", 201 | "tes", 202 | "toi", 203 | "ton", 204 | "tu", 205 | "un", 206 | "une", 207 | "vos", 208 | "votre", 209 | "vous", 210 | "c", 211 | "d", 212 | "j", 213 | "l", 214 | "à", 215 | "m", 216 | "n", 217 | "s", 218 | "t", 219 | "y", 220 | "été", 221 | "étée", 222 | "étées", 223 | "étés", 224 | "étant", 225 | "étante", 226 | "étants", 227 | "étantes", 228 | "suis", 229 | "es", 230 | "est", 231 | "sommes", 232 | "êtes", 233 | "sont", 234 | "serai", 235 | "seras", 236 | "sera", 237 | "serons", 238 | "serez", 239 | "seront", 240 | "serais", 241 | "serait", 242 | "serions", 243 | "seriez", 244 | "seraient", 245 | "étais", 246 | "était", 247 | "étions", 248 | "étiez", 249 | "étaient", 250 | "fus", 251 | "fut", 252 | "fûmes", 253 | "fûtes", 254 | "furent", 255 | "sois", 256 | "soit", 257 | "soyons", 258 | "soyez", 259 | "soient", 260 | "fusse", 261 | "fusses", 262 | "fût", 263 | "fussions", 264 | "fussiez", 265 | "fussent", 266 | "ayant", 267 | "ayante", 268 | "ayantes", 269 | "ayants", 270 | "eu", 271 | "eue", 272 | "eues", 273 | "eus", 274 | "ai", 275 | "as", 276 | "avons", 277 | "avez", 278 | "ont", 279 | "aurai", 280 | "auras", 281 | "aura", 282 | "aurons", 283 | "aurez", 284 | "auront", 285 | "aurais", 286 | "aurait", 287 | "aurions", 288 | "auriez", 289 | "auraient", 290 | "avais", 291 | "avait", 292 | "avions", 293 | "aviez", 294 | "avaient", 295 | "eut", 296 | "eûmes", 297 | "eûtes", 298 | "eurent", 299 | 
"aie", 300 | "aies", 301 | "ait", 302 | "ayons", 303 | "ayez", 304 | "aient", 305 | "eusse", 306 | "eusses", 307 | "eût", 308 | "eussions", 309 | "eussiez", 310 | "eussent" 311 | ], 312 | "en": [ 313 | "i", 314 | "me", 315 | "my", 316 | "myself", 317 | "we", 318 | "our", 319 | "ours", 320 | "ourselves", 321 | "you", 322 | "you're", 323 | "you've", 324 | "you'll", 325 | "you'd", 326 | "your", 327 | "yours", 328 | "yourself", 329 | "yourselves", 330 | "he", 331 | "him", 332 | "his", 333 | "himself", 334 | "she", 335 | "she's", 336 | "her", 337 | "hers", 338 | "herself", 339 | "it", 340 | "it's", 341 | "its", 342 | "itself", 343 | "they", 344 | "them", 345 | "their", 346 | "theirs", 347 | "themselves", 348 | "what", 349 | "which", 350 | "who", 351 | "whom", 352 | "this", 353 | "that", 354 | "that'll", 355 | "these", 356 | "those", 357 | "am", 358 | "is", 359 | "are", 360 | "was", 361 | "were", 362 | "be", 363 | "been", 364 | "being", 365 | "have", 366 | "has", 367 | "had", 368 | "having", 369 | "do", 370 | "does", 371 | "did", 372 | "doing", 373 | "a", 374 | "an", 375 | "the", 376 | "and", 377 | "but", 378 | "if", 379 | "or", 380 | "because", 381 | "as", 382 | "until", 383 | "while", 384 | "of", 385 | "at", 386 | "by", 387 | "for", 388 | "with", 389 | "about", 390 | "against", 391 | "between", 392 | "into", 393 | "through", 394 | "during", 395 | "before", 396 | "after", 397 | "above", 398 | "below", 399 | "to", 400 | "from", 401 | "up", 402 | "down", 403 | "in", 404 | "out", 405 | "on", 406 | "off", 407 | "over", 408 | "under", 409 | "again", 410 | "further", 411 | "then", 412 | "once", 413 | "here", 414 | "there", 415 | "when", 416 | "where", 417 | "why", 418 | "how", 419 | "all", 420 | "any", 421 | "both", 422 | "each", 423 | "few", 424 | "more", 425 | "most", 426 | "other", 427 | "some", 428 | "such", 429 | "no", 430 | "nor", 431 | "not", 432 | "only", 433 | "own", 434 | "same", 435 | "so", 436 | "than", 437 | "too", 438 | "very", 439 | "s", 440 | "t", 441 | "can", 442 | "will", 443 | "just", 444 | "don", 445 | "don't", 446 | "should", 447 | "should've", 448 | "now", 449 | "d", 450 | "ll", 451 | "m", 452 | "o", 453 | "re", 454 | "ve", 455 | "y", 456 | "ain", 457 | "aren", 458 | "aren't", 459 | "couldn", 460 | "couldn't", 461 | "didn", 462 | "didn't", 463 | "doesn", 464 | "doesn't", 465 | "hadn", 466 | "hadn't", 467 | "hasn", 468 | "hasn't", 469 | "haven", 470 | "haven't", 471 | "isn", 472 | "isn't", 473 | "ma", 474 | "mightn", 475 | "mightn't", 476 | "mustn", 477 | "mustn't", 478 | "needn", 479 | "needn't", 480 | "shan", 481 | "shan't", 482 | "shouldn", 483 | "shouldn't", 484 | "wasn", 485 | "wasn't", 486 | "weren", 487 | "weren't", 488 | "won", 489 | "won't", 490 | "wouldn", 491 | "wouldn't" 492 | ], 493 | "nl": [ 494 | "de", 495 | "en", 496 | "van", 497 | "ik", 498 | "te", 499 | "dat", 500 | "die", 501 | "in", 502 | "een", 503 | "hij", 504 | "het", 505 | "niet", 506 | "zijn", 507 | "is", 508 | "was", 509 | "op", 510 | "aan", 511 | "met", 512 | "als", 513 | "voor", 514 | "had", 515 | "er", 516 | "maar", 517 | "om", 518 | "hem", 519 | "dan", 520 | "zou", 521 | "of", 522 | "wat", 523 | "mijn", 524 | "men", 525 | "dit", 526 | "zo", 527 | "door", 528 | "over", 529 | "ze", 530 | "zich", 531 | "bij", 532 | "ook", 533 | "tot", 534 | "je", 535 | "mij", 536 | "uit", 537 | "der", 538 | "daar", 539 | "haar", 540 | "naar", 541 | "heb", 542 | "hoe", 543 | "heeft", 544 | "hebben", 545 | "deze", 546 | "u", 547 | "want", 548 | "nog", 549 | "zal", 550 | "me", 551 | "zij", 552 | "nu", 553 | "ge", 554 | 
"geen", 555 | "omdat", 556 | "iets", 557 | "worden", 558 | "toch", 559 | "al", 560 | "waren", 561 | "veel", 562 | "meer", 563 | "doen", 564 | "toen", 565 | "moet", 566 | "ben", 567 | "zonder", 568 | "kan", 569 | "hun", 570 | "dus", 571 | "alles", 572 | "onder", 573 | "ja", 574 | "eens", 575 | "hier", 576 | "wie", 577 | "werd", 578 | "altijd", 579 | "doch", 580 | "wordt", 581 | "wezen", 582 | "kunnen", 583 | "ons", 584 | "zelf", 585 | "tegen", 586 | "na", 587 | "reeds", 588 | "wil", 589 | "kon", 590 | "niets", 591 | "uw", 592 | "iemand", 593 | "geweest", 594 | "andere" 595 | ], 596 | "dk": [ 597 | "og", 598 | "i", 599 | "jeg", 600 | "det", 601 | "at", 602 | "en", 603 | "den", 604 | "til", 605 | "er", 606 | "som", 607 | "på", 608 | "de", 609 | "med", 610 | "han", 611 | "af", 612 | "for", 613 | "ikke", 614 | "der", 615 | "var", 616 | "mig", 617 | "sig", 618 | "men", 619 | "et", 620 | "har", 621 | "om", 622 | "vi", 623 | "min", 624 | "havde", 625 | "ham", 626 | "hun", 627 | "nu", 628 | "over", 629 | "da", 630 | "fra", 631 | "du", 632 | "ud", 633 | "sin", 634 | "dem", 635 | "os", 636 | "op", 637 | "man", 638 | "hans", 639 | "hvor", 640 | "eller", 641 | "hvad", 642 | "skal", 643 | "selv", 644 | "her", 645 | "alle", 646 | "vil", 647 | "blev", 648 | "kunne", 649 | "ind", 650 | "når", 651 | "være", 652 | "dog", 653 | "noget", 654 | "ville", 655 | "jo", 656 | "deres", 657 | "efter", 658 | "ned", 659 | "skulle", 660 | "denne", 661 | "end", 662 | "dette", 663 | "mit", 664 | "også", 665 | "under", 666 | "have", 667 | "dig", 668 | "anden", 669 | "hende", 670 | "mine", 671 | "alt", 672 | "meget", 673 | "sit", 674 | "sine", 675 | "vor", 676 | "mod", 677 | "disse", 678 | "hvis", 679 | "din", 680 | "nogle", 681 | "hos", 682 | "blive", 683 | "mange", 684 | "ad", 685 | "bliver", 686 | "hendes", 687 | "været", 688 | "thi", 689 | "jer", 690 | "sådan" 691 | ], 692 | "pt": [ 693 | "de", 694 | "a", 695 | "o", 696 | "que", 697 | "e", 698 | "do", 699 | "da", 700 | "em", 701 | "um", 702 | "para", 703 | "com", 704 | "não", 705 | "uma", 706 | "os", 707 | "no", 708 | "se", 709 | "na", 710 | "por", 711 | "mais", 712 | "as", 713 | "dos", 714 | "como", 715 | "mas", 716 | "ao", 717 | "ele", 718 | "das", 719 | "à", 720 | "seu", 721 | "sua", 722 | "ou", 723 | "quando", 724 | "muito", 725 | "nos", 726 | "já", 727 | "eu", 728 | "também", 729 | "só", 730 | "pelo", 731 | "pela", 732 | "até", 733 | "isso", 734 | "ela", 735 | "entre", 736 | "depois", 737 | "sem", 738 | "mesmo", 739 | "aos", 740 | "seus", 741 | "quem", 742 | "nas", 743 | "me", 744 | "esse", 745 | "eles", 746 | "você", 747 | "essa", 748 | "num", 749 | "nem", 750 | "suas", 751 | "meu", 752 | "às", 753 | "minha", 754 | "numa", 755 | "pelos", 756 | "elas", 757 | "qual", 758 | "nós", 759 | "lhe", 760 | "deles", 761 | "essas", 762 | "esses", 763 | "pelas", 764 | "este", 765 | "dele", 766 | "tu", 767 | "te", 768 | "vocês", 769 | "vos", 770 | "lhes", 771 | "meus", 772 | "minhas", 773 | "teu", 774 | "tua", 775 | "teus", 776 | "tuas", 777 | "nosso", 778 | "nossa", 779 | "nossos", 780 | "nossas", 781 | "dela", 782 | "delas", 783 | "esta", 784 | "estes", 785 | "estas", 786 | "aquele", 787 | "aquela", 788 | "aqueles", 789 | "aquelas", 790 | "isto", 791 | "aquilo", 792 | "estou", 793 | "está", 794 | "estamos", 795 | "estão", 796 | "estive", 797 | "esteve", 798 | "estivemos", 799 | "estiveram", 800 | "estava", 801 | "estávamos", 802 | "estavam", 803 | "estivera", 804 | "estivéramos", 805 | "esteja", 806 | "estejamos", 807 | "estejam", 808 | "estivesse", 809 | "estivéssemos", 810 | 
"estivessem", 811 | "estiver", 812 | "estivermos", 813 | "estiverem", 814 | "hei", 815 | "há", 816 | "havemos", 817 | "hão", 818 | "houve", 819 | "houvemos", 820 | "houveram", 821 | "houvera", 822 | "houvéramos", 823 | "haja", 824 | "hajamos", 825 | "hajam", 826 | "houvesse", 827 | "houvéssemos", 828 | "houvessem", 829 | "houver", 830 | "houvermos", 831 | "houverem", 832 | "houverei", 833 | "houverá", 834 | "houveremos", 835 | "houverão", 836 | "houveria", 837 | "houveríamos", 838 | "houveriam", 839 | "sou", 840 | "somos", 841 | "são", 842 | "era", 843 | "éramos", 844 | "eram", 845 | "fui", 846 | "foi", 847 | "fomos", 848 | "foram", 849 | "fora", 850 | "fôramos", 851 | "seja", 852 | "sejamos", 853 | "sejam", 854 | "fosse", 855 | "fôssemos", 856 | "fossem", 857 | "for", 858 | "formos", 859 | "forem", 860 | "serei", 861 | "será", 862 | "seremos", 863 | "serão", 864 | "seria", 865 | "seríamos", 866 | "seriam", 867 | "tenho", 868 | "tem", 869 | "temos", 870 | "tém", 871 | "tinha", 872 | "tínhamos", 873 | "tinham", 874 | "tive", 875 | "teve", 876 | "tivemos", 877 | "tiveram", 878 | "tivera", 879 | "tivéramos", 880 | "tenha", 881 | "tenhamos", 882 | "tenham", 883 | "tivesse", 884 | "tivéssemos", 885 | "tivessem", 886 | "tiver", 887 | "tivermos", 888 | "tiverem", 889 | "terei", 890 | "terá", 891 | "teremos", 892 | "terão", 893 | "teria", 894 | "teríamos", 895 | "teriam" 896 | ], 897 | "no": [ 898 | "og", 899 | "i", 900 | "jeg", 901 | "det", 902 | "at", 903 | "en", 904 | "et", 905 | "den", 906 | "til", 907 | "er", 908 | "som", 909 | "på", 910 | "de", 911 | "med", 912 | "han", 913 | "av", 914 | "ikke", 915 | "ikkje", 916 | "der", 917 | "så", 918 | "var", 919 | "meg", 920 | "seg", 921 | "men", 922 | "ett", 923 | "har", 924 | "om", 925 | "vi", 926 | "min", 927 | "mitt", 928 | "ha", 929 | "hadde", 930 | "hun", 931 | "nå", 932 | "over", 933 | "da", 934 | "ved", 935 | "fra", 936 | "du", 937 | "ut", 938 | "sin", 939 | "dem", 940 | "oss", 941 | "opp", 942 | "man", 943 | "kan", 944 | "hans", 945 | "hvor", 946 | "eller", 947 | "hva", 948 | "skal", 949 | "selv", 950 | "sjøl", 951 | "her", 952 | "alle", 953 | "vil", 954 | "bli", 955 | "ble", 956 | "blei", 957 | "blitt", 958 | "kunne", 959 | "inn", 960 | "når", 961 | "være", 962 | "kom", 963 | "noen", 964 | "noe", 965 | "ville", 966 | "dere", 967 | "som", 968 | "deres", 969 | "kun", 970 | "ja", 971 | "etter", 972 | "ned", 973 | "skulle", 974 | "denne", 975 | "for", 976 | "deg", 977 | "si", 978 | "sine", 979 | "sitt", 980 | "mot", 981 | "å", 982 | "meget", 983 | "hvorfor", 984 | "dette", 985 | "disse", 986 | "uten", 987 | "hvordan", 988 | "ingen", 989 | "din", 990 | "ditt", 991 | "blir", 992 | "samme", 993 | "hvilken", 994 | "hvilke", 995 | "sånn", 996 | "inni", 997 | "mellom", 998 | "vår", 999 | "hver", 1000 | "hvem", 1001 | "vors", 1002 | "hvis", 1003 | "både", 1004 | "bare", 1005 | "enn", 1006 | "fordi", 1007 | "før", 1008 | "mange", 1009 | "også", 1010 | "slik", 1011 | "vært", 1012 | "være", 1013 | "båe", 1014 | "begge", 1015 | "siden", 1016 | "dykk", 1017 | "dykkar", 1018 | "dei", 1019 | "deira", 1020 | "deires", 1021 | "deim", 1022 | "di", 1023 | "då", 1024 | "eg", 1025 | "ein", 1026 | "eit", 1027 | "eitt", 1028 | "elles", 1029 | "honom", 1030 | "hjå", 1031 | "ho", 1032 | "hoe", 1033 | "henne", 1034 | "hennar", 1035 | "hennes", 1036 | "hoss", 1037 | "hossen", 1038 | "ikkje", 1039 | "ingi", 1040 | "inkje", 1041 | "korleis", 1042 | "korso", 1043 | "kva", 1044 | "kvar", 1045 | "kvarhelst", 1046 | "kven", 1047 | "kvi", 1048 | "kvifor", 1049 | "me", 1050 | 
"medan", 1051 | "mi", 1052 | "mine", 1053 | "mykje", 1054 | "no", 1055 | "nokon", 1056 | "noka", 1057 | "nokor", 1058 | "noko", 1059 | "nokre", 1060 | "si", 1061 | "sia", 1062 | "sidan", 1063 | "so", 1064 | "somt", 1065 | "somme", 1066 | "um", 1067 | "upp", 1068 | "vere", 1069 | "vore", 1070 | "verte", 1071 | "vort", 1072 | "varte", 1073 | "vart" 1074 | ], 1075 | "de": [ 1076 | "aber", 1077 | "alle", 1078 | "allem", 1079 | "allen", 1080 | "aller", 1081 | "alles", 1082 | "als", 1083 | "also", 1084 | "am", 1085 | "an", 1086 | "ander", 1087 | "andere", 1088 | "anderem", 1089 | "anderen", 1090 | "anderer", 1091 | "anderes", 1092 | "anderm", 1093 | "andern", 1094 | "anderr", 1095 | "anders", 1096 | "auch", 1097 | "auf", 1098 | "aus", 1099 | "bei", 1100 | "bin", 1101 | "bis", 1102 | "bist", 1103 | "da", 1104 | "damit", 1105 | "dann", 1106 | "der", 1107 | "den", 1108 | "des", 1109 | "dem", 1110 | "die", 1111 | "das", 1112 | "daß", 1113 | "derselbe", 1114 | "derselben", 1115 | "denselben", 1116 | "desselben", 1117 | "demselben", 1118 | "dieselbe", 1119 | "dieselben", 1120 | "dasselbe", 1121 | "dazu", 1122 | "dein", 1123 | "deine", 1124 | "deinem", 1125 | "deinen", 1126 | "deiner", 1127 | "deines", 1128 | "denn", 1129 | "derer", 1130 | "dessen", 1131 | "dich", 1132 | "dir", 1133 | "du", 1134 | "dies", 1135 | "diese", 1136 | "diesem", 1137 | "diesen", 1138 | "dieser", 1139 | "dieses", 1140 | "doch", 1141 | "dort", 1142 | "durch", 1143 | "ein", 1144 | "eine", 1145 | "einem", 1146 | "einen", 1147 | "einer", 1148 | "eines", 1149 | "einig", 1150 | "einige", 1151 | "einigem", 1152 | "einigen", 1153 | "einiger", 1154 | "einiges", 1155 | "einmal", 1156 | "er", 1157 | "ihn", 1158 | "ihm", 1159 | "es", 1160 | "etwas", 1161 | "euer", 1162 | "eure", 1163 | "eurem", 1164 | "euren", 1165 | "eurer", 1166 | "eures", 1167 | "für", 1168 | "gegen", 1169 | "gewesen", 1170 | "hab", 1171 | "habe", 1172 | "haben", 1173 | "hat", 1174 | "hatte", 1175 | "hatten", 1176 | "hier", 1177 | "hin", 1178 | "hinter", 1179 | "ich", 1180 | "mich", 1181 | "mir", 1182 | "ihr", 1183 | "ihre", 1184 | "ihrem", 1185 | "ihren", 1186 | "ihrer", 1187 | "ihres", 1188 | "euch", 1189 | "im", 1190 | "in", 1191 | "indem", 1192 | "ins", 1193 | "ist", 1194 | "jede", 1195 | "jedem", 1196 | "jeden", 1197 | "jeder", 1198 | "jedes", 1199 | "jene", 1200 | "jenem", 1201 | "jenen", 1202 | "jener", 1203 | "jenes", 1204 | "jetzt", 1205 | "kann", 1206 | "kein", 1207 | "keine", 1208 | "keinem", 1209 | "keinen", 1210 | "keiner", 1211 | "keines", 1212 | "können", 1213 | "könnte", 1214 | "machen", 1215 | "man", 1216 | "manche", 1217 | "manchem", 1218 | "manchen", 1219 | "mancher", 1220 | "manches", 1221 | "mein", 1222 | "meine", 1223 | "meinem", 1224 | "meinen", 1225 | "meiner", 1226 | "meines", 1227 | "mit", 1228 | "muss", 1229 | "musste", 1230 | "nach", 1231 | "nicht", 1232 | "nichts", 1233 | "noch", 1234 | "nun", 1235 | "nur", 1236 | "ob", 1237 | "oder", 1238 | "ohne", 1239 | "sehr", 1240 | "sein", 1241 | "seine", 1242 | "seinem", 1243 | "seinen", 1244 | "seiner", 1245 | "seines", 1246 | "selbst", 1247 | "sich", 1248 | "sie", 1249 | "ihnen", 1250 | "sind", 1251 | "so", 1252 | "solche", 1253 | "solchem", 1254 | "solchen", 1255 | "solcher", 1256 | "solches", 1257 | "soll", 1258 | "sollte", 1259 | "sondern", 1260 | "sonst", 1261 | "über", 1262 | "um", 1263 | "und", 1264 | "uns", 1265 | "unsere", 1266 | "unserem", 1267 | "unseren", 1268 | "unser", 1269 | "unseres", 1270 | "unter", 1271 | "viel", 1272 | "vom", 1273 | "von", 1274 | "vor", 1275 | "während", 1276 | 
"war", 1277 | "waren", 1278 | "warst", 1279 | "was", 1280 | "weg", 1281 | "weil", 1282 | "weiter", 1283 | "welche", 1284 | "welchem", 1285 | "welchen", 1286 | "welcher", 1287 | "welches", 1288 | "wenn", 1289 | "werde", 1290 | "werden", 1291 | "wie", 1292 | "wieder", 1293 | "will", 1294 | "wir", 1295 | "wird", 1296 | "wirst", 1297 | "wo", 1298 | "wollen", 1299 | "wollte", 1300 | "würde", 1301 | "würden", 1302 | "zu", 1303 | "zum", 1304 | "zur", 1305 | "zwar", 1306 | "zwischen" 1307 | ], 1308 | "tr": [ 1309 | "acaba", 1310 | "ama", 1311 | "aslında", 1312 | "az", 1313 | "bazı", 1314 | "belki", 1315 | "biri", 1316 | "birkaç", 1317 | "birşey", 1318 | "biz", 1319 | "bu", 1320 | "çok", 1321 | "çünkü", 1322 | "da", 1323 | "daha", 1324 | "de", 1325 | "defa", 1326 | "diye", 1327 | "eğer", 1328 | "en", 1329 | "gibi", 1330 | "hem", 1331 | "hep", 1332 | "hepsi", 1333 | "her", 1334 | "hiç", 1335 | "için", 1336 | "ile", 1337 | "ise", 1338 | "kez", 1339 | "ki", 1340 | "kim", 1341 | "mı", 1342 | "mu", 1343 | "mü", 1344 | "nasıl", 1345 | "ne", 1346 | "neden", 1347 | "nerde", 1348 | "nerede", 1349 | "nereye", 1350 | "niçin", 1351 | "niye", 1352 | "o", 1353 | "sanki", 1354 | "şey", 1355 | "siz", 1356 | "şu", 1357 | "tüm", 1358 | "ve", 1359 | "veya", 1360 | "ya", 1361 | "yani" 1362 | ], 1363 | "it": [ 1364 | "ad", 1365 | "al", 1366 | "allo", 1367 | "ai", 1368 | "agli", 1369 | "all", 1370 | "agl", 1371 | "alla", 1372 | "alle", 1373 | "con", 1374 | "col", 1375 | "coi", 1376 | "da", 1377 | "dal", 1378 | "dallo", 1379 | "dai", 1380 | "dagli", 1381 | "dall", 1382 | "dagl", 1383 | "dalla", 1384 | "dalle", 1385 | "di", 1386 | "del", 1387 | "dello", 1388 | "dei", 1389 | "degli", 1390 | "dell", 1391 | "degl", 1392 | "della", 1393 | "delle", 1394 | "in", 1395 | "nel", 1396 | "nello", 1397 | "nei", 1398 | "negli", 1399 | "nell", 1400 | "negl", 1401 | "nella", 1402 | "nelle", 1403 | "su", 1404 | "sul", 1405 | "sullo", 1406 | "sui", 1407 | "sugli", 1408 | "sull", 1409 | "sugl", 1410 | "sulla", 1411 | "sulle", 1412 | "per", 1413 | "tra", 1414 | "contro", 1415 | "io", 1416 | "tu", 1417 | "lui", 1418 | "lei", 1419 | "noi", 1420 | "voi", 1421 | "loro", 1422 | "mio", 1423 | "mia", 1424 | "miei", 1425 | "mie", 1426 | "tuo", 1427 | "tua", 1428 | "tuoi", 1429 | "tue", 1430 | "suo", 1431 | "sua", 1432 | "suoi", 1433 | "sue", 1434 | "nostro", 1435 | "nostra", 1436 | "nostri", 1437 | "nostre", 1438 | "vostro", 1439 | "vostra", 1440 | "vostri", 1441 | "vostre", 1442 | "mi", 1443 | "ti", 1444 | "ci", 1445 | "vi", 1446 | "lo", 1447 | "la", 1448 | "li", 1449 | "le", 1450 | "gli", 1451 | "ne", 1452 | "il", 1453 | "un", 1454 | "uno", 1455 | "una", 1456 | "ma", 1457 | "ed", 1458 | "se", 1459 | "perché", 1460 | "anche", 1461 | "come", 1462 | "dov", 1463 | "dove", 1464 | "che", 1465 | "chi", 1466 | "cui", 1467 | "non", 1468 | "più", 1469 | "quale", 1470 | "quanto", 1471 | "quanti", 1472 | "quanta", 1473 | "quante", 1474 | "quello", 1475 | "quelli", 1476 | "quella", 1477 | "quelle", 1478 | "questo", 1479 | "questi", 1480 | "questa", 1481 | "queste", 1482 | "si", 1483 | "tutto", 1484 | "tutti", 1485 | "a", 1486 | "c", 1487 | "e", 1488 | "i", 1489 | "l", 1490 | "o", 1491 | "ho", 1492 | "hai", 1493 | "ha", 1494 | "abbiamo", 1495 | "avete", 1496 | "hanno", 1497 | "abbia", 1498 | "abbiate", 1499 | "abbiano", 1500 | "avrò", 1501 | "avrai", 1502 | "avrà", 1503 | "avremo", 1504 | "avrete", 1505 | "avranno", 1506 | "avrei", 1507 | "avresti", 1508 | "avrebbe", 1509 | "avremmo", 1510 | "avreste", 1511 | "avrebbero", 1512 | "avevo", 1513 | "avevi", 1514 | 
"aveva", 1515 | "avevamo", 1516 | "avevate", 1517 | "avevano", 1518 | "ebbi", 1519 | "avesti", 1520 | "ebbe", 1521 | "avemmo", 1522 | "aveste", 1523 | "ebbero", 1524 | "avessi", 1525 | "avesse", 1526 | "avessimo", 1527 | "avessero", 1528 | "avendo", 1529 | "avuto", 1530 | "avuta", 1531 | "avuti", 1532 | "avute", 1533 | "sono", 1534 | "sei", 1535 | "è", 1536 | "siamo", 1537 | "siete", 1538 | "sia", 1539 | "siate", 1540 | "siano", 1541 | "sarò", 1542 | "sarai", 1543 | "sarà", 1544 | "saremo", 1545 | "sarete", 1546 | "saranno", 1547 | "sarei", 1548 | "saresti", 1549 | "sarebbe", 1550 | "saremmo", 1551 | "sareste", 1552 | "sarebbero", 1553 | "ero", 1554 | "eri", 1555 | "era", 1556 | "eravamo", 1557 | "eravate", 1558 | "erano", 1559 | "fui", 1560 | "fosti", 1561 | "fu", 1562 | "fummo", 1563 | "foste", 1564 | "furono", 1565 | "fossi", 1566 | "fosse", 1567 | "fossimo", 1568 | "fossero", 1569 | "essendo", 1570 | "faccio", 1571 | "fai", 1572 | "facciamo", 1573 | "fanno", 1574 | "faccia", 1575 | "facciate", 1576 | "facciano", 1577 | "farò", 1578 | "farai", 1579 | "farà", 1580 | "faremo", 1581 | "farete", 1582 | "faranno", 1583 | "farei", 1584 | "faresti", 1585 | "farebbe", 1586 | "faremmo", 1587 | "fareste", 1588 | "farebbero", 1589 | "facevo", 1590 | "facevi", 1591 | "faceva", 1592 | "facevamo", 1593 | "facevate", 1594 | "facevano", 1595 | "feci", 1596 | "facesti", 1597 | "fece", 1598 | "facemmo", 1599 | "faceste", 1600 | "fecero", 1601 | "facessi", 1602 | "facesse", 1603 | "facessimo", 1604 | "facessero", 1605 | "facendo", 1606 | "sto", 1607 | "stai", 1608 | "sta", 1609 | "stiamo", 1610 | "stanno", 1611 | "stia", 1612 | "stiate", 1613 | "stiano", 1614 | "starò", 1615 | "starai", 1616 | "starà", 1617 | "staremo", 1618 | "starete", 1619 | "staranno", 1620 | "starei", 1621 | "staresti", 1622 | "starebbe", 1623 | "staremmo", 1624 | "stareste", 1625 | "starebbero", 1626 | "stavo", 1627 | "stavi", 1628 | "stava", 1629 | "stavamo", 1630 | "stavate", 1631 | "stavano", 1632 | "stetti", 1633 | "stesti", 1634 | "stette", 1635 | "stemmo", 1636 | "steste", 1637 | "stettero", 1638 | "stessi", 1639 | "stesse", 1640 | "stessimo", 1641 | "stessero", 1642 | "stando" 1643 | ], 1644 | "sv": [ 1645 | "och", 1646 | "det", 1647 | "att", 1648 | "i", 1649 | "en", 1650 | "jag", 1651 | "hon", 1652 | "som", 1653 | "han", 1654 | "på", 1655 | "den", 1656 | "med", 1657 | "var", 1658 | "sig", 1659 | "för", 1660 | "så", 1661 | "till", 1662 | "är", 1663 | "men", 1664 | "ett", 1665 | "om", 1666 | "hade", 1667 | "de", 1668 | "av", 1669 | "icke", 1670 | "mig", 1671 | "du", 1672 | "henne", 1673 | "då", 1674 | "sin", 1675 | "nu", 1676 | "har", 1677 | "inte", 1678 | "hans", 1679 | "honom", 1680 | "skulle", 1681 | "hennes", 1682 | "där", 1683 | "min", 1684 | "man", 1685 | "ej", 1686 | "vid", 1687 | "kunde", 1688 | "något", 1689 | "från", 1690 | "ut", 1691 | "när", 1692 | "efter", 1693 | "upp", 1694 | "vi", 1695 | "dem", 1696 | "vara", 1697 | "vad", 1698 | "över", 1699 | "än", 1700 | "dig", 1701 | "kan", 1702 | "sina", 1703 | "här", 1704 | "ha", 1705 | "mot", 1706 | "alla", 1707 | "under", 1708 | "någon", 1709 | "eller", 1710 | "allt", 1711 | "mycket", 1712 | "sedan", 1713 | "ju", 1714 | "denna", 1715 | "själv", 1716 | "detta", 1717 | "åt", 1718 | "utan", 1719 | "varit", 1720 | "hur", 1721 | "ingen", 1722 | "mitt", 1723 | "ni", 1724 | "bli", 1725 | "blev", 1726 | "oss", 1727 | "din", 1728 | "dessa", 1729 | "några", 1730 | "deras", 1731 | "blir", 1732 | "mina", 1733 | "samma", 1734 | "vilken", 1735 | "er", 1736 | "sådan", 1737 | "vår", 
1738 | "blivit", 1739 | "dess", 1740 | "inom", 1741 | "mellan", 1742 | "sådant", 1743 | "varför", 1744 | "varje", 1745 | "vilka", 1746 | "ditt", 1747 | "vem", 1748 | "vilket", 1749 | "sitta", 1750 | "sådana", 1751 | "vart", 1752 | "dina", 1753 | "vars", 1754 | "vårt", 1755 | "våra", 1756 | "ert", 1757 | "era", 1758 | "vilkas" 1759 | ], 1760 | "ar": [ 1761 | "إذ", 1762 | "إذا", 1763 | "إذما", 1764 | "إذن", 1765 | "أف", 1766 | "أقل", 1767 | "أكثر", 1768 | "ألا", 1769 | "إلا", 1770 | "التي", 1771 | "الذي", 1772 | "الذين", 1773 | "اللاتي", 1774 | "اللائي", 1775 | "اللتان", 1776 | "اللتيا", 1777 | "اللتين", 1778 | "اللذان", 1779 | "اللذين", 1780 | "اللواتي", 1781 | "إلى", 1782 | "إليك", 1783 | "إليكم", 1784 | "إليكما", 1785 | "إليكن", 1786 | "أم", 1787 | "أما", 1788 | "أما", 1789 | "إما", 1790 | "أن", 1791 | "إن", 1792 | "إنا", 1793 | "أنا", 1794 | "أنت", 1795 | "أنتم", 1796 | "أنتما", 1797 | "أنتن", 1798 | "إنما", 1799 | "إنه", 1800 | "أنى", 1801 | "أنى", 1802 | "آه", 1803 | "آها", 1804 | "أو", 1805 | "أولاء", 1806 | "أولئك", 1807 | "أوه", 1808 | "آي", 1809 | "أي", 1810 | "أيها", 1811 | "إي", 1812 | "أين", 1813 | "أين", 1814 | "أينما", 1815 | "إيه", 1816 | "بخ", 1817 | "بس", 1818 | "بعد", 1819 | "بعض", 1820 | "بك", 1821 | "بكم", 1822 | "بكم", 1823 | "بكما", 1824 | "بكن", 1825 | "بل", 1826 | "بلى", 1827 | "بما", 1828 | "بماذا", 1829 | "بمن", 1830 | "بنا", 1831 | "به", 1832 | "بها", 1833 | "بهم", 1834 | "بهما", 1835 | "بهن", 1836 | "بي", 1837 | "بين", 1838 | "بيد", 1839 | "تلك", 1840 | "تلكم", 1841 | "تلكما", 1842 | "ته", 1843 | "تي", 1844 | "تين", 1845 | "تينك", 1846 | "ثم", 1847 | "ثمة", 1848 | "حاشا", 1849 | "حبذا", 1850 | "حتى", 1851 | "حيث", 1852 | "حيثما", 1853 | "حين", 1854 | "خلا", 1855 | "دون", 1856 | "ذا", 1857 | "ذات", 1858 | "ذاك", 1859 | "ذان", 1860 | "ذانك", 1861 | "ذلك", 1862 | "ذلكم", 1863 | "ذلكما", 1864 | "ذلكن", 1865 | "ذه", 1866 | "ذو", 1867 | "ذوا", 1868 | "ذواتا", 1869 | "ذواتي", 1870 | "ذي", 1871 | "ذين", 1872 | "ذينك", 1873 | "ريث", 1874 | "سوف", 1875 | "سوى", 1876 | "شتان", 1877 | "عدا", 1878 | "عسى", 1879 | "عل", 1880 | "على", 1881 | "عليك", 1882 | "عليه", 1883 | "عما", 1884 | "عن", 1885 | "عند", 1886 | "غير", 1887 | "فإذا", 1888 | "فإن", 1889 | "فلا", 1890 | "فمن", 1891 | "في", 1892 | "فيم", 1893 | "فيما", 1894 | "فيه", 1895 | "فيها", 1896 | "قد", 1897 | "كأن", 1898 | "كأنما", 1899 | "كأي", 1900 | "كأين", 1901 | "كذا", 1902 | "كذلك", 1903 | "كل", 1904 | "كلا", 1905 | "كلاهما", 1906 | "كلتا", 1907 | "كلما", 1908 | "كليكما", 1909 | "كليهما", 1910 | "كم", 1911 | "كم", 1912 | "كما", 1913 | "كي", 1914 | "كيت", 1915 | "كيف", 1916 | "كيفما", 1917 | "لا", 1918 | "لاسيما", 1919 | "لدى", 1920 | "لست", 1921 | "لستم", 1922 | "لستما", 1923 | "لستن", 1924 | "لسن", 1925 | "لسنا", 1926 | "لعل", 1927 | "لك", 1928 | "لكم", 1929 | "لكما", 1930 | "لكن", 1931 | "لكنما", 1932 | "لكي", 1933 | "لكيلا", 1934 | "لم", 1935 | "لما", 1936 | "لن", 1937 | "لنا", 1938 | "له", 1939 | "لها", 1940 | "لهم", 1941 | "لهما", 1942 | "لهن", 1943 | "لو", 1944 | "لولا", 1945 | "لوما", 1946 | "لي", 1947 | "لئن", 1948 | "ليت", 1949 | "ليس", 1950 | "ليسا", 1951 | "ليست", 1952 | "ليستا", 1953 | "ليسوا", 1954 | "ما", 1955 | "ماذا", 1956 | "متى", 1957 | "مذ", 1958 | "مع", 1959 | "مما", 1960 | "ممن", 1961 | "من", 1962 | "منه", 1963 | "منها", 1964 | "منذ", 1965 | "مه", 1966 | "مهما", 1967 | "نحن", 1968 | "نحو", 1969 | "نعم", 1970 | "ها", 1971 | "هاتان", 1972 | "هاته", 1973 | "هاتي", 1974 | "هاتين", 1975 | "هاك", 1976 | "هاهنا", 1977 | "هذا", 1978 | "هذان", 1979 | "هذه", 1980 | "هذي", 1981 | "هذين", 1982 | "هكذا", 1983 
| "هل", 1984 | "هلا", 1985 | "هم", 1986 | "هما", 1987 | "هن", 1988 | "هنا", 1989 | "هناك", 1990 | "هنالك", 1991 | "هو", 1992 | "هؤلاء", 1993 | "هي", 1994 | "هيا", 1995 | "هيت", 1996 | "هيهات", 1997 | "والذي", 1998 | "والذين", 1999 | "وإذ", 2000 | "وإذا", 2001 | "وإن", 2002 | "ولا", 2003 | "ولكن", 2004 | "ولو", 2005 | "وما", 2006 | "ومن", 2007 | "وهو", 2008 | "يا" 2009 | ], 2010 | "fi": [ 2011 | "olla", 2012 | "olen", 2013 | "olet", 2014 | "on", 2015 | "olemme", 2016 | "olette", 2017 | "ovat", 2018 | "ole", 2019 | "oli", 2020 | "olisi", 2021 | "olisit", 2022 | "olisin", 2023 | "olisimme", 2024 | "olisitte", 2025 | "olisivat", 2026 | "olit", 2027 | "olin", 2028 | "olimme", 2029 | "olitte", 2030 | "olivat", 2031 | "ollut", 2032 | "olleet", 2033 | "en", 2034 | "et", 2035 | "ei", 2036 | "emme", 2037 | "ette", 2038 | "eivät", 2039 | "minä", 2040 | "minun", 2041 | "minut", 2042 | "minua", 2043 | "minussa", 2044 | "minusta", 2045 | "minuun", 2046 | "minulla", 2047 | "minulta", 2048 | "minulle", 2049 | "sinä", 2050 | "sinun", 2051 | "sinut", 2052 | "sinua", 2053 | "sinussa", 2054 | "sinusta", 2055 | "sinuun", 2056 | "sinulla", 2057 | "sinulta", 2058 | "sinulle", 2059 | "hän", 2060 | "hänen", 2061 | "hänet", 2062 | "häntä", 2063 | "hänessä", 2064 | "hänestä", 2065 | "häneen", 2066 | "hänellä", 2067 | "häneltä", 2068 | "hänelle", 2069 | "me", 2070 | "meidän", 2071 | "meidät", 2072 | "meitä", 2073 | "meissä", 2074 | "meistä", 2075 | "meihin", 2076 | "meillä", 2077 | "meiltä", 2078 | "meille", 2079 | "te", 2080 | "teidän", 2081 | "teidät", 2082 | "teitä", 2083 | "teissä", 2084 | "teistä", 2085 | "teihin", 2086 | "teillä", 2087 | "teiltä", 2088 | "teille", 2089 | "he", 2090 | "heidän", 2091 | "heidät", 2092 | "heitä", 2093 | "heissä", 2094 | "heistä", 2095 | "heihin", 2096 | "heillä", 2097 | "heiltä", 2098 | "heille", 2099 | "tämä", 2100 | "tämän", 2101 | "tätä", 2102 | "tässä", 2103 | "tästä", 2104 | "tähän", 2105 | "tallä", 2106 | "tältä", 2107 | "tälle", 2108 | "tänä", 2109 | "täksi", 2110 | "tuo", 2111 | "tuon", 2112 | "tuotä", 2113 | "tuossa", 2114 | "tuosta", 2115 | "tuohon", 2116 | "tuolla", 2117 | "tuolta", 2118 | "tuolle", 2119 | "tuona", 2120 | "tuoksi", 2121 | "se", 2122 | "sen", 2123 | "sitä", 2124 | "siinä", 2125 | "siitä", 2126 | "siihen", 2127 | "sillä", 2128 | "siltä", 2129 | "sille", 2130 | "sinä", 2131 | "siksi", 2132 | "nämä", 2133 | "näiden", 2134 | "näitä", 2135 | "näissä", 2136 | "näistä", 2137 | "näihin", 2138 | "näillä", 2139 | "näiltä", 2140 | "näille", 2141 | "näinä", 2142 | "näiksi", 2143 | "nuo", 2144 | "noiden", 2145 | "noita", 2146 | "noissa", 2147 | "noista", 2148 | "noihin", 2149 | "noilla", 2150 | "noilta", 2151 | "noille", 2152 | "noina", 2153 | "noiksi", 2154 | "ne", 2155 | "niiden", 2156 | "niitä", 2157 | "niissä", 2158 | "niistä", 2159 | "niihin", 2160 | "niillä", 2161 | "niiltä", 2162 | "niille", 2163 | "niinä", 2164 | "niiksi", 2165 | "kuka", 2166 | "kenen", 2167 | "kenet", 2168 | "ketä", 2169 | "kenessä", 2170 | "kenestä", 2171 | "keneen", 2172 | "kenellä", 2173 | "keneltä", 2174 | "kenelle", 2175 | "kenenä", 2176 | "keneksi", 2177 | "ketkä", 2178 | "keiden", 2179 | "ketkä", 2180 | "keitä", 2181 | "keissä", 2182 | "keistä", 2183 | "keihin", 2184 | "keillä", 2185 | "keiltä", 2186 | "keille", 2187 | "keinä", 2188 | "keiksi", 2189 | "mikä", 2190 | "minkä", 2191 | "minkä", 2192 | "mitä", 2193 | "missä", 2194 | "mistä", 2195 | "mihin", 2196 | "millä", 2197 | "miltä", 2198 | "mille", 2199 | "minä", 2200 | "miksi", 2201 | "mitkä", 2202 | "joka", 2203 | "jonka", 2204 | 
"jota", 2205 | "jossa", 2206 | "josta", 2207 | "johon", 2208 | "jolla", 2209 | "jolta", 2210 | "jolle", 2211 | "jona", 2212 | "joksi", 2213 | "jotka", 2214 | "joiden", 2215 | "joita", 2216 | "joissa", 2217 | "joista", 2218 | "joihin", 2219 | "joilla", 2220 | "joilta", 2221 | "joille", 2222 | "joina", 2223 | "joiksi", 2224 | "että", 2225 | "ja", 2226 | "jos", 2227 | "koska", 2228 | "kuin", 2229 | "mutta", 2230 | "niin", 2231 | "sekä", 2232 | "sillä", 2233 | "tai", 2234 | "vaan", 2235 | "vai", 2236 | "vaikka", 2237 | "kanssa", 2238 | "mukaan", 2239 | "noin", 2240 | "poikki", 2241 | "yli", 2242 | "kun", 2243 | "niin", 2244 | "nyt", 2245 | "itse" 2246 | ], 2247 | "es": [ 2248 | "de", 2249 | "la", 2250 | "que", 2251 | "el", 2252 | "en", 2253 | "y", 2254 | "a", 2255 | "los", 2256 | "del", 2257 | "se", 2258 | "las", 2259 | "por", 2260 | "un", 2261 | "para", 2262 | "con", 2263 | "no", 2264 | "una", 2265 | "su", 2266 | "al", 2267 | "lo", 2268 | "como", 2269 | "más", 2270 | "pero", 2271 | "sus", 2272 | "le", 2273 | "ya", 2274 | "o", 2275 | "este", 2276 | "sí", 2277 | "porque", 2278 | "esta", 2279 | "entre", 2280 | "cuando", 2281 | "muy", 2282 | "sin", 2283 | "sobre", 2284 | "también", 2285 | "me", 2286 | "hasta", 2287 | "hay", 2288 | "donde", 2289 | "quien", 2290 | "desde", 2291 | "todo", 2292 | "nos", 2293 | "durante", 2294 | "todos", 2295 | "uno", 2296 | "les", 2297 | "ni", 2298 | "contra", 2299 | "otros", 2300 | "ese", 2301 | "eso", 2302 | "ante", 2303 | "ellos", 2304 | "e", 2305 | "esto", 2306 | "mí", 2307 | "antes", 2308 | "algunos", 2309 | "qué", 2310 | "unos", 2311 | "yo", 2312 | "otro", 2313 | "otras", 2314 | "otra", 2315 | "él", 2316 | "tanto", 2317 | "esa", 2318 | "estos", 2319 | "mucho", 2320 | "quienes", 2321 | "nada", 2322 | "muchos", 2323 | "cual", 2324 | "poco", 2325 | "ella", 2326 | "estar", 2327 | "estas", 2328 | "algunas", 2329 | "algo", 2330 | "nosotros", 2331 | "mi", 2332 | "mis", 2333 | "tú", 2334 | "te", 2335 | "ti", 2336 | "tu", 2337 | "tus", 2338 | "ellas", 2339 | "nosotras", 2340 | "vosostros", 2341 | "vosostras", 2342 | "os", 2343 | "mío", 2344 | "mía", 2345 | "míos", 2346 | "mías", 2347 | "tuyo", 2348 | "tuya", 2349 | "tuyos", 2350 | "tuyas", 2351 | "suyo", 2352 | "suya", 2353 | "suyos", 2354 | "suyas", 2355 | "nuestro", 2356 | "nuestra", 2357 | "nuestros", 2358 | "nuestras", 2359 | "vuestro", 2360 | "vuestra", 2361 | "vuestros", 2362 | "vuestras", 2363 | "esos", 2364 | "esas", 2365 | "estoy", 2366 | "estás", 2367 | "está", 2368 | "estamos", 2369 | "estáis", 2370 | "están", 2371 | "esté", 2372 | "estés", 2373 | "estemos", 2374 | "estéis", 2375 | "estén", 2376 | "estaré", 2377 | "estarás", 2378 | "estará", 2379 | "estaremos", 2380 | "estaréis", 2381 | "estarán", 2382 | "estaría", 2383 | "estarías", 2384 | "estaríamos", 2385 | "estaríais", 2386 | "estarían", 2387 | "estaba", 2388 | "estabas", 2389 | "estábamos", 2390 | "estabais", 2391 | "estaban", 2392 | "estuve", 2393 | "estuviste", 2394 | "estuvo", 2395 | "estuvimos", 2396 | "estuvisteis", 2397 | "estuvieron", 2398 | "estuviera", 2399 | "estuvieras", 2400 | "estuviéramos", 2401 | "estuvierais", 2402 | "estuvieran", 2403 | "estuviese", 2404 | "estuvieses", 2405 | "estuviésemos", 2406 | "estuvieseis", 2407 | "estuviesen", 2408 | "estando", 2409 | "estado", 2410 | "estada", 2411 | "estados", 2412 | "estadas", 2413 | "estad", 2414 | "he", 2415 | "has", 2416 | "ha", 2417 | "hemos", 2418 | "habéis", 2419 | "han", 2420 | "haya", 2421 | "hayas", 2422 | "hayamos", 2423 | "hayáis", 2424 | "hayan", 2425 | "habré", 2426 | "habrás", 
2427 | "habrá", 2428 | "habremos", 2429 | "habréis", 2430 | "habrán", 2431 | "habría", 2432 | "habrías", 2433 | "habríamos", 2434 | "habríais", 2435 | "habrían", 2436 | "había", 2437 | "habías", 2438 | "habíamos", 2439 | "habíais", 2440 | "habían", 2441 | "hube", 2442 | "hubiste", 2443 | "hubo", 2444 | "hubimos", 2445 | "hubisteis", 2446 | "hubieron", 2447 | "hubiera", 2448 | "hubieras", 2449 | "hubiéramos", 2450 | "hubierais", 2451 | "hubieran", 2452 | "hubiese", 2453 | "hubieses", 2454 | "hubiésemos", 2455 | "hubieseis", 2456 | "hubiesen", 2457 | "habiendo", 2458 | "habido", 2459 | "habida", 2460 | "habidos", 2461 | "habidas", 2462 | "soy", 2463 | "eres", 2464 | "es", 2465 | "somos", 2466 | "sois", 2467 | "son", 2468 | "sea", 2469 | "seas", 2470 | "seamos", 2471 | "seáis", 2472 | "sean", 2473 | "seré", 2474 | "serás", 2475 | "será", 2476 | "seremos", 2477 | "seréis", 2478 | "serán", 2479 | "sería", 2480 | "serías", 2481 | "seríamos", 2482 | "seríais", 2483 | "serían", 2484 | "era", 2485 | "eras", 2486 | "éramos", 2487 | "erais", 2488 | "eran", 2489 | "fui", 2490 | "fuiste", 2491 | "fue", 2492 | "fuimos", 2493 | "fuisteis", 2494 | "fueron", 2495 | "fuera", 2496 | "fueras", 2497 | "fuéramos", 2498 | "fuerais", 2499 | "fueran", 2500 | "fuese", 2501 | "fueses", 2502 | "fuésemos", 2503 | "fueseis", 2504 | "fuesen", 2505 | "sintiendo", 2506 | "sentido", 2507 | "sentida", 2508 | "sentidos", 2509 | "sentidas", 2510 | "siente", 2511 | "sentid", 2512 | "tengo", 2513 | "tienes", 2514 | "tiene", 2515 | "tenemos", 2516 | "tenéis", 2517 | "tienen", 2518 | "tenga", 2519 | "tengas", 2520 | "tengamos", 2521 | "tengáis", 2522 | "tengan", 2523 | "tendré", 2524 | "tendrás", 2525 | "tendrá", 2526 | "tendremos", 2527 | "tendréis", 2528 | "tendrán", 2529 | "tendría", 2530 | "tendrías", 2531 | "tendríamos", 2532 | "tendríais", 2533 | "tendrían", 2534 | "tenía", 2535 | "tenías", 2536 | "teníamos", 2537 | "teníais", 2538 | "tenían", 2539 | "tuve", 2540 | "tuviste", 2541 | "tuvo", 2542 | "tuvimos", 2543 | "tuvisteis", 2544 | "tuvieron", 2545 | "tuviera", 2546 | "tuvieras", 2547 | "tuviéramos", 2548 | "tuvierais", 2549 | "tuvieran", 2550 | "tuviese", 2551 | "tuvieses", 2552 | "tuviésemos", 2553 | "tuvieseis", 2554 | "tuviesen", 2555 | "teniendo", 2556 | "tenido", 2557 | "tenida", 2558 | "tenidos", 2559 | "tenidas", 2560 | "tened" 2561 | ] 2562 | } -------------------------------------------------------------------------------- /code/helper_functions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from six.moves import cPickle 3 | import os, sys, json, io, re, random 4 | from string import punctuation 5 | from itertools import combinations 6 | from collections import Counter, deque 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | from textblob import TextBlob 9 | import numpy as np 10 | import networkx as nx 11 | import community 12 | import spacy 13 | import textacy.extract 14 | import numba 15 | 16 | punctuation += "‘“" 17 | nlp = spacy.load('en_core_web_sm') 18 | 19 | 20 | # File helpers 21 | 22 | def save_bin(item, filename): 23 | with open(filename, "wb") as f: 24 | cPickle.dump(item, f) 25 | 26 | def load_bin(filename): 27 | ret = None 28 | if os.path.exists(filename): 29 | try: 30 | with open(filename, "rb") as f: 31 | ret = cPickle.load(f) 32 | except: 33 | pass 34 | return ret 35 | 36 | def save_json(variable, filename): 37 | with io.open(filename, "w", encoding="utf-8") as f: 38 | f.write(json.dumps(variable, indent=4, 
ensure_ascii=False)) 39 | 40 | def load_json(filename): 41 | ret = None 42 | if os.path.exists(filename): 43 | try: 44 | with io.open(filename, "r", encoding="utf-8") as f: 45 | ret = json.load(f) 46 | except: 47 | pass 48 | return ret 49 | 50 | def save_gephi_csv(data_map, filename): 51 | with io.open(filename, "w", encoding="utf-8") as f: 52 | f.write("Source,Target,Weight\n") 53 | for source, targets in data_map.items(): 54 | if len(targets) > 0: 55 | for target, weight in targets.items(): 56 | f.write(source + "," + target + "," + str(weight) + "\n") 57 | 58 | def write_gexf(mapping, filename, node_attrs=None, attr_names=None): 59 | nodes = sorted(list(set([m[0] for m in mapping]).union(set([m[1] for m in mapping])))) 60 | vocab = {} 61 | vocab_inv = {} 62 | for index, node in enumerate(nodes): 63 | label = "n" + str(index) 64 | vocab[node] = label 65 | vocab_inv[label] = node 66 | 67 | with open(filename, "w") as f: 68 | header = "" 69 | with open("config/gexf_header.txt", "r") as g: 70 | for line in g: 71 | header += line 72 | f.write(header + "\n") 73 | 74 | if attr_names is not None and len(attr_names) > 0: 75 | f.write("\t\t\n") 76 | for index, name in enumerate(attr_names): 77 | f.write("\t\t\t\n") 78 | f.write("\t\t\n") 79 | 80 | 81 | f.write("\t\t\n") 82 | indent = '\t\t\t' 83 | for index, node in enumerate(nodes): 84 | label = vocab[node] 85 | entry = indent+ "\n" 86 | if attr_names is not None and len(attr_names) > 0: 87 | entry += indent + "\t\n" 88 | for index, name in enumerate(attr_names): 89 | a = node_attrs[node] 90 | entry += indent + "\t\t\n" 91 | entry += indent + "\t\n" 92 | entry += indent + "\n" 93 | f.write(entry) 94 | f.write("\t\t\n") 95 | 96 | f.write("\t\t\n") 97 | for m in mapping: 98 | sid = vocab[m[0]] 99 | tid = vocab[m[1]] 100 | w = m[2] 101 | entry = indent + "\n" 102 | f.write(entry) 103 | f.write("\t\t\n") 104 | f.write("\t\n") 105 | f.write("\n") 106 | 107 | # Preprocessing and tokenization 108 | sw = load_json("config/stopwords.json") 109 | stopwords = sw["en"] 110 | stopwords.append("rt") 111 | 112 | # Preprocess token and return it, if it is valid 113 | def is_valid_token(r): 114 | hashtag = False 115 | if r[0] == "#": 116 | hashtag = True 117 | if r == "rt": 118 | return None 119 | if r[0] == "@": 120 | return None 121 | if r.startswith("htt"): 122 | return None 123 | if r.startswith("t.co/"): 124 | return None 125 | if "&" in r: 126 | return None 127 | r = r.replace("’", "'") 128 | r = r.strip(punctuation) 129 | if r is None or len(r) < 1: 130 | return None 131 | if hashtag == True: 132 | if r[0] != "#": 133 | return "#" + r 134 | return r 135 | 136 | # Tokenize tweet and return tokens. 
137 | # Returns tokens both with and without stopwords 138 | def custom_tokenize(text): 139 | clean_tokens = [] 140 | clean_tokens_with_sw = [] 141 | raw_tokens = text.split() 142 | for r in raw_tokens: 143 | r = r.lower() 144 | tok = is_valid_token(r) 145 | if tok is not None: 146 | if tok not in stopwords: 147 | clean_tokens.append(tok) 148 | clean_tokens_with_sw.append(tok) 149 | return clean_tokens, clean_tokens_with_sw 150 | 151 | 152 | # Functions used for clustering 153 | 154 | 155 | 156 | num_grams = 3 157 | 158 | 159 | @numba.jit(target='cpu', nopython=True, parallel=True) 160 | def fast_cosine_matrix(u, M): 161 | scores = np.zeros(M.shape[0]) 162 | for i in numba.prange(M.shape[0]): 163 | v = M[i] 164 | m = u.shape[0] 165 | udotv = 0 166 | u_norm = 0 167 | v_norm = 0 168 | for j in range(m): 169 | if (np.isnan(u[j])) or (np.isnan(v[j])): 170 | continue 171 | 172 | udotv += u[j] * v[j] 173 | u_norm += u[j] * u[j] 174 | v_norm += v[j] * v[j] 175 | 176 | u_norm = np.sqrt(u_norm) 177 | v_norm = np.sqrt(v_norm) 178 | 179 | if (u_norm == 0) or (v_norm == 0): 180 | ratio = 1.0 181 | else: 182 | ratio = udotv / (u_norm * v_norm) 183 | scores[i] = ratio 184 | return scores 185 | 186 | def get_quick_mapping(vectors): 187 | add_unconnected = False 188 | mapping = [] 189 | t1 = int(time.time()) 190 | #threshold = min(0.9, (0.3 + (len(vectors)/60000))) 191 | threshold = 0.6 192 | total_calcs = ((len(vectors)*len(vectors))/2)-len(vectors) 193 | print("Num vectors: " + str(len(vectors))) 194 | step = round(len(vectors)/100) 195 | print("Initial threshold: " + "%.3f"%threshold) 196 | calcs = 0 197 | for index in range(len(vectors)-1): 198 | if index % 100 == 0: 199 | progress = (calcs/total_calcs)*100 200 | sys.stdout.write("\r") 201 | sys.stdout.flush() 202 | sys.stdout.write("Progress: " + "%.2f"%progress + "%") 203 | sys.stdout.flush() 204 | scores = fast_cosine_matrix(np.array(vectors[index]), np.array(vectors[index+1:])) 205 | calcs += len(vectors)-(index+1) 206 | aw = np.argwhere(scores>=threshold) 207 | temp = [[index, item[0]+index+1, scores[item[0]]] for item in aw if index != item[0]] 208 | if len(temp) > 0: 209 | mapping.extend(temp) 210 | t2 = int(time.time()) 211 | print() 212 | print("Took " + str(t2-t1) + " seconds.") 213 | return mapping 214 | 215 | def get_close_neighbours(vectors, threshold): 216 | xsim = cosine_similarity(vectors) 217 | similar = [] 218 | for x in range(len(xsim[0])): 219 | for y in range(x, len(xsim[1])): 220 | if x != y: 221 | if xsim[x][y] > threshold: 222 | similar.append([x, y, xsim[x][y]]) 223 | return similar 224 | 225 | def make_text_clusters(vectors, edge_ratio=3, threshold=0): 226 | add_unconnected = False 227 | vecsindexed = vectors 228 | desired_edges = round(len(vecsindexed) * edge_ratio) 229 | 230 | trimmed_mapping = [] 231 | if len(vectors) < 20000: 232 | xsim = cosine_similarity(vecsindexed) 233 | if threshold == 0: 234 | sims = [] 235 | for x in range(len(xsim[0])-1): 236 | sims.extend(list(xsim[x][x+1:])) 237 | ind = np.argpartition(sims, desired_edges*-1)[-1] 238 | threshold = sims[ind] 239 | #print("Threshold: " + "%.4f"%threshold) 240 | 241 | for x in range(len(xsim[0])-1): 242 | row = np.array(xsim[x][x+1:]) 243 | aw = np.argwhere(row>=threshold) 244 | for item in aw: 245 | i = item[0] 246 | y = i+x+1 247 | sim = row[i] 248 | trimmed_mapping.append([x, y, sim]) 249 | else: 250 | mapping = get_quick_mapping(vectors) 251 | if threshold != 0: 252 | trimmed_mapping = mapping 253 | else: 254 | if len(mapping) > desired_edges: 255 | sims = 
sorted([s for x, y, s in mapping], reverse=True) 256 | threshold = sims[desired_edges] 257 | #print("Threshold: " + "%.4f"%threshold) 258 | trimmed_mapping = [[x, y, s] for x, y, s in mapping if s >= threshold] 259 | else: 260 | trimmed_mapping = mapping 261 | 262 | g=nx.Graph() 263 | g.add_weighted_edges_from(trimmed_mapping) 264 | communities = community.best_partition(g) 265 | 266 | clusters = {} 267 | for node, mod in communities.items(): 268 | if mod not in clusters: 269 | clusters[mod] = [] 270 | clusters[mod].append(node) 271 | return clusters, trimmed_mapping 272 | 273 | def get_sentiment(texts): 274 | sents = [] 275 | for text in texts: 276 | blob = TextBlob(text) 277 | for sentence in blob.sentences: 278 | sents.append(sentence.sentiment.polarity) 279 | return np.sum(sents) 280 | 281 | def get_cluster_relevance(texts, vectors, sns, ids): 282 | center = get_cluster_center(vectors) 283 | tweets = Counter() 284 | urls = Counter() 285 | indices = Counter() 286 | for index, text in enumerate(texts): 287 | sn = sns[index] 288 | id_str = ids[index] 289 | final = vectors[index] 290 | sim = cosine_similarity([center, final])[0][1] 291 | tweets["@" + sn + ": " + text.replace("\n", " ")] = sim 292 | urls["https://twitter.com/"+sn+"/status/"+id_str] = sim 293 | indices[index] = sim 294 | return indices, tweets, urls 295 | 296 | def get_label_text(texts, vectors): 297 | center = get_cluster_center(vectors) 298 | similarities = Counter() 299 | for index, text in enumerate(texts): 300 | final = vectors[index] 301 | sim = cosine_similarity([center, final])[0][1] 302 | similarities[text.replace("\n", " ").replace("\"", "").replace("\'", "")[:20]] = sim 303 | most_relevant = [x for x, c in similarities.most_common(1)][0] 304 | return most_relevant 305 | 306 | def get_cluster_center(vectors): 307 | center = np.sum(np.array(vectors), axis=0) 308 | return center 309 | 310 | def get_pagerank(vectors): 311 | xsim = cosine_similarity(vectors) 312 | trimmed_mapping = [] 313 | threshold = 0 314 | for x in range(len(xsim[0])-1): 315 | row = np.array(xsim[x][x+1:]) 316 | aw = np.argwhere(row>=threshold) 317 | for item in aw: 318 | i = item[0] 319 | y = i+x+1 320 | sim = row[i] 321 | trimmed_mapping.append([x, y, sim]) 322 | g=nx.Graph() 323 | g.add_weighted_edges_from(trimmed_mapping) 324 | scores = nx.pagerank(g) 325 | return scores 326 | 327 | def rank_sentences(scores, sentences): 328 | ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True) 329 | return ranked_sentences 330 | 331 | def tokenize2(text, sw): 332 | clean_tokens = [] 333 | raw_tokens = text.split() 334 | for r in raw_tokens: 335 | r = r.lower() 336 | tok = is_valid_token(r) 337 | if sw == True: 338 | if tok in stopwords: 339 | tok = None 340 | if tok is not None: 341 | clean_tokens.append(tok) 342 | return clean_tokens 343 | 344 | def get_word_frequencies(texts): 345 | wfreq = Counter() 346 | for text in texts: 347 | toks = tokenize2(text, False) 348 | for t in toks: 349 | t = t.lower() 350 | wfreq[t] += 1 351 | return wfreq 352 | 353 | def get_ngram_frequencies(texts, num_grams): 354 | prev = deque() 355 | gram_freq = Counter() 356 | for text in texts: 357 | toks = tokenize2(text, False) 358 | for t in toks: 359 | prev.append(t) 360 | if len(prev) > num_grams: 361 | prev.popleft() 362 | if len(prev) == num_grams: 363 | last = list(prev) 364 | gram = " ".join(last) 365 | gram_freq[gram] += 1 366 | return gram_freq 367 | 368 | def get_wft(wfreq, num=5): 369 | wft = "" 370 | count = 0 371 | for x, c in 
wfreq.most_common(): 372 |         if x not in stopwords and c > 1: 373 |             m = x + "(" + str(c) + ") " 374 |             wft += m 375 |             count += 1 376 |         if count >= num: 377 |             break 378 |     return wft 379 | 380 | def trim_vec_label(vec_label, threshold): 381 |     label_counts = Counter([j for i, j in vec_label]) 382 |     valid_labels = set([l for l, c in label_counts.most_common() if c > threshold]) 383 |     trimmed_vec_label = [[i, j] for i, j in vec_label if j in valid_labels] 384 |     return trimmed_vec_label 385 | 386 | def get_subject_verb_object_triples(texts): 387 |     summary_counts = Counter() 388 |     for text in texts: 389 |         doc = nlp(text) 390 |         for statement in textacy.extract.subject_verb_object_triples(doc): 391 |             subject, verb, fact = statement 392 |             summary = "(" + str(subject) + ", " + str(verb) + ", " + str(fact) + ")" 393 |             summary_counts[summary] += 1 394 |     return summary_counts 395 | 396 | def print_counter_summary(val_list): 397 |     msg = "[ " 398 |     valc = Counter(val_list) 399 |     for x, c in valc.most_common(5): 400 |         msg += x + " (" + str(c) + ") " 401 |     msg += "]" 402 |     return msg 403 | 404 | 405 | 406 | -------------------------------------------------------------------------------- /code/readme.md: -------------------------------------------------------------------------------- 1 | # Clustering of tweets based on textual content using meta embeddings and community detection 2 | 3 | This directory contains the code detailed in our blog post entitled "Identification And Categorization Of Toxic Twitter Posts Via Clustering" that can be found at: https://github.com/r0zetta/meta_embedding_clustering 4 | 5 | In order to replicate our experiments, you'll need to gather some data from Twitter. To gather input data in the correct format for the toolchain provided here, use the twitter_gatherer.py tool from https://github.com/r0zetta/twitter_gather 6 | 7 | If you run the twitter_gatherer.py tool in this directory, it should save data to the correct location for use in subsequent steps. 8 | 9 | The notebooks here require a few Python modules to be installed in order to work. Those modules are the following: 10 | 11 | - spacy https://spacy.io/ 12 | - textacy https://chartbeat-labs.github.io/textacy/index.html 13 | - numba http://numba.pydata.org/ 14 | - networkx https://networkx.github.io/ 15 | - Louvain for networkx https://github.com/taynaud/python-louvain 16 | - sklearn https://scikit-learn.org/stable/ 17 | - textblob https://textblob.readthedocs.io/en/dev/ 18 | - gensim https://radimrehurek.com/gensim/auto_examples/index.html 19 | - word2vec https://radimrehurek.com/gensim/models/word2vec.html 20 | - doc2vec https://radimrehurek.com/gensim/models/doc2vec.html 21 | - sentence-transformers https://github.com/UKPLab/sentence-transformers 22 | 23 | Once you have collected data, run each Jupyter notebook in numbered order. Follow the instructions provided in code comments within the notebooks. 24 | 25 | Enjoy! 26 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # A New, Novel Method For Clustering Tweets 2 | 3 | ## Foreword 4 | 5 | This research was conducted between 1st November 2019 and 22nd January 2020 by Alexandros Kornilakis (University of Crete, FORTH-ICS institute) and Andrew Patel (F-Secure Corporation) as part of EU Horizon 2020 projects PROTASIS and SHERPA, and F-Secure's Project Blackfin.
SHERPA is an EU-funded project which analyses how AI and big data analytics impact ethics and human rights. PROTASIS is a project that aims to expand the reach of systems security to the international community via joint research efforts. The PROTASIS project has received funding from the European Union’s Horizon 2020 research and innovation programme under the Marie Skłodowska-Curie grant agreement, No. 690972. Project Blackfin is a multi-year research effort aimed at investigating how to apply collective intelligence in the cyber security domain. 6 | 7 | ## Summary 8 | 9 | Due to the complex nature of human language, automated detection of negativity and toxicity in content posted on forums, comments sections, and social networks is a difficult task. We posit that an accurate method to cluster textual content is a necessary precursor to any system that may eventually be capable of detecting abusive content, especially on platforms that limit the length of messages that can be authored (such as Twitter). Clustering can be used to find similar phrases, such as those found in regular spam, reply-spam-based propaganda, and content artificially amplified by organized disinformation groups. It is also useful for identifying topics of conversation (*what people think about something or someone, what people are talking about*), and may also be used to measure sentiment around those topics (*how strongly people agree or disagree with something or someone*). As this article will illustrate, accurate clustering can also be used to identify other interesting phenomena, such as users who attempt to "hide" spam by slightly altering each of their tweets, groups of accounts spreading hatred towards specific demographics, and groups of accounts spreading disinformation, hoaxes, and fake news. 10 | 11 | In this article, we detail our own novel clustering methodology, based on meta embeddings and community detection, and the results of applying that methodology to a number of different datasets collected from Twitter, including replies to US politicians, tweets captured against hashtags pertaining to the 2019 UK general elections, and content gathered from UK far-right activists. We present several examples of the output of our clustering methodology, including analysis and interpretation of the results we obtained, and an interactive site for readers to explore. We also discuss some future directions for this line of research. To the best of our knowledge, our approach is the most sophisticated method to date for clustering tweets. 12 | 13 | ## Introduction 14 | 15 | Anyone who's read comments sections on news sites, looked at replies to social media posts authored by politicians, or read comments on YouTube will appreciate that there's a great deal of toxicity on the internet. Some female and minority high-profile Twitter users are the target of constant, serious harassment, including death threats (https://www.youtube.com/watch?v=A3MopLxgvLc) from both individuals and coordinated groups of users. Social media posts authored by politicians, journalists, and news organizations often receive large numbers of angry or downright toxic replies from people who don't support their statements or opinions. Some of these replies originate from fake accounts that have been created for the express purpose of trolling - the process of posting controversial comments designed to provoke emotional reactions and start fights.
Trolling is a highly efficient way to spread rumors and disinformation, alter public opinion, and disrupt otherwise meaningful conversation, and, as such, is a tool often used by organized groups of political activists, commercial troll farms, and nation state disinformation campaigns. 16 | 17 | On Twitter, troll accounts sometimes use a technique called reply-spamming to fish for engagement. This technique involves replying to a large number of high-profile accounts with the same or similar messages. This achieves two goals. The first is organic visibility - many people read replies to posts from politicians, and thus may read the post from the troll account. The second is social engineering – people get angry and reply to the troll’s posts, and occasionally the owner of the high-profile account may be tricked into engaging with the post themselves. Although high-profile accounts are rarely engaged by such tactics, it's not unheard of. 18 | 19 | The problem of analyzing and detecting abuse, toxicity, and hate speech in online social networks has been widely studied by the academic community. Recent studies made use of word embeddings to recognise and classify hate speech on Twitter (https://arxiv.org/pdf/1809.10644.pdf, https://arxiv.org/pdf/1906.03829.pdf), and Chakrabarty et al. have used LSTMs to visualize abusive content on Twitter by highlighting offensive use of language (https://arxiv.org/pdf/1809.08726.pdf). 20 | 21 | The challenges involved in detecting online abuse are discussed in a paper published by the Alan Turing Institute (https://www.turing.ac.uk/sites/default/files/2019-07/vidgen-alw2019.pdf). Furthermore, issues surrounding the detection of cyber-bullying and toxicity are discussed by Tsapatsoulis et al. (https://encase.socialcomputing.eu/wp-content/uploads/2019/05/NicolasTsapatsoulis.pdf). An approach for detecting bullying and aggression on Twitter is proposed by Chatzakou et al. (https://arxiv.org/pdf/1702.06877.pdf). Srivastava et al. have used capsule networks to identify toxic comments (https://www.aclweb.org/anthology/W18-4412.pdf). The challenges of classifying toxic comments are discussed further by van Aken et al. (https://arxiv.org/pdf/1809.07572.pdf). 22 | 23 | We note that methods involving the use of word embeddings have been previously used to cluster Twitter textual data (https://ieeexplore.ieee.org/document/7925400), and that community detection has been applied to text classification problems (https://arxiv.org/abs/1909.11706). However, we have not encountered literature referencing the combination of both. To the best of our knowledge, our approach is the most sophisticated method to date for clustering tweets. 24 | 25 | ### Successful reply-spam-based disinformation in the lead-up to the 2019 UK General Election 26 | 27 | Reply-spam was also used to successfully propagate disinformation during the run-up to the December 2019 UK general election. One such occasion involved a journalist attempting to show a picture of a child sleeping on the floor of an overcrowded hospital to Boris Johnson during a television interview. Instead of looking at the picture, Johnson pocketed the reporter's phone and attempted to change the subject of their conversation.
A clip of the interview went viral on social media, and shortly after, a large number of accounts published posts on various social networks, including Facebook and Twitter, each claiming to be an acquaintance of one of the senior nurses at the hospital, and claiming that the nurse could verify that the picture was faked (https://twitter.com/marcowenjones/status/1204183081009262592). 28 | 29 | ![](readme//media/image4.jpeg) 30 | 31 | *Above: some of the original reply spam tweets regarding the Leeds Hospital incident. Note how they are all replies to politicians and journalists.* 32 | 33 | ![](readme//media/image5.png) 34 | 35 | *Above: Tory activists on Twitter reinforced the original campaign with more copy-paste reply spam* 36 | 37 | ![](readme//media/image6.jpeg) 38 | 39 | *Above: this was shortly followed by a second campaign containing a different tweet that was also copy-pasted across social networks (by the same group of Tory activists)* 40 | 41 | Many of the accounts that posted this content on Twitter were created specifically for that purpose, and deleted shortly afterwards (https://twitter.com/r0zetta/status/1204519439640801280). The picture of the child sleeping on the floor of the hospital had appeared in a local newspaper a week prior to the interview with Johnson, and at that time, both the story and picture had been verified with personnel at the hospital. However, the fake social media posts were amplified to such a degree that voters, including those living in Leeds, believed that the picture had been faked. At least on Twitter, this disinformation was spread using reply-spam aimed at posts authored by politicians and journalists. 42 | 43 | During the run-up to the 2019 UK general election, posts on social networks alone were enough to propagate false information. Very few traditional "fake news" sites were uncovered, and it is unlikely that those that were found had any significant impact. Fake news sites are traditionally created in order to give legitimacy to fabricated, "clickbait" headlines. However, people are often inclined to share a headline without even visiting the original article. As such, fake news sites are rarely necessary. Nowadays, it is often enough to simply post something emotionally appealing on a social network, promote it enough to reach a handful of people, and then sit back and watch as it is organically disseminated by proxy. Once a rumor or lie has been spread in this manner, it enters the public’s consciousness, and can be difficult to later refute, even if the initial claim is debunked (https://twitter.com/r0zetta/status/1210499949064052737). 44 | 45 | ### Dealing with social media posts on a large scale 46 | 47 | Anyone who runs a prominent social media account is unlikely to be able to find relevant or interesting replies to content they've posted, because they must wade through hundreds or even thousands of replies, many of which are toxic. This essentially amounts to an informational denial of service for both the account owner and anyone with a genuine need to contact them. Well-established anti-spam systems exist to assist users with this problem for email, but no such systems exist for social networks. Since notification interfaces on most social networks don't scale well for highly engaged accounts, an automated filtering system would be a more than welcome feature.
48 | 49 | Detection of unwanted textual content such as email spam and hate speech is a much easier task than detecting nuances in language indicative of negativity or toxicity. Spam messages typically follow patterns that can be accurately separated with clustering techniques or even regular expressions. Hate speech often contains words that are rarely used outside of their context, and hence can be successfully detected with string matches and other relatively simple techniques. One might assume that sentiment analysis techniques could be used to find toxic content, but they are, unfortunately, still rather inaccurate on real-world data. They often fail to understand the fact that the context of a word can drastically alter its meaning (e.g. "You're a rotten crook" versus "You’ll beat that crook in the next election"). Although accurate sentiment analysis techniques may eventually be of use in this area, software designed to filter toxic comments may require more metadata (such as the subject matter, or topic of the message) in order to perform accurately, or to provide a better explanation as to why certain messages were filtered. 50 | 51 | ### Motivation for using a clustering / topic modelling approach 52 | 53 | In the context of our work, clustering (or topic modelling) is the process of grouping phrases or passages (or, in this case, tweets) into "buckets" based on their topic or subject matter. Clustering of textual content is useful for finding similar phrases, such as those found in regular spam (e.g. porn bots), reply-spam-based propaganda, and content artificially amplified by organized disinformation groups. It is also useful for identifying topics of conversation (*what people think about something or someone, what people are talking about*), and may also be used to measure sentiment around those topics (*how strongly people agree or disagree with something or someone*). As this article will illustrate, accurate clustering can also be used to identify other interesting phenomena, such as users who attempt to "hide" spam by slightly altering each of their tweets (something that is cumbersome to detect via regular expressions), groups of accounts spreading hatred towards specific demographics, and groups of accounts spreading disinformation, hoaxes, and fake news. Furthermore, the results of accurate clustering and topic modeling can be fed into downstream tasks such as: 54 | 55 | - systems designed to fact-check posts and comments 56 | 57 | - systems designed to detect and track rumors and the spread of disinformation, hoaxes, scams, and fake news 58 | 59 | - systems designed to identify the political stance of content published by one or more accounts or conversations 60 | 61 | - systems designed to quantify public opinion and assess the impact of social media on public opinion 62 | 63 | - trust analysis tasks (including those used to determine the quality of accounts on social networks) 64 | 65 | - the creation of disinformation knowledge bases and datasets 66 | 67 | - detection of bots or spam publishers 68 | 69 | To this end, we have attempted to build a system that is capable of clustering the type of written content typically encountered on social networks (or more specifically, on Twitter). Our experiments focus on tweets posted in reply to content authored by prominent US politicians and presidential candidates. 
70 | 71 | ## Experiments 72 | 73 | We started by collecting two datasets: 74 | 75 | ### Set 1: US Democrats 76 | 77 | The first set captured direct replies to tweets published by a number of highly engaged democrat-affiliated Twitter accounts - @JoeBiden, @SenSanders, @BernieSanders, @SenWarren, @ewarren, @PeteButtigieg, @MikeBloomberg, @amyklobuchar, @AndrewYang and @AOC - between Sun Dec 15 2019 and Mon Jan 13 2020. A total of 978,721 tweets were collected during this period. After preprocessing, a total of 719,617 tweets remained. 78 | 79 | ### Set 2: Donald Trump 80 | 81 | The second set captured direct replies to tweets published by @realDonaldTrump between Sun Dec 15 2019 and Wed Jan 08 2020. A total of 4,940,317 tweets were collected during this period. Due to the discrepancy between the sizes of the two collected datasets, we opted to utilize a portion of this set containing 1,022,824 tweets. After preprocessing, a total of 747,232 tweets remained. 82 | 83 | --- 84 | 85 | We developed our own clustering methodology for this research, which involved preprocessing of captured data, converting tweets into sentence vectors (using different techniques), combining those vectors into meta embeddings, and then creating node-edge graphs using similarities between calculated meta embeddings. Clusters were then derived by performing community detection on the resulting graphs. A detailed description of our methodology can be found in appendix 1 of this article. 86 | 87 | ### Experiment 1: US Democrats 88 | 89 | Our first experiment involved clustering of a subset of data in set 1 (US democrats). We clustered a batch of 34,003 tweets, resulting in 209 clusters. We created an interactive demo using results of this clustering experiment (https://twitter-clustering.web.app/). Note that this interactive demo will not display correctly on mobile browsers, so we encourage you to visit it from a desktop computer. Use the scroll wheel to zoom in and out of the visualization space, left-click and drag to move the nodes around, and click on nodes or communities themselves to see details. Details include names of accounts that were replied to the most in tweets assigned to that cluster, subject-verb-object triplets and overall sentiment extracted from those tweets, and the two most relevant tweets, loaded on the right of the screen, as examples. Different communities related to different topics (e.g. Community 2 contains clusters relevant to recent events in Iran). 90 | 91 | The image below is a static graph visualization of the discovered clusters. Labels were derived by matching commonly occurring words, and bigram combinations of those words, with ngrams and subject-verb-object triplets found in the tweets contained within each cluster. The code for doing this can be found in our github repo (https://github.com/r0zetta/meta_embedding_clustering). 92 | 93 | ![](readme//media/image41.png) 94 | 95 | We ran sentiment analysis on each cluster by taking the average sentiment calculated across all tweets contained in the cluster. Sentiment analysis was performed with TextBlob’s lexical sentiment analyzer. We then summarized negative and positive groups of clusters by counting words, ngrams, and which account was replied to. We also extracted subject-verb-object triplets from clusters using the textacy python module. 
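To make this summarization step more concrete, the sketch below shows one way the per-cluster sentiment averaging and subject-verb-object extraction could be wired together. It assumes TextBlob for lexical sentiment and textacy (with a spaCy English model) for triplet extraction; the `clusters` structure and helper names are hypothetical illustrations, not the project's actual code.

```python
# A sketch of per-cluster sentiment averaging and SVO extraction, assuming
# TextBlob and textacy (plus a spaCy English model). The `clusters` mapping
# of cluster id -> list of tweet texts is a hypothetical structure.
from collections import Counter
from textblob import TextBlob
import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")

def summarize_cluster(tweets):
    # Average lexical sentiment polarity over all tweets in the cluster
    polarity = sum(TextBlob(t).sentiment.polarity for t in tweets) / len(tweets)
    # Count subject-verb-object triples across the cluster
    # (the module path of this helper has moved between textacy releases)
    triples = Counter()
    for t in tweets:
        doc = nlp(t)
        for subj, verb, obj in extract.subject_verb_object_triples(doc):
            key = tuple(" ".join(tok.text for tok in part) for part in (subj, verb, obj))
            triples[key] += 1
    return polarity, triples.most_common(10)

clusters = {0: ["you will never be president", "we love you and have your back"]}
for label, tweets in clusters.items():
    polarity, top_triples = summarize_cluster(tweets)
    print(label, round(polarity, 3), top_triples)
```

As the examples later in this section show, a simple lexical polarity average of this kind is cheap to compute but inherits all of the weaknesses of the underlying sentiment library.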
96 | 97 | ![](readme//media/image43.png) 98 | 99 | Note how, in the above, sentiment analysis has incorrectly categorized a few statements such as “you will never be president” and "you're a moron" as positive. 100 | 101 | ![](readme//media/image44.png) 102 | 103 | As you can see in the above, negative clusters outnumbered positive clusters by a factor of two. 104 | 105 | ![](readme//media/image45.png) 106 | 107 | Above are clusters designated toxic by virtue of their average sentiment score. Negative posts are defined as those that express disagreement with the author; toxic posts are more strongly worded and tend to express hostility or downright hatred. 108 | 109 | ![](readme//media/image46.png) 110 | 111 | Above is a breakdown of replies by verdict for each candidate. Percentage-wise, @AndrewYang received by far the most positive replies, and @AOC and @SenWarren received the largest ratio of toxic replies. 112 | 113 | This simple analysis isn’t, unfortunately, all that accurate, due to deficiencies in the sentiment analysis library used. 114 | 115 | 116 | The following chart contains summaries of some of the larger clusters identified. Most of the larger clusters contained negative replies, including common themes such as: 117 | 118 | - you are an idiot/moron/liar/traitor (or similar) 119 | 120 | - you will never be president 121 | 122 | - Trump will win the next election 123 | 124 | Positive themes included: 125 | 126 | - We love you 127 | 128 | - You got this 129 | 130 | - You have my vote 131 | 132 | ![](readme//media/image9.png) 133 | 134 | 135 | Several clusters contained replies directed at just one account. They contained either replies to specific content posted by that account, or comments specifically directed at the politician’s history or personal life, including the following: 136 | 137 | - Comments about Joe Biden’s son 138 | 139 | - Replies to Pete Buttigieg correcting him on a tweet about Jesus being a refugee 140 | 141 | - Comments about Joe Biden’s involvement in the Ukraine 142 | 143 | - Comments about Pete Buttigieg’s net worth, and something about expensive wine 144 | 145 | - Highly positive replies to Andrew Yang’s posts 146 | 147 | ![](readme//media/image10.png) 148 | 149 | --- 150 | 151 | ### Noteworthy clusters 152 | 153 | ![](readme//media/image11.png) 154 | 155 | *Above: two discovered clusters – one containing toxic replies, and another containing praise* 156 | 157 | ![](readme//media/image13.png) 158 | 159 | *The above discovered cluster contains accounts propagating a hoax that the 2019 bushfires in Australia were caused by arsonists* 160 | 161 | ![](readme//media/image14.png) 162 | 163 | *Above is one of a few clusters containing replies only to Pete Buttigieg, where Twitter users state that Jesus wasn’t a refugee* 164 | 165 | ![](readme//media/image15.png) 166 | 167 | *The cluster shown above contains positive comments to democratic presidential candidates that were posted after a debate* 168 | 169 | Example output from this dataset can be found in our github repo (https://github.com/r0zetta/meta_embedding_clustering/blob/master/example_output/tweet_graph_analysis_dems.txt). 170 | 171 | --- 172 | 173 | ### Experiment 2: realDonaldTrump 174 | 175 | Our second experiment involved clustering of a subset of data in set 2 (@realDonaldTrump). We processed a batch of 30,044 tweets, resulting in 209 clusters. 
176 | 177 | The image below is a static graph visualization of the discovered clusters: 178 | 179 | ![](readme//media/image42.png) 180 | 181 | Using the same methodology as in our first experiment, we separated the clusters into positive, negative, and toxic, and then summarized them. Positive clusters included statements of thanks and wishes of a Merry Christmas and a Happy New Year, but also the incorrectly categorized phrase “you are a puppet”. A summarization of negative clusters didn’t reveal any obvious false positives, and included themes such as the recent impeachment hearings and comments on the amount of time the president has spent playing golf. Clusters deemed toxic contained, as expected, a lot of profanity. 182 | 183 | ![](readme//media/image47.png) 184 | 185 | Final values for this set were as follows: 186 | 187 | Positive tweets: 7,260 (24.16%); negative tweets: 16,364 (54.47%); toxic tweets: 6,420 (21.37%) 188 | 189 | Note how @realDonaldTrump received a great deal more toxic replies than any of the accounts studied in the previous dataset. Note also that tweets contained in negative and toxic clusters totalled roughly three times that of tweets in positive clusters. 190 | 191 | Here are some details from the largest identified clusters. They include the following negative themes: 192 | 193 | - You are an idiot/liar/disgrace/criminal/\#impotus 194 | 195 | - You are not our president 196 | 197 | - You have no idea / you know nothing 198 | 199 | - You should just shut up 200 | 201 | - You can’t stop lying 202 | 203 | - References to Vladimir Putin 204 | 205 | Here are some of the positive themes identified in these larger clusters: 206 | 207 | - God bless you, Mr. President 208 | 209 | - We love you 210 | 211 | - You are the best president 212 | 213 | ![](readme//media/image18.png) 214 | 215 | --- 216 | 217 | ### Noteworthy clusters 218 | 219 | ![](readme//media/image19.png) 220 | 221 | *Above and below are Christmas-themed clusters, but with quite different messages. The one above contains mostly season’s greetings, whilst the one below contains some questions to Trump about his plans for the holidays.* 222 | 223 | ![](readme//media/image20.png) 224 | 225 | *Below is a cluster that found a bunch of “pot calls the kettle black” phraseology. Note how it captures quite different phrases such as “name is pot and he says you’re black”, “kettle meet black”, “pot and kettle situation” and so on. It did fail on that one tweet that references blackface.* 226 | 227 | ![](readme//media/image21.png) 228 | 229 | *This next one (below) is interesting. It found tweets where people typed words or sentences with spaces between each letter.* 230 | 231 | ![](readme//media/image22.png) 232 | 233 | *Below is a cluster that identified “stfu” phraseology.* 234 | 235 | ![](readme//media/image23.png) 236 | 237 | Example output from this dataset (and the other datasets studied) can be found in our github repo (https://github.com/r0zetta/meta_embedding_clustering/tree/master/example_output). 238 | 239 | --- 240 | 241 | ### Content regarding the recent Iranian situation 242 | 243 | As mentioned in our methodology section (later in this article), the technique we’re using does sometimes identify multiple clusters containing similar subject matter. While looking through the clusters identified from replies to @realDonaldTrump, we found four clusters that all contained high percentages of tweets about a recent situation in Iran.
Upon inspection we realized that those clusters contained different takes on the same issue. 244 | 245 | 246 | *Below is a cluster that contains some tweets praising Trump’s actions in the region.* 247 | 248 | ![](readme//media/image24.png) 249 | 250 | 251 | *Below is a cluster that contains some tweets mentioning Iraq and related repercussions of actions against Iran.* 252 | 253 | ![](readme//media/image25.png) 254 | 255 | 256 | *Below is a cluster that contains mostly negative comments about Trump’s actions in the region.* 257 | 258 | ![](readme//media/image26.png) 259 | 260 | 261 | *And finally, the cluster below contains a great deal of toxic comments.* 262 | 263 | ![](readme//media/image27.png) 264 | 265 | --- 266 | 267 | ### Testing our methodology on different data 268 | 269 | We tested our topic modeling methodology further by running the same toolchain on a set of tweets collected during the run-up to the 2019 UK general election. These were tweets captured on hashtags relevant to that election (\#GE2019, \#generalelection2019, etc.). Our methodology turns out to be quite well-suited for finding spam. Here are a few examples: 270 | 271 | *The output below contains tweets posted by an app called “paper.li”, which is a legitimate online service that folks can use to craft their own custom newspaper. It turns out there were a great many paper.li links shared on top of the \#ge2019 hashtag. Unfortunately, this was one of four clusters identified that contained similar-looking paper.li tweets (which could be found more easily by filtering collected Twitter data by source field).* 272 | 273 | ![](readme//media/image28.png) 274 | 275 | 276 | *Below we can see some copy-paste disinformation, all shared by the same user. Note that this analysis was run over roughly 30,000 randomly selected tweets from a dataset with millions of entries. As such, we imagine we'd likely find more of the same from this user if we were to process a larger number of tweets.* 277 | 278 | ![](readme//media/image29.png) 279 | 280 | 281 | *Below we see some tweets advertising porn, on top of the \#ge2019 hashtag. Spam advertisers often piggyback their tweets on trending hashtags, and the ones we captured trended often during the run-up to the 2019 UK general election.* 282 | 283 | ![](readme//media/image30.png) 284 | 285 | 286 | *A cluster that identified a certain style of writing also identified tweets coming mostly from one account. Rather useful.* 287 | 288 | ![](readme//media/image31.png) 289 | 290 | 291 | *The cluster below picked up on similar phraseology. We're not sure what that conversation was about.* 292 | 293 | ![](readme//media/image32.png) 294 | 295 | 296 | *Finally, several clusters (shown below) contained a great many tweets including the word “antisemitism”. Many of the accounts in these clusters could be classified as trolls and/or fake disinformation accounts.* 297 | 298 | ![](readme//media/image33.png) 299 | 300 | *Note that we found similar clusters in data collected by following pro-Tory activist accounts and sockpuppets during the same time period (shown below):* 301 | 302 | ![](readme//media/image48.png) 303 | 304 | *Other clusters were discovered in tweets from the same Tory accounts, including a few that contained tweets designed to incite hatred towards specific demographics (see below).* 305 | 306 | ![](readme//media/image49.png) 307 | 308 | It’s worth noting that a portion of the accounts identified in our clustered data have been suspended since the data was originally collected.
This is a good indication that some of the users who post frequent replies to politicians and participate in harassment are either fake, or are performing activities that break Twitter’s terms of service. Any methodology that allows such accounts to be identified quickly and accurately is of value. 309 | 310 | --- 311 | 312 | ## Conclusions and future directions 313 | 314 | The methodology developed for our experiments yielded a mechanism for grouping tweets with similar content into reasonably accurate clusters. It did a very efficient job at identifying similar tweets, such as those posted by coordinated disinformation groups, by reply-spammers, and by services that post content on behalf of a user’s account (such as paper.li or share buttons on web sites). However, it still suffers from a tradeoff between accuracy and the creation of redundant clusters. Further work is needed to refine the parameters and logic of this methodology such that it is able to assign groups of relatively rare tweets into small clusters, while at the same time creating large clusters of similar content, where appropriate. 315 | 316 | In order to fully automate the detection of toxic content and online harassment, additional mechanisms must be researched and added to our toolchain. These include an automated method for creating rich, readable summaries of the contents of a cluster, more accurate sentiment or stance analysis of the contents of a cluster, and better methods for automatically assigning verdicts, labels, or categories to each cluster. 317 | 318 | Further research into whether the identified clusters may be used to classify new content is another area worth exploring (initial experiments into this line of research are documented in appendix 2 of this article). 319 | 320 | If these future goals can be completed successfully, a whole range of potential applications opens up, such as automated filtering or removal of toxic content, an automated method to assign quality scores to accounts based on how often they post toxic content or harass users, and the ability to track the propagation of toxic or trolling content on social networks (including, perhaps, behind-the-scenes identification of how such activity is coordinated). 321 | 322 | ## Appendix 1: Detailed methodology 323 | 324 | This section contains a detailed explanation of the methodology we employed to cluster tweets based on their textual content. Since this section is fairly dry and technical, we opted to leave it until the end of this article. Feel free to skip it unless you’re interested in replicating it for your own purposes, are involved in similar research, or are both curious and patient. 325 | 326 | All the code used to implement this can be found in our github repo (https://github.com/r0zetta/meta_embedding_clustering). 327 | 328 | ### 1\. Data collection, preprocessing, and vectorization 329 | 330 | Twitter data was collected using a custom Python script leveraging the Twarc module. The script utilized Twarc.filter(follow=*accounts\_to\_follow*) to follow a list of Twitter user\_ids, and only collect tweets that were direct replies to the accounts in the provided *accounts\_to\_follow* list. Collected data was abbreviated (a subset of all status and user fields was selected) and appended to a file on disk. Once sufficient data had been gathered, the collection was terminated, and subsequent analyses were performed on the collected data.
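For illustration, a collection script along these lines might look like the sketch below. It assumes the twarc 1.x client; the credentials, user IDs, output path, and the particular subset of fields kept are placeholders rather than the exact values used in the original script.

```python
# A minimal sketch of a twarc-based collector along the lines described
# above (twarc 1.x API). Credentials, user IDs, and paths are placeholders.
import json
from twarc import Twarc

accounts_to_follow = ["939091", "216776631"]  # Twitter user IDs (placeholders)

t = Twarc("CONSUMER_KEY", "CONSUMER_SECRET", "ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")

with open("data/raw.json", "a") as out:
    # filter(follow=...) takes a comma-separated string of user IDs
    for status in t.filter(follow=",".join(accounts_to_follow)):
        # Keep only direct replies to the followed accounts
        if status.get("in_reply_to_user_id_str") not in accounts_to_follow:
            continue
        # Abbreviate: keep a subset of status and user fields
        abbreviated = {
            "id_str": status["id_str"],
            "text": status.get("full_text", status.get("text", "")),
            "lang": status.get("lang"),
            "in_reply_to_screen_name": status.get("in_reply_to_screen_name"),
            "user": {"screen_name": status["user"]["screen_name"]},
        }
        out.write(json.dumps(abbreviated) + "\n")
```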
331 | 332 | Collected Twitter data was read from disk and preprocessed in order to form a dataset of relevant tweets. Tweet texts were stripped of URLs, @mentions, and leading and trailing whitespace, and then tokenized. If the tweet contained enough tokens, it was recorded, along with information about the account that published the tweet, the account that was replied to, and the tweet's status ID (in order to be able to recreate the original URL). Both the preprocessed tweet texts and tokens were saved during this process. 333 | 334 | Three different sentence vectors were then calculated from each saved tweet: 335 | 336 | 1. A word2vec model was trained on the tokenized tweets. Sentence vectors for each tweet were then calculated by summing the vector representations of each token in the tweet. 337 | 338 | 2. A doc2vec model was trained on the preprocessed tweet texts. Sentence vectors were then evaluated for each preprocessed tweet text. 339 | 340 | 3. BERT sentence vectors were calculated for each preprocessed tweet text using the model's encode function. Note that this can be a rather time-consuming process. 341 | 342 | Sentence meta embeddings were then calculated by summing the three sentence vectors calculated for each tweet. The resulting sentence meta embeddings were then saved in preparation for the next step. 343 | 344 | Traditional methods for clustering textual data (such as Latent Dirichlet Allocation) require text to be stemmed and/or lemmatized (the process of reducing inflected words to their word stem, base, or root form). This process can be cumbersome and inaccurate. Since embeddings capture relationships between similar words in an unsupervised manner, our approach does not require either stemming or lemmatization.
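To make the vectorization step concrete, here is a minimal sketch of how the three sentence vectors might be computed and summed into meta embeddings. It assumes gensim (word2vec/doc2vec) and the sentence-transformers package for BERT sentence vectors; the file names, model choice, and the shared 768-dimensional word2vec/doc2vec size (chosen so the three vectors can be summed element-wise) are illustrative assumptions rather than the original settings.

```python
# A minimal sketch of the meta embedding construction, assuming gensim 4.x
# and the sentence-transformers package. Hyperparameters, file names, and the
# shared 768 dimensions (so the three vectors can be summed) are assumptions.
import json
import numpy as np
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer

with open("preprocessed/texts.json") as f:   # hypothetical file names
    texts = json.load(f)
with open("preprocessed/tokens.json") as f:
    tokens = json.load(f)

dim = 768  # match the BERT sentence vector size so the vectors can be summed

# 1. word2vec: sum the vector representations of each token in the tweet
w2v = Word2Vec(sentences=tokens, vector_size=dim, min_count=1, workers=4)
w2v_vectors = np.array([
    np.sum([w2v.wv[t] for t in toks], axis=0) if toks else np.zeros(dim)
    for toks in tokens
])

# 2. doc2vec: train on the tokenized tweets, then infer one vector per tweet
tagged = [TaggedDocument(words=toks, tags=[i]) for i, toks in enumerate(tokens)]
d2v = Doc2Vec(tagged, vector_size=dim, min_count=1, workers=4)
d2v_vectors = np.array([d2v.infer_vector(toks) for toks in tokens])

# 3. BERT sentence vectors via the model's encode function (slow on CPU)
bert = SentenceTransformer("bert-base-nli-mean-tokens")
bert_vectors = np.asarray(bert.encode(texts, show_progress_bar=True))

# Meta embeddings: the element-wise sum of the three sentence vectors
meta_embeddings = w2v_vectors + d2v_vectors + bert_vectors
np.save("preprocessed/meta_embeddings.npy", meta_embeddings)
```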
345 | 346 | ### 2\. Sample clustering 347 | 348 | Our clustering methodology involves the following steps: 349 | 350 | 1. Calculate a cosine similarity matrix between vector representations of the sentence meta embeddings for a batch of samples. This process generates a matrix of similarity values between all possible pairs of vectors in the sample batch. 351 | 352 | 2. Calculate (or manually set) a threshold value at which we would draw an edge between two nodes in a graph. 353 | 354 | 3. Find all vector pairs that have a cosine similarity equal to or greater than the threshold value. Create a node-edge graph from these values, setting the edge weight equal to the cosine similarity between that pair of vectors. 355 | 356 | 4. Perform Louvain community detection on the resulting graph. This process labels each node based on the community it was assigned to. 357 | 358 | 5. Process the results of the clustering - for instance, extract common words, n-grams, and subject-verb-object triplets. 359 | 360 | 6. Perform manual inspection and statistical analysis of the resulting output. 361 | 362 | Here is a diagram of the above process: 363 | 364 | ![](readme//media/image40.png) 365 | 366 | It is possible to perform reasonably fast (less than 10 seconds) in-memory cosine similarity matrix calculations on small sets (\<20,000 samples) using, for instance, the sklearn.metrics.pairwise cosine\_similarity function. However, larger sets of vectors that don't fit into memory require a calculation loop that can take anywhere between minutes and hours to run. In order to process our large sample sets, we opted to perform processing in batches using the following logic: 367 | 368 | 1. Start with an array, *current\_batch*, populated with *batch\_size* (e.g. 10,000) randomly selected sample vectors from the full set of samples to be clustered. We used randomly sampled vectors during all of our experiments so as not to optimize clustering logic for a deterministic set of inputs. 369 | 370 | 2. Calculate an in-memory cosine similarity matrix between all vectors in *current\_batch*. 371 | 372 | 3. Calculate a threshold cosine similarity value that will select the top (*batch\_size* \* *edges\_per\_node*) vector pairs from *current\_batch*. 373 | 374 | 4. Iterate through the cosine similarity matrix values found for vectors in *current\_batch*, adding pairs of nodes to a list, *graph\_mapping*, in the form \[source, target, cosine\_similarity\] for each pair whose cosine similarity was equal to or greater than the threshold value calculated in the previous step. 375 | 376 | 5. Create a node-edge graph using the *graph\_mapping* list created in the previous step. Edge weights are set to the *cosine\_similarity* values obtained during that process. Run the Louvain community detection algorithm on the graph to obtain a list of nodes, labeled by community. This process will not utilize all of the vectors in *current\_batch*, so save a list of vectors that were not included in the resulting graph into a new list, *new\_batch*. 377 | 378 | 6. Iterate through the communities found in the previous step, selecting the list of vectors that were assigned to each community. 379 | 380 | 7. If the length of the list of vectors assigned to a community is less than the defined *minimum\_cluster\_size*, add those vectors to *new\_batch* and proceed to the next community. 381 | 382 | 8. If the length of the list of vectors assigned to a community is equal to or greater than the defined *minimum\_cluster\_size*, continue processing that cluster. 383 | 384 | 9. For each cluster that fits the *minimum\_cluster\_size* requirement, calculate a *cluster\_center* vector by summing all vectors in that cluster. Compare *cluster\_center* with the list of *cluster\_center* values found from previously recorded clusters. If the new cluster center's cosine similarity to a previously recorded cluster center exceeds the *merge\_similarity* value, assign its items to that previously recorded cluster. If not, create a new cluster, and assign the items to that. 385 | 386 | 10. Once all communities discovered in step 5 have been processed, add new samples from the pool to be processed to *new\_batch* until it reaches size *batch\_size*, assign it to *current\_batch*, and return to step 2. Once all samples from the pool have been exhausted, or the desired number of samples have been clustered, exit the loop. 387 | 388 | Here is a diagram of the above process (a condensed code sketch follows the failsafe note below): 389 | 390 | ![](readme//media/image39.png) 391 | 392 | ### Failsafe 393 | 394 | Occasionally, the loop runs without finding any communities that fulfill the *minimum\_cluster\_size* requirement. This, of course, causes the loop to run indefinitely. We added logic to detect this (we check whether the length of *new\_batch* is the same as *batch\_size* before proceeding to the next pass). Our fix was to forcefully remove the first 10% of the array and append that many new samples to the end before proceeding to the next pass.
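Below is a condensed sketch of the batching loop and failsafe described above, assuming the meta embeddings are available as a numpy array and that community detection is performed with the python-louvain package. Variable and parameter names mirror the text; the file name and the exact bookkeeping details are hypothetical.

```python
# A condensed sketch of the batched clustering loop (steps 2-10 plus the
# failsafe), assuming precomputed meta embeddings and the python-louvain
# package. Parameter values mirror the text; file names are hypothetical.
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import community as community_louvain  # python-louvain

batch_size = 10000
edges_per_node = 3
minimum_cluster_size = 50
merge_similarity = 0.98

cluster_centers = []  # one summed center vector per recorded cluster
cluster_members = []  # lists of sample indices, parallel to cluster_centers

def process_batch(current_batch, vectors):
    """Run one pass over a batch of sample indices; return unclustered leftovers."""
    sims = cosine_similarity(vectors[current_batch])                 # step 2
    iu, ju = np.triu_indices_from(sims, k=1)
    pair_sims = sims[iu, ju]
    num_edges = min(len(current_batch) * edges_per_node, len(pair_sims))
    if num_edges == 0:
        return list(current_batch)
    threshold = np.sort(pair_sims)[-num_edges]                       # step 3
    keep = pair_sims >= threshold
    graph = nx.Graph()
    for s, t, w in zip(iu[keep], ju[keep], pair_sims[keep]):         # steps 4-5
        graph.add_edge(current_batch[s], current_batch[t], weight=float(w))
    partition = community_louvain.best_partition(graph, weight="weight")
    communities = {}
    for node, label in partition.items():
        communities.setdefault(label, []).append(node)
    new_batch = [s for s in current_batch if s not in partition]
    for members in communities.values():                             # steps 6-9
        if len(members) < minimum_cluster_size:
            new_batch.extend(members)
            continue
        center = vectors[members].sum(axis=0)
        if cluster_centers:
            center_sims = cosine_similarity([center], cluster_centers)[0]
            best = int(np.argmax(center_sims))
            if center_sims[best] >= merge_similarity:
                cluster_members[best].extend(members)
                continue
        cluster_centers.append(center)
        cluster_members.append(list(members))
    return new_batch

vectors = np.load("preprocessed/meta_embeddings.npy")  # hypothetical file name
pool = [int(i) for i in np.random.permutation(len(vectors))]
current_batch = [pool.pop() for _ in range(min(batch_size, len(pool)))]
while current_batch:
    new_batch = process_batch(current_batch, vectors)
    if len(new_batch) == len(current_batch):  # failsafe: nothing was clustered
        new_batch = new_batch[max(1, len(new_batch) // 10):]
    while pool and len(new_batch) < batch_size:                      # step 10
        new_batch.append(pool.pop())
    current_batch = new_batch
```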
395 | 396 | ### Variable settings 397 | 398 | Different batch sizes result in quite different outcomes. If *batch\_size* is small, the selection of samples used to create each graph may not contain a wide enough variety of samples from the full set, and hence some samples will be missed. If *batch\_size* is large, more communities are discovered (and the calculations take longer, require more memory, etc.). We found that setting *batch\_size* to 10,000 was optimal in terms of accuracy, speed, and memory efficiency. 399 | 400 | The *edges\_per\_node* variable has a marked effect on the accuracy of the clustering process. When *edges\_per\_node* is set to a low value (1-3), fewer samples are selected from each batch during graph creation, and community detection often finds many very small (e.g. 2-item) communities. When *edges\_per\_node* is set to higher values (\>6), a smaller number of larger communities is detected; however, these communities can contain multiple topics (and hence are inaccurate). We found an *edges\_per\_node* value of 3 to be optimal for a *batch\_size* of 10,000. Increasing *batch\_size* often requires also increasing *edges\_per\_node* to achieve similar-looking results. 401 | 402 | The *minimum\_cluster\_size* variable affects the granularity of the final clustering output. If *minimum\_cluster\_size* is set to a low value, more clusters will be identified, but multiple, redundant clusters may be created (that all contain tweets with similar subject matter). If accuracy is not important, setting *minimum\_cluster\_size* to a higher value will result in fewer clusters and less redundancy, but may create clusters containing multiple topics (false positives), and may cause some topics to be lost. In datasets that contain a very wide range of different topics, a high *minimum\_cluster\_size* value (e.g. 50) may cause the process to not find any relevant communities at all. We found this variable to be very dataset-dependent. We tried values between 5 and 50, but ended up using a value of 50 for our experiments, mostly to allow for aesthetically pleasing visualizations to be created. 403 | 404 | The *merge\_similarity* variable has an effect on the output similar to that of the *edges\_per\_node* variable discussed earlier. This variable dictates the threshold at which newly identified clusters are merged with previously discovered ones. At lower values, this variable may cause multiple different topics to be merged into the same cluster. At high values, more redundant topic clusters are created. In our setup, we set *merge\_similarity* to 0.98. 405 | 406 | An example of a visualized graph (the one we generated using 30k tweets from set 1) looks like this: 407 | 408 | ![](readme//media/image36.png) 409 | 410 | Below are a few examples of how tweets assigned to identified clusters map onto the visualized graph: 411 | 412 | ![](readme//media/image37.png) 413 | 414 | ![](readme//media/image38.png) 415 | 416 | ## Appendix 2: Experiment: Using identified clusters for new tweet classification 417 | 418 | We experimented with the idea that identified clusters might be used to classify new tweets. In order to do this, we clustered approximately 25% of all tweets from each dataset and then attempted to classify the entire captured dataset using the following process (a sketch of this pass follows the list below): 419 | 420 | 1\. For each tweet in the dataset, calculate meta embeddings using the same models and methods that were used to generate the clusters. 421 | 422 | 2\. Run cosine similarity between the new tweet's meta embedding and all previously identified cluster centers, and find the best match (highest cosine similarity score). 423 | 424 | 3\. If the cosine similarity exceeds a threshold, label that tweet accordingly. If not, discard it. In this case, we used a value of 0.65 as a threshold.
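A minimal sketch of this classification pass, assuming the cluster centers and new meta embeddings are available as numpy arrays (file names are hypothetical), might look like this:

```python
# A minimal sketch of classifying new tweets against previously identified
# cluster centers. File names and array layouts are illustrative assumptions.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

threshold = 0.65

cluster_centers = np.load("clusters/cluster_centers.npy")     # hypothetical file
new_embeddings = np.load("preprocessed/meta_embeddings.npy")  # hypothetical file

labels = np.full(len(new_embeddings), -1)  # -1 means "discarded / no match"
# For very large datasets this comparison would be done in batches
sims = cosine_similarity(new_embeddings, cluster_centers)
best = sims.argmax(axis=1)
best_sims = sims[np.arange(len(sims)), best]
labels[best_sims >= threshold] = best[best_sims >= threshold]

matched = int((labels >= 0).sum())
print(f"Matched {matched}/{len(labels)} tweets ({100 * matched / len(labels):.2f}%)")
```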
425 | 426 | Set 1 (democrats): 427 | 428 | 184,851 tweets (approximately 25% of the full dataset) were clustered (using a *minimum\_cluster\_size* of 5) to obtain 3,376 clusters. The full set of 719,617 tweets was then converted into sentence meta embeddings and compared to the clusters found. This process matched 541,812 (75.29%) of the tweets. 429 | 430 | Set 2 (realDonaldTrump): 431 | 432 | 188,010 tweets (approximately 25% of the full dataset) were clustered (using a *minimum\_cluster\_size* of 5) to obtain 3,894 clusters. The full set of 747,232 tweets was then converted into sentence meta embeddings and compared to the clusters found. This process matched 623,120 (83.39%) of the tweets. 433 | 434 | By manually inspecting the resulting output (lists of tweet texts, grouped by cluster) we were able to determine that while some newly classified tweets matched the original cluster topics fairly well, others didn't. As such, identified cluster centers can’t reliably be used as a classifier to label new tweets from data captured with similar parameters. When using a threshold value higher than 0.65, far fewer tweets ended up being matched to existing clusters. One possible reason for the failure of this experiment is that some identified clusters contain only tweets with very high cosine similarity values to the cluster center (above 0.95), whilst others contain tweets with much lower similarities (even though the content of those tweets matches each other). As such, it might be that each cluster must have its own specific threshold value in order to match similar content. We didn't spend a great deal of time exploring this topic, but feel it may be worth researching in the future. Naturally, if this were figured out, cluster centers would likely only be valid for a short duration after they've been created, because the political and news landscape changes rapidly, and no techniques yet exist in this area that are able to create models that include a temporal context.
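If per-cluster thresholds turn out to be the missing piece, one simple way to derive them would be to base each cluster's threshold on the similarities of its own members to its center, for example by taking a low percentile rather than a fixed global value:

```python
# A sketch of deriving a per-cluster matching threshold from the similarities
# of a cluster's own members to its center (data structures are illustrative).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def per_cluster_threshold(member_vectors, center, percentile=10):
    sims = cosine_similarity(member_vectors, center.reshape(1, -1)).ravel()
    # Use a low percentile rather than the minimum to tolerate a few outliers
    return float(np.percentile(sims, percentile))
```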