├── .gitignore ├── 1_get_tweets.ipynb ├── 2_create_vis.ipynb ├── 3_bar_plot.R ├── LICENSE ├── README.md ├── data.csv ├── design ├── 1_bar.afdesign ├── 2_cluster_25.afdesign └── 3_cluster_100.afdesign └── results ├── 1_hgm.jpg ├── 2_hgm_cluster_25.jpg └── 3_hgm_cluster_100.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # 
mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /1_get_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1. Get Followers" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# fixes some issues when running twint in Notebook\n", 17 | "import nest_asyncio\n", 18 | "nest_asyncio.apply()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import twint" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Configure\n", 37 | "c = twint.Config()\n", 38 | "\n", 39 | "c.Username = 'hgmaassen'\n", 40 | "c.Proxy_host = 'tor'\n", 41 | "c.Store_csv = True\n", 42 | "c.Output = 'followers.csv'" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "twint.run.Followers(c)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## 2. Get Retweets of Followers" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Fetching the followers could have been done with the official API as well. Originally, I wanted to do everything with twint, but Twitter blocked the API for the requests." 
66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from pathlib import Path\n", 75 | "import twitter\n", 76 | "import json" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "followers = Path('followers.csv').read_text().split()[1:]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "api = twitter.Api(\n", 95 | " '', '', '', '', sleep_on_rate_limit=True)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "def get_tweets(api=None, screen_name=None):\n", 105 | " timeline = api.GetUserTimeline(screen_name=screen_name, count=200)\n", 106 | " if len(timeline) == 0:\n", 107 | " return []\n", 108 | " print(len(timeline))\n", 109 | " earliest_tweet = min(timeline, key=lambda x: x.id).id\n", 110 | " print(\"getting tweets before:\", earliest_tweet)\n", 111 | "\n", 112 | " while True:\n", 113 | " tweets = api.GetUserTimeline(\n", 114 | " screen_name=screen_name, max_id=earliest_tweet, count=200\n", 115 | " )\n", 116 | " new_earliest = min(tweets, key=lambda x: x.id).id\n", 117 | "\n", 118 | " if not tweets or new_earliest == earliest_tweet:\n", 119 | " break\n", 120 | " else:\n", 121 | " earliest_tweet = new_earliest\n", 122 | " print(\"getting tweets before:\", earliest_tweet)\n", 123 | " timeline += tweets\n", 124 | "\n", 125 | " return timeline" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "for f in followers:\n", 135 | " if Path(f'/mnt/data/datasets/twitter/tweets-maassen/{f}.json').is_file():\n", 136 | " continue\n", 137 | " print(f)\n", 138 | " try:\n", 139 | " timeline = get_tweets(api=api, 
screen_name=f)\n", 140 | " with open(f'/mnt/data/datasets/twitter/tweets-maassen/{f}.json', 'w+') as f:\n", 141 | " for tweet in timeline:\n", 142 | " f.write(json.dumps(tweet._json))\n", 143 | " f.write('\\n')\n", 144 | " except Exception as e:\n", 145 | " print(e)" 146 | ] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.7.3" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /2_create_vis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1. 
load data and count" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import json\n", 17 | "from collections import Counter\n", 18 | "from itertools import combinations\n", 19 | "from pathlib import Path\n", 20 | "\n", 21 | "from tqdm import tqdm\n", 22 | "\n", 23 | "from joblib import Parallel, delayed" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "def read_data(f):\n", 33 | " first_tweet_hgm = 1121716470856155136\n", 34 | " tweets = Path(f).read_text().split('\\n')\n", 35 | " tweets = [t for t in tweets if t != '']\n", 36 | " if len(tweets) == 0:\n", 37 | " return\n", 38 | " tweets = [json.loads(t) for t in tweets]\n", 39 | " rt = [t['retweeted_status']['user']['screen_name'].lower() for t in tweets if 'retweeted_status' in t and t['id'] > first_tweet_hgm]\n", 40 | " return rt" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "rts = Parallel(n_jobs=8)(delayed(read_data)(f) for f in tqdm(list(Path('/mnt/data/datasets/twitter/tweets-maassen/').glob('*.json'))))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "c = Counter()\n", 59 | "for names in tqdm(rts):\n", 60 | " if names is None or 'hgmaassen' not in names:\n", 61 | " continue\n", 62 | " c.update(set(names))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "len(c.keys())" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "c.most_common(21)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | 
"source": [ 89 | "import pandas as pd" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "df = pd.DataFrame([{'name': x[0], 'value': x[1] / c['hgmaassen']} for x in c.most_common(101)[1:]])" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "df.to_csv('data.csv', )" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## 2. create vis with co-ocurrence matrix, PPMI and PCA" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import math\n", 124 | "\n", 125 | "import matplotlib.pyplot as plt\n", 126 | "import numpy as np\n", 127 | "from scipy.sparse import csr_matrix, coo_matrix, dok_matrix\n", 128 | "from scipy.sparse.linalg import svds\n", 129 | "from sklearn.decomposition import PCA\n", 130 | "from sklearn.preprocessing import MinMaxScaler" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "embd_n = 1001" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "idx2token = [x[0] for x in list(c.most_common(embd_n)) if x[0] != 'hgmaassen']\n", 149 | "token2idx = {k: v for v, k in enumerate(idx2token)}\n", 150 | "n = len(idx2token)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "xs, ys, data = [], [], []\n", 160 | "for x in tqdm(rts):\n", 161 | " if x is None or 'hgmaassen' not in x:\n", 162 | " continue\n", 163 | " s = set(x)\n", 164 | " s = [t for t in s if t in token2idx]\n", 165 | " for (c1, c2) in combinations(s, 2):\n", 166 | " c1 = 
token2idx[c1]\n", 167 | " c2 = token2idx[c2]\n", 168 | " xs.append(c1)\n", 169 | " xs.append(c2)\n", 170 | " ys.append(c2)\n", 171 | " ys.append(c1)\n", 172 | " data.append(1/len(s))\n", 173 | " data.append(1/len(s))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "num_yes = 0\n", 183 | "num_no = 0" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "for x in tqdm(rts):\n", 193 | " if x is None:\n", 194 | " continue\n", 195 | " if 'hgmaassen' in x:\n", 196 | " num_yes += 1\n", 197 | " else:\n", 198 | " num_no +=1" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "num_no" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "num_yes" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "num_no / len(rts)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "m = coo_matrix((data, (xs, ys)), (n, n), dtype=np.float32)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "m = m.tocsr()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "def calc_pmi(counts, cds):\n", 253 | " \"\"\"\n", 254 | " Calculates e^PMI; PMI without the log().\n", 255 | " \"\"\"\n", 256 | "\n", 257 | " sum_w = np.array(counts.sum(axis=1))[:, 0]\n", 258 | " sum_c = np.array(counts.sum(axis=0))[0, :]\n", 259 | " if cds 
!= 1:\n", 260 | " sum_c = sum_c ** cds\n", 261 | " sum_total = sum_c.sum()\n", 262 | " sum_w = np.reciprocal(sum_w)\n", 263 | " sum_c = np.reciprocal(sum_c)\n", 264 | "\n", 265 | " pmi = csr_matrix(counts)\n", 266 | " pmi = multiply_by_rows(pmi, sum_w)\n", 267 | " pmi = multiply_by_columns(pmi, sum_c)\n", 268 | " pmi = pmi * sum_total\n", 269 | " return pmi\n", 270 | "\n", 271 | "\n", 272 | "def multiply_by_rows(matrix, row_coefs):\n", 273 | " normalizer = dok_matrix((len(row_coefs), len(row_coefs)))\n", 274 | " normalizer.setdiag(row_coefs)\n", 275 | " return normalizer.tocsr().dot(matrix)\n", 276 | "\n", 277 | "\n", 278 | "def multiply_by_columns(matrix, col_coefs):\n", 279 | " normalizer = dok_matrix((len(col_coefs), len(col_coefs)))\n", 280 | " normalizer.setdiag(col_coefs)\n", 281 | " return matrix.dot(normalizer.tocsr())" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "mm = calc_pmi(m, 0.75)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "res = MinMaxScaler().fit_transform(mm.todense())" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "vis_n = 100" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "res_vis = res[:vis_n, :]" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "res_vis.shape" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "res_vis = PCA(n_components=2).fit_transform(res_vis)\n", 336 | "res_vis = MinMaxScaler().fit_transform(res_vis)" 337 | ] 338 
| }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "from adjustText import adjust_text" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "from matplotlib import rcParams\n", 355 | "rcParams['font.family'] = 'lato'" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "fig, ax = plt.subplots(figsize=(20, 20))\n", 365 | "\n", 366 | "fig.patch.set_visible(False)\n", 367 | "ax.axis('off')\n", 368 | "\n", 369 | "sc = ax.scatter(res_vis[:, 0], res_vis[:, 1], color='black')\n", 370 | "\n", 371 | "texts = [plt.text(res_vis[i][0] + 0.015 * 0, res_vis[i][1] - 0.009 * 0, idx2token[i], weight='regular', size='14') for i in range(vis_n)]\n", 372 | "adjust_text(texts, weight='regular', size='14')" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "fig.savefig(\"100.svg\")" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": "Python 3", 402 | "language": "python", 403 | "name": "python3" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.7.3" 416 | } 417 | }, 418 | "nbformat": 4, 419 | "nbformat_minor": 4 420 | } 421 | 
-------------------------------------------------------------------------------- /3_bar_plot.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | df <- read_csv('data.csv') 4 | 5 | df$name <- factor(df$name, levels = rev(df$name)) 6 | 7 | df <- head(df, 25) 8 | # Basic barplot 9 | 10 | p<-ggplot(data=df, aes(x=name, y=value)) + 11 | geom_bar(stat="identity", fill='#2b8cbe') + 12 | geom_text(aes(label=scales::percent(value)), vjust=0.6, hjust=1, color="white", 13 | position = position_dodge(0.9), size=3) + 14 | scale_y_continuous(labels = scales::percent) + 15 | theme_classic() + 16 | theme(axis.title=element_blank(), axis.line = element_blank(), axis.ticks=element_blank(), axis.text.x = element_blank(), axis.text.y=element_text(hjust = 0, color='white')) + 17 | theme(text=element_text(family="Lato", face="bold", size=12)) + 18 | labs(title='Twitter-Accounts,', subtitle="die Hans-Georg Maaßen retweeten, reweeten auch...", caption="© Creative Commons: CC BY Johannes Filter") 19 | 20 | p 21 | 22 | # Horizontal bar plot 23 | p + coord_flip() 24 | 25 | ggsave('bar.svg') 26 | 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Johannes Filter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hgmaassen-retweets 2 | 3 | If people are retweeting [@hgmaassen](https://twitter.com/hgmaassen), who are they retweeting besides him? An analysis. [Read the article](https://netzpolitik.org/2019/datenanalyse-maassens-follower-retweeten-rechtsradikale-accounts-aber-fast-nie-die-cdu/) (in German). 4 | 5 |