├── .gitignore
├── 1_get_tweets.ipynb
├── 2_create_vis.ipynb
├── 3_bar_plot.R
├── LICENSE
├── README.md
├── data.csv
├── design
    ├── 1_bar.afdesign
    ├── 2_cluster_25.afdesign
    └── 3_cluster_100.afdesign
└── results
    ├── 1_hgm.jpg
    ├── 2_hgm_cluster_25.jpg
    └── 3_hgm_cluster_100.jpg


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/1_get_tweets.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## 1. Get Followers"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "# fixes some issues when running twint in Notebook\n",
 17 |     "import nest_asyncio\n",
 18 |     "nest_asyncio.apply()"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "import twint"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# Configure\n",
 37 |     "c = twint.Config()\n",
 38 |     "\n",
 39 |     "c.Username = 'hgmaassen'\n",
 40 |     "c.Proxy_host = 'tor'\n",
 41 |     "c.Store_csv = True\n",
 42 |     "c.Output = 'followers.csv'"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "twint.run.Followers(c)"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "## 2. Get Retweets of Followers"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Fetching the followers could have been done with the official API as well. Originally, I wanted to do everything with twint, but Twitter blocked the API for the requests."
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "from pathlib import Path\n",
 75 |     "import twitter\n",
 76 |     "import json"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "followers = Path('followers.csv').read_text().split()[1:]"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "api = twitter.Api(\n",
 95 |     "    '', '', '', '', sleep_on_rate_limit=True)"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "def get_tweets(api=None, screen_name=None):\n",
105 |     "    \"\"\"Return all tweets of screen_name reachable by paging backwards via max_id.\"\"\"\n",
106 |     "    timeline = api.GetUserTimeline(screen_name=screen_name, count=200)\n",
107 |     "    if not timeline:\n",
108 |     "        return []\n",
109 |     "    print(len(timeline))\n",
110 |     "    earliest_tweet = min(t.id for t in timeline)\n",
111 |     "    print(\"getting tweets before:\", earliest_tweet)\n",
112 |     "\n",
113 |     "    while True:\n",
114 |     "        # max_id is inclusive: subtract 1 so the boundary tweet is not fetched twice\n",
115 |     "        tweets = api.GetUserTimeline(\n",
116 |     "            screen_name=screen_name, max_id=earliest_tweet - 1, count=200\n",
117 |     "        )\n",
118 |     "        # check for an empty page before min(), which raises ValueError on []\n",
119 |     "        if not tweets:\n",
120 |     "            break\n",
121 |     "        earliest_tweet = min(t.id for t in tweets)\n",
122 |     "        print(\"getting tweets before:\", earliest_tweet)\n",
123 |     "        timeline += tweets\n",
124 |     "\n",
125 |     "    return timeline"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "for f in followers:\n",
135 |     "    if Path(f'/mnt/data/datasets/twitter/tweets-maassen/{f}.json').is_file():\n",
136 |     "        continue\n",
137 |     "    print(f)\n",
138 |     "    try:\n",
139 |     "        timeline = get_tweets(api=api, screen_name=f)\n",
140 |     "        with open(f'/mnt/data/datasets/twitter/tweets-maassen/{f}.json', 'w+') as f:\n",
141 |     "            for tweet in timeline:\n",
142 |     "                f.write(json.dumps(tweet._json))\n",
143 |     "                f.write('\\n')\n",
144 |     "    except Exception as e:\n",
145 |     "        print(e)"
146 |    ]
147 |   }
148 |  ],
149 |  "metadata": {
150 |   "kernelspec": {
151 |    "display_name": "Python 3",
152 |    "language": "python",
153 |    "name": "python3"
154 |   },
155 |   "language_info": {
156 |    "codemirror_mode": {
157 |     "name": "ipython",
158 |     "version": 3
159 |    },
160 |    "file_extension": ".py",
161 |    "mimetype": "text/x-python",
162 |    "name": "python",
163 |    "nbconvert_exporter": "python",
164 |    "pygments_lexer": "ipython3",
165 |    "version": "3.7.3"
166 |   }
167 |  },
168 |  "nbformat": 4,
169 |  "nbformat_minor": 4
170 | }
171 | 


--------------------------------------------------------------------------------
/2_create_vis.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## 1. load data and count"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import json\n",
 17 |     "from collections import Counter\n",
 18 |     "from itertools import combinations\n",
 19 |     "from pathlib import Path\n",
 20 |     "\n",
 21 |     "from tqdm import tqdm\n",
 22 |     "\n",
 23 |     "from joblib import Parallel, delayed"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "def read_data(f):\n",
 33 |     "    first_tweet_hgm = 1121716470856155136\n",
 34 |     "    tweets = Path(f).read_text().split('\\n')\n",
 35 |     "    tweets = [t for t in tweets if t != '']\n",
 36 |     "    if len(tweets) == 0:\n",
 37 |     "        return\n",
 38 |     "    tweets = [json.loads(t) for t in tweets]\n",
 39 |     "    rt = [t['retweeted_status']['user']['screen_name'].lower() for t in tweets if 'retweeted_status' in t and t['id'] > first_tweet_hgm]\n",
 40 |     "    return rt"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "rts = Parallel(n_jobs=8)(delayed(read_data)(f) for f in tqdm(list(Path('/mnt/data/datasets/twitter/tweets-maassen/').glob('*.json'))))"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "c = Counter()\n",
 59 |     "for names in tqdm(rts):\n",
 60 |     "    if names is None or 'hgmaassen' not in names:\n",
 61 |     "        continue\n",
 62 |     "    c.update(set(names))"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "len(c.keys())"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "c.most_common(21)"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "import pandas as pd"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {},
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "df = pd.DataFrame([{'name': x[0], 'value': x[1] / c['hgmaassen']} for x in c.most_common(101)[1:]])"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "df.to_csv('data.csv', )"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "metadata": {},
113 |    "source": [
114 |     "## 2. create vis with co-occurrence matrix, PPMI and PCA"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": [
123 |     "import math\n",
124 |     "\n",
125 |     "import matplotlib.pyplot as plt\n",
126 |     "import numpy as np\n",
127 |     "from scipy.sparse import csr_matrix, coo_matrix, dok_matrix\n",
128 |     "from scipy.sparse.linalg import svds\n",
129 |     "from sklearn.decomposition import PCA\n",
130 |     "from sklearn.preprocessing import MinMaxScaler"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "embd_n = 1001"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "idx2token = [x[0] for x in list(c.most_common(embd_n)) if x[0] != 'hgmaassen']\n",
149 |     "token2idx = {k: v for v, k in enumerate(idx2token)}\n",
150 |     "n = len(idx2token)"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "xs, ys, data = [], [], []\n",
160 |     "for x in tqdm(rts):\n",
161 |     "    if x is None or 'hgmaassen' not in x:\n",
162 |     "        continue\n",
163 |     "    s = set(x)\n",
164 |     "    s = [t for t in s if t in token2idx]\n",
165 |     "    for (c1, c2) in combinations(s, 2):\n",
166 |     "        c1 = token2idx[c1]\n",
167 |     "        c2 = token2idx[c2]\n",
168 |     "        xs.append(c1)\n",
169 |     "        xs.append(c2)\n",
170 |     "        ys.append(c2)\n",
171 |     "        ys.append(c1)\n",
172 |     "        data.append(1/len(s))\n",
173 |     "        data.append(1/len(s))"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "num_yes = 0\n",
183 |     "num_no = 0"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "for x in tqdm(rts):\n",
193 |     "    if x is None:\n",
194 |     "        continue\n",
195 |     "    if 'hgmaassen' in x:\n",
196 |     "        num_yes += 1\n",
197 |     "    else:\n",
198 |     "        num_no +=1"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "num_no"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "num_yes"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "num_no / len(rts)"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "m = coo_matrix((data, (xs, ys)), (n, n), dtype=np.float32)"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "m = m.tocsr()"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "def calc_pmi(counts, cds):\n",
253 |     "    \"\"\"\n",
254 |     "    Calculates e^PMI; PMI without the log().\n",
255 |     "    \"\"\"\n",
256 |     "\n",
257 |     "    sum_w = np.array(counts.sum(axis=1))[:, 0]\n",
258 |     "    sum_c = np.array(counts.sum(axis=0))[0, :]\n",
259 |     "    if cds != 1:\n",
260 |     "        sum_c = sum_c ** cds\n",
261 |     "    sum_total = sum_c.sum()\n",
262 |     "    sum_w = np.reciprocal(sum_w)\n",
263 |     "    sum_c = np.reciprocal(sum_c)\n",
264 |     "\n",
265 |     "    pmi = csr_matrix(counts)\n",
266 |     "    pmi = multiply_by_rows(pmi, sum_w)\n",
267 |     "    pmi = multiply_by_columns(pmi, sum_c)\n",
268 |     "    pmi = pmi * sum_total\n",
269 |     "    return pmi\n",
270 |     "\n",
271 |     "\n",
272 |     "def multiply_by_rows(matrix, row_coefs):\n",
273 |     "    normalizer = dok_matrix((len(row_coefs), len(row_coefs)))\n",
274 |     "    normalizer.setdiag(row_coefs)\n",
275 |     "    return normalizer.tocsr().dot(matrix)\n",
276 |     "\n",
277 |     "\n",
278 |     "def multiply_by_columns(matrix, col_coefs):\n",
279 |     "    normalizer = dok_matrix((len(col_coefs), len(col_coefs)))\n",
280 |     "    normalizer.setdiag(col_coefs)\n",
281 |     "    return matrix.dot(normalizer.tocsr())"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": null,
287 |    "metadata": {},
288 |    "outputs": [],
289 |    "source": [
290 |     "mm = calc_pmi(m, 0.75)"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": null,
296 |    "metadata": {},
297 |    "outputs": [],
298 |    "source": [
299 |     "res = MinMaxScaler().fit_transform(mm.todense())"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": null,
305 |    "metadata": {},
306 |    "outputs": [],
307 |    "source": [
308 |     "vis_n = 100"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": null,
314 |    "metadata": {},
315 |    "outputs": [],
316 |    "source": [
317 |     "res_vis = res[:vis_n, :]"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": null,
323 |    "metadata": {},
324 |    "outputs": [],
325 |    "source": [
326 |     "res_vis.shape"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "res_vis = PCA(n_components=2).fit_transform(res_vis)\n",
336 |     "res_vis = MinMaxScaler().fit_transform(res_vis)"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": null,
342 |    "metadata": {},
343 |    "outputs": [],
344 |    "source": [
345 |     "from adjustText import adjust_text"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "metadata": {},
352 |    "outputs": [],
353 |    "source": [
354 |     "from matplotlib import rcParams\n",
355 |     "rcParams['font.family'] = 'lato'"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "fig, ax = plt.subplots(figsize=(20, 20))\n",
365 |     "\n",
366 |     "fig.patch.set_visible(False)\n",
367 |     "ax.axis('off')\n",
368 |     "\n",
369 |     "sc = ax.scatter(res_vis[:, 0], res_vis[:, 1], color='black')\n",
370 |     "\n",
371 |     "texts = [plt.text(res_vis[i][0] + 0.015 * 0, res_vis[i][1] - 0.009 * 0, idx2token[i], weight='regular', size='14') for i in range(vis_n)]\n",
372 |     "adjust_text(texts, weight='regular', size='14')"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {},
379 |    "outputs": [],
380 |    "source": [
381 |     "fig.savefig(\"100.svg\")"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": null,
387 |    "metadata": {},
388 |    "outputs": [],
389 |    "source": []
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": null,
394 |    "metadata": {},
395 |    "outputs": [],
396 |    "source": []
397 |   }
398 |  ],
399 |  "metadata": {
400 |   "kernelspec": {
401 |    "display_name": "Python 3",
402 |    "language": "python",
403 |    "name": "python3"
404 |   },
405 |   "language_info": {
406 |    "codemirror_mode": {
407 |     "name": "ipython",
408 |     "version": 3
409 |    },
410 |    "file_extension": ".py",
411 |    "mimetype": "text/x-python",
412 |    "name": "python",
413 |    "nbconvert_exporter": "python",
414 |    "pygments_lexer": "ipython3",
415 |    "version": "3.7.3"
416 |   }
417 |  },
418 |  "nbformat": 4,
419 |  "nbformat_minor": 4
420 | }
421 | 


--------------------------------------------------------------------------------
/3_bar_plot.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | 
 3 | # Retweet shares per account; data.csv provides the columns `name` and `value`.
 4 | df <- read_csv('data.csv')
 5 | 
 6 | # Freeze the row order as factor levels. The order is reversed so that the
 7 | # first row ends up at the top once the axes are flipped.
 8 | df$name <- factor(df$name, levels = rev(df$name))
 9 | 
10 | # Plot only the first 25 rows.
11 | df <- head(df, 25)
12 | 
13 | # Basic barplot (geom_col() is the shorthand for geom_bar(stat="identity"))
14 | p <- ggplot(data=df, aes(x=name, y=value)) +
15 |   geom_col(fill='#2b8cbe') +
16 |   geom_text(aes(label=scales::percent(value)), vjust=0.6, hjust=1, color="white",
17 |             position = position_dodge(0.9), size=3) +
18 |   scale_y_continuous(labels = scales::percent) +
19 |   theme_classic() +
20 |   theme(axis.title=element_blank(), axis.line = element_blank(), axis.ticks=element_blank(), axis.text.x = element_blank(), axis.text.y=element_text(hjust = 0, color='white')) +
21 |   theme(text=element_text(family="Lato", face="bold", size=12)) +
22 |   labs(title='Twitter-Accounts,', subtitle="die Hans-Georg Maaßen retweeten, retweeten auch...", caption="© Creative Commons: CC BY Johannes Filter")
23 | 
24 | p
25 | 
26 | # Horizontal bar plot
27 | p_flipped <- p + coord_flip()
28 | p_flipped
29 | 
30 | # Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
31 | ggsave('bar.svg', plot = p_flipped)
32 | 
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Johannes Filter
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # hgmaassen-retweets
 2 | 
 3 | If people are retweeting [@hgmaassen](https://twitter.com/hgmaassen), who are they retweeting besides him? An analysis. [Read the article](https://netzpolitik.org/2019/datenanalyse-maassens-follower-retweeten-rechtsradikale-accounts-aber-fast-nie-die-cdu/) (in German).
 4 | 
 5 | <div align="center">
 6 |   <img src="results/2_hgm_cluster_25.jpg" alt="clusters of twitter accounts">
 7 | </div>
 8 | 
 9 | ## Background
10 | 
 11 | We construct an embedding for Twitter accounts to visualize clusters. We apply techniques normally used to construct [Word Embeddings](https://www.google.com/search?q=word+embeddings&source=lnms&tbm=isch). As far as we know, we are the first ones to use the method like this.
12 | 
13 | Method:
14 | 
15 | 1. iterate over all accounts and count [co-occurrences](https://en.wikipedia.org/wiki/Co-occurrence) (in the sense: who are they retweeting besides @hgmaassen as a binary choice, count them pair-wise in a 2D matrix)
16 | 2. [Pointwise Mutual Information](https://en.wikipedia.org/wiki/Pointwise_mutual_information) to normalize counts and construct a vector space
17 | 3. choose N accounts, i.e. the ones with the highest total count, and apply [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis) to project them onto a 2D plane for visualization
18 | 
 19 | This results in an image where points that are closer together represent accounts that are retweeted by similar users.
20 | 
21 | See [2_create_vis.ipynb](2_create_vis.ipynb) for more details.
22 | 
 23 | A reference if you want to dig deeper into the (NLP) topic: ["Improving Distributional Similarity with Lessons Learned from Word Embeddings"](https://aclweb.org/anthology/papers/Q/Q15/Q15-1016/) by Omer Levy, Yoav Goldberg, Ido Dagan, TACL 2015.
24 | 
 25 | I am not sure whether I should write/experiment more on the method. If you have an opinion on it, send me an [email](mailto:hi@jfilter.de).
26 | 
27 | ## License
28 | 
29 | MIT.
30 | 


--------------------------------------------------------------------------------
/data.csv:
--------------------------------------------------------------------------------
  1 | ,name,value
  2 | 0,rolandtichy,0.6771868787276342
  3 | 1,hartes_geld,0.6605367793240556
  4 | 2,reitschuster,0.6237574552683897
  5 | 3,lawyerberlin,0.6222664015904572
  6 | 4,steinhoefel,0.6205268389662028
  7 | 5,dushanwegner,0.5862326043737575
  8 | 6,_donalphonso,0.5837475149105368
  9 | 7,hallaschka_hh,0.580765407554672
 10 | 8,alice_weidel,0.5593936381709742
 11 | 9,shakriet,0.5484592445328031
 12 | 10,maltekaufmann,0.5320576540755467
 13 | 11,steinbacherika,0.5161530815109344
 14 | 12,louiecrit,0.5156560636182903
 15 | 13,tichyseinblick,0.49875745526838966
 16 | 14,iq_stimulator,0.4960238568588469
 17 | 15,alicologne,0.4840954274353877
 18 | 16,junge_freiheit,0.4788767395626243
 19 | 17,eddie_1412,0.46570576540755465
 20 | 18,drkissler,0.4639662027833002
 21 | 19,georg_pazderski,0.4609840954274354
 22 | 20,einzelfallinfos,0.4510437375745527
 23 | 21,sirschnee,0.44831013916500995
 24 | 22,beatrix_vstorch,0.4480616302186879
 25 | 23,joanacotar,0.44681908548707755
 26 | 24,philipplickert,0.4463220675944334
 27 | 25,arnd_diringer,0.4440854870775348
 28 | 26,drdavidberger,0.44259443339960236
 29 | 27,exgruene,0.4356361829025845
 30 | 28,netzdenunziant,0.43290258449304175
 31 | 29,maxotte_says,0.4316600397614314
 32 | 30,uwe_junge_mdl,0.42992047713717696
 33 | 31,kittypunk7,0.42768389662027834
 34 | 32,schweizok2,0.4224652087475149
 35 | 33,fern_schreiber,0.4207256461232604
 36 | 34,achgut_com,0.4172465208747515
 37 | 35,michael_leh,0.4157554671968191
 38 | 36,deutsch365,0.41451292246520877
 39 | 37,shlomosapiens,0.4120278330019881
 40 | 38,m_t_franz,0.4018389662027833
 41 | 39,drumheadberlin,0.3961232604373757
 42 | 40,dpolghh,0.39165009940357853
 43 | 41,afd,0.3894135188866799
 44 | 42,gtzfrmming,0.38817097415506957
 45 | 43,helllud123,0.3874254473161034
 46 | 44,afdimbundestag,0.3856858846918489
 47 | 45,krk979,0.38220675944334
 48 | 46,ibikus31,0.38096421471172964
 49 | 47,joerg_meuthen,0.3802186878727634
 50 | 48,gottfriedcurio,0.37997017892644136
 51 | 49,frank_pasemann,0.37847912524850896
 52 | 50,bimbas_world,0.37624254473161034
 53 | 51,andreaschlegel3,0.3754970178926441
 54 | 52,brudervom,0.3745029821073559
 55 | 53,ichbinkoelnerin,0.37326043737574555
 56 | 54,birgit_kelle,0.37326043737574555
 57 | 55,baerbeli04,0.3705268389662028
 58 | 56,haraldbecker80,0.36456262425447317
 59 | 57,vonschwer,0.36307157057654077
 60 | 58,norbertbolz,0.36083499005964215
 61 | 59,koeppelroger,0.36083499005964215
 62 | 60,kachelmann,0.35785288270377735
 63 | 61,pinkcrazypony,0.356610337972167
 64 | 62,henrykstoeckl,0.3551192842942346
 65 | 63,der_monk,0.3548707753479125
 66 | 64,drbrandner,0.3548707753479125
 67 | 65,sonjadelarosa7,0.3523856858846918
 68 | 66,realdonaldtrump,0.3523856858846918
 69 | 67,zhangdanhong,0.3476640159045726
 70 | 68,matteosalvinimi,0.3474155069582505
 71 | 69,renetruninger,0.34642147117296224
 72 | 70,achimspiegel,0.34592445328031807
 73 | 71,krahmax,0.34592445328031807
 74 | 72,gundel_gaukeley,0.34493041749502984
 75 | 73,spiro0815,0.3446819085487077
 76 | 74,ahmadmansour__,0.34343936381709744
 77 | 75,markus_krall,0.34319085487077533
 78 | 76,publizistikon,0.34169980119284293
 79 | 77,den_tyske,0.3379721669980119
 80 | 78,burger_ein,0.33474155069582506
 81 | 79,emrich_5933,0.33399602385685884
 82 | 80,der__patriot,0.33250497017892644
 83 | 81,petrbystronafd,0.3302683896620278
 84 | 82,mitschalexander,0.32852882703777336
 85 | 83,frreschke,0.327286282306163
 86 | 84,compactmagazin,0.32480119284294234
 87 | 85,hugomuellervogg,0.3245526838966203
 88 | 86,fackfellowat,0.3240556660039761
 89 | 87,neythomas,0.32380715705765406
 90 | 88,willnurschreibn,0.32007952286282304
 91 | 89,wokeup777,0.31858846918489064
 92 | 90,kokolores20,0.3133697813121272
 93 | 91,houelle_beck,0.3116302186878728
 94 | 92,haraldlaatsch,0.30989065606361826
 95 | 93,wasistzeitgeist,0.30790258449304175
 96 | 94,lars9596,0.3066600397614314
 97 | 95,stbrandner,0.3024353876739563
 98 | 96,marcfelixserrao,0.3019383697813121
 99 | 97,ungebeten1,0.3019383697813121
100 | 98,eysvog3l,0.30168986083499005
101 | 99,zentralrat1,0.30119284294234594
102 | 


--------------------------------------------------------------------------------
/design/1_bar.afdesign:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jfilter/hgmaassen-retweets/6cfa744edfa359db27f5e143f1e507f39a957a99/design/1_bar.afdesign


--------------------------------------------------------------------------------
/design/2_cluster_25.afdesign:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jfilter/hgmaassen-retweets/6cfa744edfa359db27f5e143f1e507f39a957a99/design/2_cluster_25.afdesign


--------------------------------------------------------------------------------
/design/3_cluster_100.afdesign:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jfilter/hgmaassen-retweets/6cfa744edfa359db27f5e143f1e507f39a957a99/design/3_cluster_100.afdesign


--------------------------------------------------------------------------------
/results/1_hgm.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jfilter/hgmaassen-retweets/6cfa744edfa359db27f5e143f1e507f39a957a99/results/1_hgm.jpg


--------------------------------------------------------------------------------
/results/2_hgm_cluster_25.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jfilter/hgmaassen-retweets/6cfa744edfa359db27f5e143f1e507f39a957a99/results/2_hgm_cluster_25.jpg


--------------------------------------------------------------------------------
/results/3_hgm_cluster_100.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jfilter/hgmaassen-retweets/6cfa744edfa359db27f5e143f1e507f39a957a99/results/3_hgm_cluster_100.jpg


--------------------------------------------------------------------------------