├── README.md
├── analysis.ipynb
├── multi_site_scraping.ipynb
├── socialbakers_scraping.ipynb
├── super_bowl.ipynb
├── tweeps_scraping_tweepy.ipynb
└── tweetsminig.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # TweetsClassification
2 | Exploring the Jaccard-similarity technique on tweets, then visualising its output using k-means and k-means with PCA.
3 | Additional material on time series analysis, web scraping and Twitter scraping.
4 |
5 | [Medium blog](https://medium.com/swlh/tweets-classification-and-clustering-in-python-b107be1ba7c7)
6 |
--------------------------------------------------------------------------------
/multi_site_scraping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# More Scraping\n",
8 | "\n",
9 | "## Introduction\n",
10 | "Scraping twitter handles of Kenyan governors and other generally popular people from [Soko Directory](https://sokodirectory.com/2019/03/here-are-the-official-twitter-handles-for-all-the-47-governors-in-kenya), [Travel Start](http://www.travelstart.co.ke/blog/the-50-best-kenyans-to-follow-on-twitter/) and [Royal Trendia](https://royaltrendia.com/top-twitter-profiles-kenya/) sites.\n",
11 | "\n",
12 | "## Table of Contents\n",
13 | "1. [Libraries](#Libraries)\n",
14 | "2. [Governors](#Governors)\n",
15 | "3. [Travel Start](#Travel-Start)\n",
16 | "4. [Royal Trendia](#Royal-Trendia)\n",
17 | "5. [Mega DataFrame](#Mega-DF)\n"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "### Libraries"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "from requests import get\n",
34 | "from requests.exceptions import RequestException\n",
35 | "from contextlib import closing\n",
36 | "from bs4 import BeautifulSoup\n",
37 | "import pandas as pd\n",
38 | "import os, sys\n",
39 | "\n",
40 | "import fire\n",
41 | "\n",
42 | "import sys\n",
43 | "import os\n",
44 | "import json\n",
45 | "import matplotlib.pyplot as plt\n",
46 | "import re\n",
47 | "import string\n",
48 | "\n",
49 | "import matplotlib.dates as mdates\n",
50 | "import seaborn as sns\n",
51 | "\n",
52 | "# to view all columns\n",
53 | "pd.set_option(\"display.max.columns\", None)\n",
54 | "\n",
55 | "import tweepy\n",
56 | "from tweepy.streaming import StreamListener\n",
57 | "from tweepy import OAuthHandler\n",
58 | "from tweepy import Stream\n",
59 | "from tweepy import API\n",
60 | "from tweepy import Cursor\n",
61 | "from datetime import datetime, date, time, timedelta\n",
62 | "from collections import Counter\n",
63 | "import sys\n",
64 | "\n",
65 | "\n",
66 | "import preprocessor as p"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 2,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "#%%writefile ../pyscrap_url.py\n",
76 | "\n",
77 | "def simple_get(url, timeout=30):\n",
78 | "    \"\"\"\n",
79 | "    Fetch `url` with an HTTP GET request.\n",
80 | "\n",
81 | "    Returns the response body (`resp.content`) when `is_good_response`\n",
82 | "    accepts the response, otherwise None. A RequestException is logged\n",
83 | "    with `log_error` and also yields None. `timeout` (seconds) is passed\n",
84 | "    to requests so that an unresponsive server cannot hang the notebook.\n",
85 | "    \"\"\"\n",
86 | "    try:\n",
87 | "        # closing() releases the streamed connection when the block exits.\n",
88 | "        with closing(get(url, stream=True, timeout=timeout)) as resp:\n",
89 | "            return resp.content if is_good_response(resp) else None\n",
90 | "    except RequestException as e:\n",
91 | "        log_error('Error during requests to {0} : {1}'.format(url, str(e)))\n",
92 | "        return None\n",
93 | "\n",
94 | "\n",
95 | "def is_good_response(resp):\n",
96 | "    \"\"\"\n",
97 | "    Return True when `resp` has status 200 and an HTML Content-Type header.\n",
98 | "    A response without a Content-Type header yields False, not a KeyError.\n",
99 | "    \"\"\"\n",
100 | "    # `or ''` covers a missing header; servers are not obliged to send one.\n",
101 | "    content_type = (resp.headers.get('Content-Type') or '').lower()\n",
102 | "    return resp.status_code == 200 and 'html' in content_type\n",
103 | "\n",
104 | "\n",
105 | "def log_error(e):\n",
106 | "    \"\"\"\n",
107 | "    Report an error raised while scraping.\n",
108 | "    The message `e` is simply written to standard output with print();\n",
109 | "    replace the body if errors should go somewhere else.\n",
110 | "    \"\"\"\n",
111 | "    print(e)\n",
112 | " \n",
113 | "def get_elements(url, tag='',search={}, fname=None):\n",
114 | "    \"\"\"\n",
115 | "    Collect content from a web page and return it as a list.\n",
116 | "    `url` is an address (str), fetched with `simple_get`, or already-loaded\n",
117 | "    markup, parsed directly. `tag` is a CSS selector: the text of each match\n",
118 | "    is split on newlines and every non-blank line is appended, stripped.\n",
119 | "    `search` may hold 'find' / 'find_all' keyword dicts for BeautifulSoup;\n",
120 | "    what they match is unpacked into the result with list.extend. `fname`\n",
121 | "    is unused. Returns an empty list when there is no page content.\n",
122 | "    \"\"\"\n",
123 | "    # Created first so every path, including a failed fetch, returns a list.\n",
124 | "    res = []\n",
125 | "    if isinstance(url,str):\n",
126 | "        response = simple_get(url)\n",
127 | "    else:\n",
128 | "        #if already it is a loaded html page\n",
129 | "        response = url\n",
130 | "    if response is None:\n",
131 | "        return res\n",
132 | "    html = BeautifulSoup(response, 'html.parser')\n",
133 | "    if tag:\n",
134 | "        for li in html.select(tag):\n",
135 | "            for name in li.text.split('\\n'):\n",
136 | "                name = name.strip()\n",
137 | "                # Strip before testing so whitespace-only lines are dropped.\n",
138 | "                if name:\n",
139 | "                    res.append(name)\n",
140 | "\n",
141 | "    if search:\n",
142 | "        soup = html\n",
143 | "        r = ''\n",
144 | "        if 'find' in search.keys():\n",
145 | "            print('findaing',search['find'])\n",
146 | "            soup = soup.find(**search['find'])\n",
147 | "            r = soup\n",
148 | "        # find() gives None when nothing matches; find_all is skipped then.\n",
149 | "        if 'find_all' in search.keys() and soup is not None:\n",
150 | "            print('findaing all of',search['find_all'])\n",
151 | "            r = soup.find_all(**search['find_all'])\n",
152 | "        if r:\n",
153 | "            for x in list(r):\n",
154 | "                if len(x) > 0:\n",
155 | "                    res.extend(x)\n",
156 | "    return res\n",
157 | " \n",
158 | "def get_tag_elements(url, tag='h2'):\n",
159 | "    \"\"\"\n",
160 | "    Download `url` and return, as a list, the distinct non-blank text lines of\n",
161 | "    the elements matching the CSS selector `tag` (set-based, so unordered).\n",
162 | "    Raises Exception when `simple_get` returns no content.\n",
163 | "    \"\"\"\n",
164 | "    response = simple_get(url)\n",
165 | "    if response is None:\n",
166 | "        # Raise an exception if we failed to get any data from the url\n",
167 | "        raise Exception('Error retrieving contents at {}'.format(url))\n",
168 | "    html = BeautifulSoup(response, 'html.parser')\n",
169 | "    names = set()\n",
170 | "    for li in html.select(tag):\n",
171 | "        for name in li.text.split('\\n'):\n",
172 | "            name = name.strip()\n",
173 | "            # Strip before testing so a whitespace-only line is not kept as ''.\n",
174 | "            if name:\n",
175 | "                names.add(name)\n",
176 | "    return list(names)\n",
177 | " \n",
178 | "# Start the CLI only when run as a script; a notebook defines get_ipython.\n",
179 | "if __name__ == '__main__' and 'get_ipython' not in globals():\n",
180 | "    fire.Fire(get_tag_elements)"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "### Governors"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 18,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "data": {
197 | "text/plain": [
198 | "['Ali Hassan Joho (Mombasa County) – @HassanAliJohoSalim Mvurya (Kwale County) – @GovernorMvuryaAmason Kingi (Kilifi County) – @governorkingiDhadho Godhana (Tana River County) – @EGodhanaFahim Twaha (Lamu County) – @fahimyasintwahaGranton Samboja (Taita Taveta County) – @SambojagrantonBunow Korane (Garissa County) – @HonAliKoraneMohamed Abdi Mohamud (Wajir County) – @GovernorWajirAli Ibrahim Roba (Mandera County) – @aliirobaMohamud Ali (Marsabit County) – @GovMohamudAliMohammed Kuti (Isiolo County) – @GovernorkutiKiraitu Murungi (Meru County) – @GovernorKiraituOnesimus Njuki (Tharaka Nithi County) – @MuthomiNjukiMartin Wambora (Embu County) – @WamboraCharity Ngilu (Kitui County) – @mamangiluAlfred Mutua (Machakos County) – @DrAlfredMutuaKivutha Kibwana (Makueni County) – @governorkibwanaFrancis Kimemia (Nyandarua County) – @GovFKimemiaMutahi Kahiga (Nyeri County) – @GovernorKahigaAnne Waiguru (Kirinyaga County) – @AnneWaiguruMwangi wa Iria (Murang’a County) – @MwangiWaIriaFerdinand Waititu (Kiambu County) – @GovernorBabayaoJosphat Nanok (Turkana County) – @GovernorNanokJohn Lonyangapuo (West Pokot County) – @JohnlonyangapuoMoses Kasaine (Samburu County) – @HELenolkulalPatrick Khaemba (Trans Nzoia County) – @GovP_KhaembaJackson Mandago (Uasin Gishu) – @GvnMandagoAlex Tolgos (Elgeyo-Marakwet County) – @Governor_TolgosStephen Sang (Nandi County) – @Araap_SangStanley Kiptis (Baringo County) – @GovernorKiptisNdiritu Muriithi (Laikipia County) – @NdirituMuriithiLee Kinyanjui (Nakuru County) – @GovLeeKinyanjuiSamuel Tunai (Narok County) – @SamuelTunaiJoseph Ole Lenku (Kajiado County) – @joelenkuPaul Chepkwony (Kericho County) – @GovChepkwonyJoyce Laboso (Bomet County) – @LabosoJoyceWycliffe Oparanya (Kakamega County) – @GovWOparanyaWilber Otichilo (Vihiga County) – @GovernorVihigaWycliffe Wangamati (Bungoma County) – @GovWWangamatiSospeter Ojaamong (Busia County) – @GovOjaamongCornel Rasanga (Siaya County) – @Rasanga_CornelAnyang’ Nyong’o (Kisumu County) – 
@AnyangNyongoCyprian Awiti (Homa Bay County) @GovernorAwitiZacharia Obado (Migori County) – @GovOkothObadoJames Ongwae (Kisii County) – @JamesOngwaeJohn Nyagarama (Nyamira County) – @JOHNNYAGARAMA1Mike Sonko (Nairobi) – @MikeSonko']"
199 | ]
200 | },
201 | "execution_count": 18,
202 | "metadata": {},
203 | "output_type": "execute_result"
204 | }
205 | ],
206 | "source": [
207 | "#governors ('ol li' gives one string per list item; plain 'ol' ran them together)\n",
208 | "url= 'https://sokodirectory.com/2019/03/here-are-the-official-twitter-handles-for-all-the-47-governors-in-kenya/'\n",
209 | "url = simple_get(url)\n",
210 | "gs = get_elements(url, tag = 'ol li', search = {})\n",
211 | "gs"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 24,
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "data": {
221 | "text/plain": [
222 | "['@HassanAliJohoSalim',\n",
223 | " '@GovernorMvuryaAmason',\n",
224 | " '@governorkingiDhadho',\n",
225 | " '@EGodhanaFahim',\n",
226 | " '@fahimyasintwahaGranton',\n",
227 | " '@SambojagrantonBunow',\n",
228 | " '@HonAliKoraneMohamed',\n",
229 | " '@GovernorWajirAli',\n",
230 | " '@aliirobaMohamud',\n",
231 | " '@GovMohamudAliMohammed',\n",
232 | " '@GovernorkutiKiraitu',\n",
233 | " '@GovernorKiraituOnesimus',\n",
234 | " '@MuthomiNjukiMartin',\n",
235 | " '@WamboraCharity',\n",
236 | " '@mamangiluAlfred',\n",
237 | " '@DrAlfredMutuaKivutha',\n",
238 | " '@governorkibwanaFrancis',\n",
239 | " '@GovFKimemiaMutahi',\n",
240 | " '@GovernorKahigaAnne',\n",
241 | " '@AnneWaiguruMwangi',\n",
242 | " '@MwangiWaIriaFerdinand',\n",
243 | " '@GovernorBabayaoJosphat',\n",
244 | " '@GovernorNanokJohn',\n",
245 | " '@JohnlonyangapuoMoses',\n",
246 | " '@HELenolkulalPatrick',\n",
247 | " '@GovP_KhaembaJackson',\n",
248 | " '@GvnMandagoAlex',\n",
249 | " '@Governor_TolgosStephen',\n",
250 | " '@Araap_SangStanley',\n",
251 | " '@GovernorKiptisNdiritu',\n",
252 | " '@NdirituMuriithiLee',\n",
253 | " '@GovLeeKinyanjuiSamuel',\n",
254 | " '@SamuelTunaiJoseph',\n",
255 | " '@joelenkuPaul',\n",
256 | " '@GovChepkwonyJoyce',\n",
257 | " '@LabosoJoyceWycliffe',\n",
258 | " '@GovWOparanyaWilber',\n",
259 | " '@GovernorVihigaWycliffe',\n",
260 | " '@GovWWangamatiSospeter',\n",
261 | " '@GovOjaamongCornel',\n",
262 | " '@Rasanga_CornelAnyang’',\n",
263 | " '@AnyangNyongoCyprian',\n",
264 | " '@GovernorAwitiZacharia',\n",
265 | " '@GovOkothObadoJames',\n",
266 | " '@JamesOngwaeJohn',\n",
267 | " '@JOHNNYAGARAMA1Mike',\n",
268 | " '@MikeSonko']"
269 | ]
270 | },
271 | "execution_count": 24,
272 | "metadata": {},
273 | "output_type": "execute_result"
274 | }
275 | ],
276 | "source": [
277 | "# '@'-prefixed tokens, normalised to '@handle' (trailing punctuation dropped).\n",
278 | "# A comprehension also avoids rebinding the imported `string` module.\n",
279 | "governors = [\n",
280 | "    '@' + name for element in gs\n",
281 | "    for name in re.findall(r'(?:^| )@+([A-Za-z0-9_]+)', element)\n",
282 | "]\n",
283 | "governors"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 25,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "data": {
293 | "text/html": [
294 | "
\n",
295 | "\n",
308 | "
\n",
309 | " \n",
310 | " \n",
311 | " | \n",
312 | " handles | \n",
313 | "
\n",
314 | " \n",
315 | " \n",
316 | " \n",
317 | " 0 | \n",
318 | " @HassanAliJohoSalim | \n",
319 | "
\n",
320 | " \n",
321 | " 1 | \n",
322 | " @GovernorMvuryaAmason | \n",
323 | "
\n",
324 | " \n",
325 | " 2 | \n",
326 | " @governorkingiDhadho | \n",
327 | "
\n",
328 | " \n",
329 | " 3 | \n",
330 | " @EGodhanaFahim | \n",
331 | "
\n",
332 | " \n",
333 | " 4 | \n",
334 | " @fahimyasintwahaGranton | \n",
335 | "
\n",
336 | " \n",
337 | "
\n",
338 | "
"
339 | ],
340 | "text/plain": [
341 | " handles\n",
342 | "0 @HassanAliJohoSalim\n",
343 | "1 @GovernorMvuryaAmason\n",
344 | "2 @governorkingiDhadho\n",
345 | "3 @EGodhanaFahim\n",
346 | "4 @fahimyasintwahaGranton"
347 | ]
348 | },
349 | "execution_count": 25,
350 | "metadata": {},
351 | "output_type": "execute_result"
352 | }
353 | ],
354 | "source": [
355 | "governors_df = pd.DataFrame(data={'handles': governors})  # one column of handles\n",
356 | "governors_df.to_csv(path_or_buf='governors.csv')  # the index column is written too\n",
357 | "governors_df.head(n=5)"
358 | ]
359 | },
360 | {
361 | "cell_type": "markdown",
362 | "metadata": {},
363 | "source": [
364 | "### Travel Start"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 28,
370 | "metadata": {},
371 | "outputs": [
372 | {
373 | "data": {
374 | "text/plain": [
375 | "['NTV Kenya @ntvkenya',\n",
376 | " 'Uhuru Kenyatta @UKenyatta',\n",
377 | " 'Julius Kanyi @kanyicool',\n",
378 | " 'Jeff Koinange MBS @KoinangeJeff',\n",
379 | " 'Lempuris @majani_',\n",
380 | " 'Bob Collymore @bobcollymore',\n",
381 | " 'Dr Willy M Mutunga @WMutunga',\n",
382 | " 'Larry Madowo @LarryMadowo',\n",
383 | " 'Aly-Khan Satchu @alykhansatchu',\n",
384 | " 'Erik Hersman @whiteafrican',\n",
385 | " 'John-Allan Namu @johnallannamu',\n",
386 | " 'Yvonne Okwara @@YvonneOkwara',\n",
387 | " 'Robert Alai @RobertAlai',\n",
388 | " 'Dennis Itumbi @OleItumbi',\n",
389 | " 'Dan Ndambuki aka Churchill @MwalimChurchill',\n",
390 | " 'Carol Radull @CarolRadull',\n",
391 | " 'Gina Din @gina_din',\n",
392 | " 'Peter Nduati @PeterNduati',\n",
393 | " 'Bankelele @bankelele',\n",
394 | " 'Chris Kirubi @CKirubi',\n",
395 | " 'Oliver Mathenge @OliverMathenge',\n",
396 | " 'Mark Kaigwa @MKaigwa',\n",
397 | " 'Charles Onyango-Obbo @cobbo3',\n",
398 | " 'Patricia Kihoro @Misskihoro',\n",
399 | " 'Julie Gichuru @juliegichuru',\n",
400 | " 'Caroline Mutoko @CarolineMutoko',\n",
401 | " 'EKODYDDA @ekodydda01',\n",
402 | " 'Juliani @JulianiKenya',\n",
403 | " 'Maina Kageni @ItsMainaKageni',\n",
404 | " 'RAMpunzZyl @RamzZy_',\n",
405 | " 'The Trend Setter @xtiandela',\n",
406 | " 'NyaKundiH @C_NyaKundiH',\n",
407 | " 'CN @crazynairobian',\n",
408 | " 'Masaku @masaku_',\n",
409 | " 'Brian Mbunde @Brianmbunde',\n",
410 | " 'Comedy Central @ComedyCentralKE',\n",
411 | " 'SokoAnalyst @SokoAnalyst',\n",
412 | " 'JOEmeson @JoeWMuchiri',\n",
413 | " 'Naomi Mutua @AKenyanGirl',\n",
414 | " 'Deejay Eynie @Evabulence',\n",
415 | " 'Pat Nanyaro @PatNanyaro',\n",
416 | " 'Boniface Mwangi @bonifacemwangi',\n",
417 | " 'Ng’endo Machua @Ngendo87',\n",
418 | " 'Nancie Mwai @nanciemwai',\n",
419 | " 'Mutua Matheka @truthslinger',\n",
420 | " 'Comrade Mbasu @itsbuddhablaze',\n",
421 | " 'Sunny Bindra @sunnysunwords',\n",
422 | " 'Laura Walubengo @lwalubengo',\n",
423 | " 'Anne Kiguta @AnneKiguta',\n",
424 | " 'Teputy Kavanaa @DavidBurudi',\n",
425 | " 'mbithi',\n",
426 | " '',\n",
427 | " 'February 21, 2014',\n",
428 | " '@blogs_kenya?',\n",
429 | " 'Reply',\n",
430 | " 'Tuska',\n",
431 | " '',\n",
432 | " 'February 21, 2014',\n",
433 | " 'What a horrid list most of those are just people in the media. Try again!!!',\n",
434 | " 'Reply',\n",
435 | " 'Ogetembe',\n",
436 | " '',\n",
437 | " 'February 21, 2014',\n",
438 | " 'how can @vinnieo be miising from that list',\n",
439 | " 'sema rigging',\n",
440 | " 'Reply',\n",
441 | " 'Stephen Machua',\n",
442 | " '',\n",
443 | " 'February 21, 2014',\n",
444 | " 'I would like to know the parameters used to attain this list. I wouldn’t mind making it there next time.',\n",
445 | " 'Reply',\n",
446 | " 'Lyagamula',\n",
447 | " '',\n",
448 | " 'February 23, 2014',\n",
449 | " 'Up comimg Make up artist and young enterprenure CEO of Lush Beauty Michelle Njoroge @mslushmakeup',\n",
450 | " 'Reply',\n",
451 | " 'Kiki',\n",
452 | " '',\n",
453 | " 'April 5, 2014',\n",
454 | " 'I came across a kenyan newbie @PeterQuotlant and that guy already has 33k followers! who is he?',\n",
455 | " 'Reply']"
456 | ]
457 | },
458 | "execution_count": 28,
459 | "metadata": {},
460 | "output_type": "execute_result"
461 | }
462 | ],
463 | "source": [
464 | "#random kenyans: download the Travel Start page and read the text of its <ol> lists\n",
465 | "travelstart_page = 'http://www.travelstart.co.ke/blog/the-50-best-kenyans-to-follow-on-twitter/'\n",
466 | "url = simple_get(travelstart_page)\n",
467 | "ks = get_elements(url, 'ol', {})\n",
468 | "ks"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": 29,
474 | "metadata": {},
475 | "outputs": [
476 | {
477 | "data": {
478 | "text/plain": [
479 | "['@ntvkenya',\n",
480 | " '@UKenyatta',\n",
481 | " '@kanyicool',\n",
482 | " '@KoinangeJeff',\n",
483 | " '@majani_',\n",
484 | " '@bobcollymore',\n",
485 | " '@WMutunga',\n",
486 | " '@LarryMadowo',\n",
487 | " '@alykhansatchu',\n",
488 | " '@whiteafrican',\n",
489 | " '@johnallannamu',\n",
490 | " '@@YvonneOkwara',\n",
491 | " '@RobertAlai',\n",
492 | " '@OleItumbi',\n",
493 | " '@MwalimChurchill',\n",
494 | " '@CarolRadull',\n",
495 | " '@gina_din',\n",
496 | " '@PeterNduati',\n",
497 | " '@bankelele',\n",
498 | " '@CKirubi',\n",
499 | " '@OliverMathenge',\n",
500 | " '@MKaigwa',\n",
501 | " '@cobbo3',\n",
502 | " '@Misskihoro',\n",
503 | " '@juliegichuru',\n",
504 | " '@CarolineMutoko',\n",
505 | " '@ekodydda01',\n",
506 | " '@JulianiKenya',\n",
507 | " '@ItsMainaKageni',\n",
508 | " '@RamzZy_',\n",
509 | " '@xtiandela',\n",
510 | " '@C_NyaKundiH',\n",
511 | " '@crazynairobian',\n",
512 | " '@masaku_',\n",
513 | " '@Brianmbunde',\n",
514 | " '@ComedyCentralKE',\n",
515 | " '@SokoAnalyst',\n",
516 | " '@JoeWMuchiri',\n",
517 | " '@AKenyanGirl',\n",
518 | " '@Evabulence',\n",
519 | " '@PatNanyaro',\n",
520 | " '@bonifacemwangi',\n",
521 | " '@Ngendo87',\n",
522 | " '@nanciemwai',\n",
523 | " '@truthslinger',\n",
524 | " '@itsbuddhablaze',\n",
525 | " '@sunnysunwords',\n",
526 | " '@lwalubengo',\n",
527 | " '@AnneKiguta',\n",
528 | " '@DavidBurudi',\n",
529 | " '@blogs_kenya?',\n",
530 | " '@vinnieo',\n",
531 | " '@mslushmakeup',\n",
532 | " '@PeterQuotlant']"
533 | ]
534 | },
535 | "execution_count": 29,
536 | "metadata": {},
537 | "output_type": "execute_result"
538 | }
539 | ],
540 | "source": [
541 | "# '@'-prefixed tokens, normalised to '@handle': '@@x' and '@x?' both become '@x'.\n",
542 | "# A comprehension also avoids rebinding the imported `string` module.\n",
543 | "kenyans = [\n",
544 | "    '@' + name for element in ks\n",
545 | "    for name in re.findall(r'(?:^| )@+([A-Za-z0-9_]+)', element)\n",
546 | "]\n",
547 | "kenyans"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 30,
553 | "metadata": {},
554 | "outputs": [
555 | {
556 | "data": {
557 | "text/html": [
558 | "\n",
559 | "\n",
572 | "
\n",
573 | " \n",
574 | " \n",
575 | " | \n",
576 | " handles | \n",
577 | "
\n",
578 | " \n",
579 | " \n",
580 | " \n",
581 | " 0 | \n",
582 | " @ntvkenya | \n",
583 | "
\n",
584 | " \n",
585 | " 1 | \n",
586 | " @UKenyatta | \n",
587 | "
\n",
588 | " \n",
589 | " 2 | \n",
590 | " @kanyicool | \n",
591 | "
\n",
592 | " \n",
593 | " 3 | \n",
594 | " @KoinangeJeff | \n",
595 | "
\n",
596 | " \n",
597 | " 4 | \n",
598 | " @majani_ | \n",
599 | "
\n",
600 | " \n",
601 | "
\n",
602 | "
"
603 | ],
604 | "text/plain": [
605 | " handles\n",
606 | "0 @ntvkenya\n",
607 | "1 @UKenyatta\n",
608 | "2 @kanyicool\n",
609 | "3 @KoinangeJeff\n",
610 | "4 @majani_"
611 | ]
612 | },
613 | "execution_count": 30,
614 | "metadata": {},
615 | "output_type": "execute_result"
616 | }
617 | ],
618 | "source": [
619 | "k_df = pd.DataFrame(data={'handles': kenyans})  # one column of handles\n",
620 | "k_df.to_csv(path_or_buf='k_df.csv')  # the index column is written too\n",
621 | "k_df.head(n=5)"
622 | ]
623 | },
624 | {
625 | "cell_type": "markdown",
626 | "metadata": {},
627 | "source": [
628 | "### Royal Trendia"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 32,
634 | "metadata": {},
635 | "outputs": [
636 | {
637 | "data": {
638 | "text/plain": [
639 | "['Home',\n",
640 | " 'About Us',\n",
641 | " 'Services',\n",
642 | " 'Blog',\n",
643 | " 'Contact Us',\n",
644 | " 'Home',\n",
645 | " 'About Us',\n",
646 | " 'Services',\n",
647 | " 'Blog',\n",
648 | " 'Contact Us',\n",
649 | " 'Get a quote',\n",
650 | " 'Social Media',\n",
651 | " 'Kenyans on Twitter',\n",
652 | " 'Top 100 Most Followed Users on Twitter in Kenya',\n",
653 | " 'Top Twitter users in Kenya',\n",
654 | " 'Twitter Kenya',\n",
655 | " 'Who has the most followers on twitter in Kenya?',\n",
656 | " 'Twitter',\n",
657 | " 'Uhuru Kenyatta (@UKenyatta)',\n",
658 | " 'NTV Kenya (@ntvkenya)',\n",
659 | " 'Citizen TV Kenya (@citizentvkenya)',\n",
660 | " 'William Samoei Ruto (@WilliamsRuto)',\n",
661 | " 'ktn (@KTNKenya)',\n",
662 | " 'Raila Odinga (@RailaOdinga)',\n",
663 | " 'Daily Nation (@dailynation)',\n",
664 | " 'Larry Madowo, first of his name (@LarryMadowo)',\n",
665 | " 'Cyprian, Is Nyakundi (@C_NyaKundiH)',\n",
666 | " 'Jeff Koinange, MBS (@KoinangeJeff)',\n",
667 | " 'Capital FM Kenya (@CapitalFMKenya)',\n",
668 | " 'Thee Trend Setter ™ (@xtiandela)',\n",
669 | " 'Bob Collymore (@bobcollymore)',\n",
670 | " 'Safaricom Limited (@SafaricomLtd)',\n",
671 | " 'Julie Gichuru (@JulieGichuru)',\n",
672 | " 'Robert ALAI (Nyakwar Nyokwongi) (@RobertAlai)',\n",
673 | " 'Mutahi Ngunyi (@MutahiNgunyi)',\n",
674 | " 'Boniface Mwangi (@bonifacemwangi)',\n",
675 | " 'K24 TV (@K24Tv)',\n",
676 | " 'Ma3Route (@Ma3Route)',\n",
677 | " 'Kenya Red Cross (@KenyaRedCross)',\n",
678 | " 'The Standard Digital (@StandardKenya)',\n",
679 | " 'Victor Mochere (@VictorMochere)',\n",
680 | " 'Chris Kirubi (@CKirubi)',\n",
681 | " 'Dr Willy M Mutunga (@WMutunga)',\n",
682 | " 'Dr. Evans Kidero (@KideroEvans)',\n",
683 | " 'Safaricom Care (@Safaricom_Care)',\n",
684 | " 'Martha Karua (@MarthaKarua)',\n",
685 | " 'UN Environment (@UNEP)',\n",
686 | " 'John-Allan Namu (@johnallannamu)',\n",
687 | " 'The Star, Kenya (@TheStarKenya)',\n",
688 | " 'Kenya Power (@KenyaPower_Care)',\n",
689 | " 'Dennis Itumbi (@OleItumbi)',\n",
690 | " 'Kenya Airways (@KenyaAirways)',\n",
691 | " 'Peter Kenneth (@Peter_Kenneth)',\n",
692 | " 'State House Kenya (@StateHouseKenya)',\n",
693 | " 'Maina Kageni (@ItsMainaKageni)',\n",
694 | " 'BusinessDaily Africa (@BD_Africa)',\n",
695 | " 'SAUTI SOL (@sautisol)',\n",
696 | " 'AMB:Amina Mohamed (@AMB_A_Mohammed)',\n",
697 | " 'Anne Kiguta (@AnneKiguta)',\n",
698 | " 'Aly-Khan Satchu (@alykhansatchu)',\n",
699 | " 'Carol Radull (@CarolRadull)',\n",
700 | " 'Dennis Onsarigo (@Donsarigo)',\n",
701 | " 'Linus Kaikai (@LinusKaikai)',\n",
702 | " 'Terryanne Chebet (@TerryanneChebet)',\n",
703 | " 'I keep it real (@lKeepltReal)',\n",
704 | " 'Willis Raburu (@WillisRaburu)',\n",
705 | " 'Lindah Oguttu (@lindahoguttu)',\n",
706 | " 'IEBC (@IEBCKenya)',\n",
707 | " 'Mark Masai (@MarkMasai)',\n",
708 | " 'James Smart (@jamessmat)',\n",
709 | " 'Senator Sakaja (@SakajaJohnson)',\n",
710 | " 'Mike Sonko (@MikeSonko)',\n",
711 | " 'KTN News (@KTNNews)',\n",
712 | " 'Smriti Vidyarthi (@SmritiVidyarthi)',\n",
713 | " 'Victor Wanyama (@VictorWanyama)',\n",
714 | " 'Book Lovers (@BookLoversgate)',\n",
715 | " 'iHub (@iHub)',\n",
716 | " 'David Rudisha MBS (@rudishadavid)',\n",
717 | " 'Lord Mutai (@ItsMutai)',\n",
718 | " 'Ory Okolloh Mwangi (@kenyanpundit)',\n",
719 | " '#TeamShaffie (@ShaffieWeru)',\n",
720 | " 'Nimrod Taabu (@nimrodtaabu)',\n",
721 | " 'President of Kenya (@PresidentKE)',\n",
722 | " 'Lulu Hassan (@LuluHassan)',\n",
723 | " 'Joe Ageyo (@jageyo)',\n",
724 | " 'Gado Cartoons (@iGaddo)',\n",
725 | " 'InteriorCNG Ministry (@InteriorKE)',\n",
726 | " 'Churchill (@MwalimChurchill)',\n",
727 | " 'Jamila Mohamed (@JamilaMohamed)',\n",
728 | " 'issamichuzi (@issamichuzi)',\n",
729 | " 'Airtel Kenya (@AIRTEL_KE)',\n",
730 | " 'DStv Kenya (@DStv_Kenya)',\n",
731 | " 'Apostle M. M. Wangui (@ApostleMMWangui)',\n",
732 | " 'Francis Gachuri (@Fchurii)',\n",
733 | " 'Mugo Kibati (@mugokibati)',\n",
734 | " 'U.S. Embassy Nairobi (@USEmbassyKenya)',\n",
735 | " 'Najib Balala (@tunajibu)',\n",
736 | " 'Jimmi Gathu (@JimmiGathu)',\n",
737 | " 'KCB Group (@KCBGroup)',\n",
738 | " 'Nation FM (@NationFMKe)',\n",
739 | " 'IG (Rtd) Dr. Kimaiyo (@IGkimaiyo)',\n",
740 | " 'Mpasho News (@MpashoNews)',\n",
741 | " 'Siraj Maryan (@sirajmaryan)',\n",
742 | " 'Martin Oduor-Otieno (@OduorMartinO)',\n",
743 | " 'Linus Gitahi (@LinusGitahi)',\n",
744 | " 'Kenya Tourism Board (@MagicalKenya)',\n",
745 | " 'Ghafla Kenya (@GhaflaKenya)',\n",
746 | " 'Moseax™ 202k 🇰🇪 (@Moseax)',\n",
747 | " 'Kirigo Ng’arua 😉 (@KirigoNgarua)',\n",
748 | " 'HBR (@HomeboyzRadio)',\n",
749 | " 'Ben Kitili (@Ben_Kitili)',\n",
750 | " 'Bitange Ndemo (@bantigito)',\n",
751 | " 'Malik Obama (@ObamaMalik)',\n",
752 | " 'John Githongo (@johngithongo)',\n",
753 | " 'Jimnah Mbaru (@JimnahMbaru)',\n",
754 | " 'Africa Review (@africareview)',\n",
755 | " 'The EastAfrican (@The_EastAfrican)',\n",
756 | " 'Edith kimani (@Edith_kimani)',\n",
757 | " '.ud0a5095dafd12ce8af2fe63af3a157d5 { padding:0px; margin: 0; padding-top:1em!important; padding-bottom:1em!important; width:100%; display: block; font-weight:bold; background-color:#8E44AD; border:0!important; border-left:4px solid inherit!important; box-shadow: 0 1px 2px rgba(0, 0, 0, 0.17); -moz-box-shadow: 0 1px 2px rgba(0, 0, 0, 0.17); -o-box-shadow: 0 1px 2px rgba(0, 0, 0, 0.17); -webkit-box-shadow: 0 1px 2px rgba(0, 0, 0, 0.17); text-decoration:none; } .ud0a5095dafd12ce8af2fe63af3a157d5:active, .ud0a5095dafd12ce8af2fe63af3a157d5:hover { opacity: 1; transition: opacity 250ms; webkit-transition: opacity 250ms; text-decoration:none; } .ud0a5095dafd12ce8af2fe63af3a157d5 { transition: background-color 250ms; webkit-transition: background-color 250ms; opacity: 1; transition: opacity 250ms; webkit-transition: opacity 250ms; } .ud0a5095dafd12ce8af2fe63af3a157d5 .ctaText { font-weight:bold; color:#eaeaea; text-decoration:none; font-size: 16px; } .ud0a5095dafd12ce8af2fe63af3a157d5 .postTitle { color:#FFFFFF; text-decoration: underline!important; font-size: 16px; } .ud0a5095dafd12ce8af2fe63af3a157d5:hover .postTitle { text-decoration: underline!important; } READ\\xa0 Ways To Encourage Engagement With Your Instagram Audience in 2020',\n",
758 | " 'Daniel Maithya',\n",
759 | " 'Prev',\n",
760 | " 'Next',\n",
761 | " 'Cancel Reply',\n",
762 | " 'RoyalTrendia',\n",
763 | " 'Tweets by RoyalTrendia',\n",
764 | " 'Mizizi Africa homes launch music unit to lift up talented musicians',\n",
765 | " 'July 13, 2020',\n",
766 | " 'Daniel Maithya',\n",
767 | " 'Telkom and Loon announce progressive development of Loon technology to customers from July',\n",
768 | " 'July 8, 2020',\n",
769 | " 'Daniel Maithya',\n",
770 | " 'The Green Zone Estate ready for occupation says Krishna Group CEO',\n",
771 | " 'July 7, 2020',\n",
772 | " 'Daniel Maithya',\n",
773 | " 'Cisco Webex Meetings adds more capacity, security and Intelligent Insights',\n",
774 | " 'June 19, 2020',\n",
775 | " 'Daniel Maithya',\n",
776 | " 'Cisco Radically Simplifies Security for Today’s Accelerated IT Agenda',\n",
777 | " 'June 17, 2020',\n",
778 | " 'Daniel Maithya',\n",
779 | " '',\n",
780 | " 'Follow @RoyalTrendia',\n",
781 | " 'RoyalTrendia',\n",
782 | " 'Monitoring Ranking',\n",
783 | " 'Search Engine Marketing',\n",
784 | " 'Search Engine Optimization',\n",
785 | " 'Code Optimization',\n",
786 | " 'Mizizi Africa homes launch music unit to lift up talented musicians',\n",
787 | " 'July 13, 2020',\n",
788 | " 'Telkom and Loon announce progressive development of Loon technology to customers from July',\n",
789 | " 'July 8, 2020',\n",
790 | " 'The Green Zone Estate ready for occupation says Krishna Group CEO',\n",
791 | " 'July 7, 2020',\n",
792 | " 'Cisco Webex Meetings adds more capacity, security and Intelligent Insights',\n",
793 | " 'June 19, 2020',\n",
794 | " '×',\n",
795 | " 'Chat']"
796 | ]
797 | },
798 | "execution_count": 32,
799 | "metadata": {},
800 | "output_type": "execute_result"
801 | }
802 | ],
803 | "source": [
804 | "#more Kenyans: download the Royal Trendia page and read the text of its links\n",
805 | "royaltrendia_page = 'https://royaltrendia.com/top-twitter-profiles-kenya/'\n",
806 | "url = simple_get(royaltrendia_page)\n",
807 | "k = get_elements(url, 'a', {})\n",
808 | "k"
809 | ]
810 | },
811 | {
812 | "cell_type": "code",
813 | "execution_count": 36,
814 | "metadata": {},
815 | "outputs": [
816 | {
817 | "data": {
818 | "text/plain": [
819 | "['(@UKenyatta)',\n",
820 | " '(@ntvkenya)',\n",
821 | " '(@citizentvkenya)',\n",
822 | " '(@WilliamsRuto)',\n",
823 | " '(@KTNKenya)',\n",
824 | " '(@RailaOdinga)',\n",
825 | " '(@dailynation)',\n",
826 | " '(@LarryMadowo)',\n",
827 | " '(@C_NyaKundiH)',\n",
828 | " '(@KoinangeJeff)',\n",
829 | " '(@CapitalFMKenya)',\n",
830 | " '(@xtiandela)',\n",
831 | " '(@bobcollymore)',\n",
832 | " '(@SafaricomLtd)',\n",
833 | " '(@JulieGichuru)',\n",
834 | " '(@RobertAlai)',\n",
835 | " '(@MutahiNgunyi)',\n",
836 | " '(@bonifacemwangi)',\n",
837 | " '(@K24Tv)',\n",
838 | " '(@Ma3Route)',\n",
839 | " '(@KenyaRedCross)',\n",
840 | " '(@StandardKenya)',\n",
841 | " '(@VictorMochere)',\n",
842 | " '(@CKirubi)',\n",
843 | " '(@WMutunga)',\n",
844 | " '(@KideroEvans)',\n",
845 | " '(@Safaricom_Care)',\n",
846 | " '(@MarthaKarua)',\n",
847 | " '(@UNEP)',\n",
848 | " '(@johnallannamu)',\n",
849 | " '(@TheStarKenya)',\n",
850 | " '(@KenyaPower_Care)',\n",
851 | " '(@OleItumbi)',\n",
852 | " '(@KenyaAirways)',\n",
853 | " '(@Peter_Kenneth)',\n",
854 | " '(@StateHouseKenya)',\n",
855 | " '(@ItsMainaKageni)',\n",
856 | " '(@BD_Africa)',\n",
857 | " '(@sautisol)',\n",
858 | " '(@AMB_A_Mohammed)',\n",
859 | " '(@AnneKiguta)',\n",
860 | " '(@alykhansatchu)',\n",
861 | " '(@CarolRadull)',\n",
862 | " '(@Donsarigo)',\n",
863 | " '(@LinusKaikai)',\n",
864 | " '(@TerryanneChebet)',\n",
865 | " '(@lKeepltReal)',\n",
866 | " '(@WillisRaburu)',\n",
867 | " '(@lindahoguttu)',\n",
868 | " '(@IEBCKenya)',\n",
869 | " '(@MarkMasai)',\n",
870 | " '(@jamessmat)',\n",
871 | " '(@SakajaJohnson)',\n",
872 | " '(@MikeSonko)',\n",
873 | " '(@KTNNews)',\n",
874 | " '(@SmritiVidyarthi)',\n",
875 | " '(@VictorWanyama)',\n",
876 | " '(@BookLoversgate)',\n",
877 | " '(@iHub)',\n",
878 | " '(@rudishadavid)',\n",
879 | " '(@ItsMutai)',\n",
880 | " '(@kenyanpundit)',\n",
881 | " '(@ShaffieWeru)',\n",
882 | " '(@nimrodtaabu)',\n",
883 | " '(@PresidentKE)',\n",
884 | " '(@LuluHassan)',\n",
885 | " '(@jageyo)',\n",
886 | " '(@iGaddo)',\n",
887 | " '(@InteriorKE)',\n",
888 | " '(@MwalimChurchill)',\n",
889 | " '(@JamilaMohamed)',\n",
890 | " '(@issamichuzi)',\n",
891 | " '(@AIRTEL_KE)',\n",
892 | " '(@DStv_Kenya)',\n",
893 | " '(@ApostleMMWangui)',\n",
894 | " '(@Fchurii)',\n",
895 | " '(@mugokibati)',\n",
896 | " '(@USEmbassyKenya)',\n",
897 | " '(@tunajibu)',\n",
898 | " '(@JimmiGathu)',\n",
899 | " '(@KCBGroup)',\n",
900 | " '(@NationFMKe)',\n",
901 | " '(@IGkimaiyo)',\n",
902 | " '(@MpashoNews)',\n",
903 | " '(@sirajmaryan)',\n",
904 | " '(@OduorMartinO)',\n",
905 | " '(@LinusGitahi)',\n",
906 | " '(@MagicalKenya)',\n",
907 | " '(@GhaflaKenya)',\n",
908 | " '(@Moseax)',\n",
909 | " '(@KirigoNgarua)',\n",
910 | " '(@HomeboyzRadio)',\n",
911 | " '(@Ben_Kitili)',\n",
912 | " '(@bantigito)',\n",
913 | " '(@ObamaMalik)',\n",
914 | " '(@johngithongo)',\n",
915 | " '(@JimnahMbaru)',\n",
916 | " '(@africareview)',\n",
917 | " '(@The_EastAfrican)',\n",
918 | " '(@Edith_kimani)',\n",
919 | " '@RoyalTrendia']"
920 | ]
921 | },
922 | "execution_count": 36,
923 | "metadata": {},
924 | "output_type": "execute_result"
925 | }
926 | ],
927 | "source": [
928 | "# Keep every space-separated token containing '@' (parentheses are left in place).\n",
929 | "# A comprehension avoids rebinding the imported `string` module.\n",
930 | "k_1 = [\n",
931 | "    token for element in k\n",
932 | "    for token in element.split(' ') if '@' in token\n",
933 | "]\n",
934 | "k_1"
935 | ]
936 | },
937 | {
938 | "cell_type": "code",
939 | "execution_count": 37,
940 | "metadata": {},
941 | "outputs": [
942 | {
943 | "data": {
944 | "text/plain": [
945 | "['@UKenyatta',\n",
946 | " '@ntvkenya',\n",
947 | " '@citizentvkenya',\n",
948 | " '@WilliamsRuto',\n",
949 | " '@KTNKenya',\n",
950 | " '@RailaOdinga',\n",
951 | " '@dailynation',\n",
952 | " '@LarryMadowo',\n",
953 | " '@C_NyaKundiH',\n",
954 | " '@KoinangeJeff',\n",
955 | " '@CapitalFMKenya',\n",
956 | " '@xtiandela',\n",
957 | " '@bobcollymore',\n",
958 | " '@SafaricomLtd',\n",
959 | " '@JulieGichuru',\n",
960 | " '@RobertAlai',\n",
961 | " '@MutahiNgunyi',\n",
962 | " '@bonifacemwangi',\n",
963 | " '@K24Tv',\n",
964 | " '@Ma3Route',\n",
965 | " '@KenyaRedCross',\n",
966 | " '@StandardKenya',\n",
967 | " '@VictorMochere',\n",
968 | " '@CKirubi',\n",
969 | " '@WMutunga',\n",
970 | " '@KideroEvans',\n",
971 | " '@Safaricom_Care',\n",
972 | " '@MarthaKarua',\n",
973 | " '@UNEP',\n",
974 | " '@johnallannamu',\n",
975 | " '@TheStarKenya',\n",
976 | " '@KenyaPower_Care',\n",
977 | " '@OleItumbi',\n",
978 | " '@KenyaAirways',\n",
979 | " '@Peter_Kenneth',\n",
980 | " '@StateHouseKenya',\n",
981 | " '@ItsMainaKageni',\n",
982 | " '@BD_Africa',\n",
983 | " '@sautisol',\n",
984 | " '@AMB_A_Mohammed',\n",
985 | " '@AnneKiguta',\n",
986 | " '@alykhansatchu',\n",
987 | " '@CarolRadull',\n",
988 | " '@Donsarigo',\n",
989 | " '@LinusKaikai',\n",
990 | " '@TerryanneChebet',\n",
991 | " '@lKeepltReal',\n",
992 | " '@WillisRaburu',\n",
993 | " '@lindahoguttu',\n",
994 | " '@IEBCKenya',\n",
995 | " '@MarkMasai',\n",
996 | " '@jamessmat',\n",
997 | " '@SakajaJohnson',\n",
998 | " '@MikeSonko',\n",
999 | " '@KTNNews',\n",
1000 | " '@SmritiVidyarthi',\n",
1001 | " '@VictorWanyama',\n",
1002 | " '@BookLoversgate',\n",
1003 | " '@iHub',\n",
1004 | " '@rudishadavid',\n",
1005 | " '@ItsMutai',\n",
1006 | " '@kenyanpundit',\n",
1007 | " '@ShaffieWeru',\n",
1008 | " '@nimrodtaabu',\n",
1009 | " '@PresidentKE',\n",
1010 | " '@LuluHassan',\n",
1011 | " '@jageyo',\n",
1012 | " '@iGaddo',\n",
1013 | " '@InteriorKE',\n",
1014 | " '@MwalimChurchill',\n",
1015 | " '@JamilaMohamed',\n",
1016 | " '@issamichuzi',\n",
1017 | " '@AIRTEL_KE',\n",
1018 | " '@DStv_Kenya',\n",
1019 | " '@ApostleMMWangui',\n",
1020 | " '@Fchurii',\n",
1021 | " '@mugokibati',\n",
1022 | " '@USEmbassyKenya',\n",
1023 | " '@tunajibu',\n",
1024 | " '@JimmiGathu',\n",
1025 | " '@KCBGroup',\n",
1026 | " '@NationFMKe',\n",
1027 | " '@IGkimaiyo',\n",
1028 | " '@MpashoNews',\n",
1029 | " '@sirajmaryan',\n",
1030 | " '@OduorMartinO',\n",
1031 | " '@LinusGitahi',\n",
1032 | " '@MagicalKenya',\n",
1033 | " '@GhaflaKenya',\n",
1034 | " '@Moseax',\n",
1035 | " '@KirigoNgarua',\n",
1036 | " '@HomeboyzRadio',\n",
1037 | " '@Ben_Kitili',\n",
1038 | " '@bantigito',\n",
1039 | " '@ObamaMalik',\n",
1040 | " '@johngithongo',\n",
1041 | " '@JimnahMbaru',\n",
1042 | " '@africareview',\n",
1043 | " '@The_EastAfrican',\n",
1044 | " '@Edith_kimani',\n",
1045 | " 'RoyalTrendi']"
1046 | ]
1047 | },
1048 | "execution_count": 37,
1049 | "metadata": {},
1050 | "output_type": "execute_result"
1051 | }
1052 | ],
1053 | "source": [
1054 | "def trim_word(word, from_start=0, from_end=0):\n",
1055 | " return word[from_start:len(word) - from_end]\n",
1056 | "fellow_kenyans = []\n",
1057 | "for element in k_1:\n",
1058 | " element = trim_word(element, 1,1)\n",
1059 | " fellow_kenyans.append(element)\n",
1060 | " \n",
1061 | "fellow_kenyans"
1062 | ]
1063 | },
1064 | {
1065 | "cell_type": "code",
1066 | "execution_count": 38,
1067 | "metadata": {},
1068 | "outputs": [
1069 | {
1070 | "data": {
1071 | "text/html": [
1072 | "\n",
1073 | "\n",
1086 | "
\n",
1087 | " \n",
1088 | " \n",
1089 | " | \n",
1090 | " handles | \n",
1091 | "
\n",
1092 | " \n",
1093 | " \n",
1094 | " \n",
1095 | " 0 | \n",
1096 | " @UKenyatta | \n",
1097 | "
\n",
1098 | " \n",
1099 | " 1 | \n",
1100 | " @ntvkenya | \n",
1101 | "
\n",
1102 | " \n",
1103 | " 2 | \n",
1104 | " @citizentvkenya | \n",
1105 | "
\n",
1106 | " \n",
1107 | " 3 | \n",
1108 | " @WilliamsRuto | \n",
1109 | "
\n",
1110 | " \n",
1111 | " 4 | \n",
1112 | " @KTNKenya | \n",
1113 | "
\n",
1114 | " \n",
1115 | "
\n",
1116 | "
"
1117 | ],
1118 | "text/plain": [
1119 | " handles\n",
1120 | "0 @UKenyatta\n",
1121 | "1 @ntvkenya\n",
1122 | "2 @citizentvkenya\n",
1123 | "3 @WilliamsRuto\n",
1124 | "4 @KTNKenya"
1125 | ]
1126 | },
1127 | "execution_count": 38,
1128 | "metadata": {},
1129 | "output_type": "execute_result"
1130 | }
1131 | ],
1132 | "source": [
1133 | "fellow_kenyans_df = pd.DataFrame({'handles':fellow_kenyans})\n",
1134 | "fellow_kenyans_df.to_csv('fellow_kenyans_df.csv')\n",
1135 | "fellow_kenyans_df.head()"
1136 | ]
1137 | },
1138 | {
1139 | "cell_type": "markdown",
1140 | "metadata": {},
1141 | "source": [
1142 | "### Mega DF"
1143 | ]
1144 | },
1145 | {
1146 | "cell_type": "code",
1147 | "execution_count": 44,
1148 | "metadata": {},
1149 | "outputs": [
1150 | {
1151 | "data": {
1152 | "text/html": [
1153 | "\n",
1154 | "\n",
1167 | "
\n",
1168 | " \n",
1169 | " \n",
1170 | " | \n",
1171 | " handles | \n",
1172 | "
\n",
1173 | " \n",
1174 | " \n",
1175 | " \n",
1176 | " 0 | \n",
1177 | " @UKenyatta | \n",
1178 | "
\n",
1179 | " \n",
1180 | " 1 | \n",
1181 | " @ntvkenya | \n",
1182 | "
\n",
1183 | " \n",
1184 | " 2 | \n",
1185 | " @citizentvkenya | \n",
1186 | "
\n",
1187 | " \n",
1188 | " 3 | \n",
1189 | " @WilliamsRuto | \n",
1190 | "
\n",
1191 | " \n",
1192 | " 4 | \n",
1193 | " @KTNKenya | \n",
1194 | "
\n",
1195 | " \n",
1196 | "
\n",
1197 | "
"
1198 | ],
1199 | "text/plain": [
1200 | " handles\n",
1201 | "0 @UKenyatta\n",
1202 | "1 @ntvkenya\n",
1203 | "2 @citizentvkenya\n",
1204 | "3 @WilliamsRuto\n",
1205 | "4 @KTNKenya"
1206 | ]
1207 | },
1208 | "execution_count": 44,
1209 | "metadata": {},
1210 | "output_type": "execute_result"
1211 | }
1212 | ],
1213 | "source": [
1214 | "mega_df = pd.concat([fellow_kenyans_df, k_df, governors_df ])\n",
1215 | "mega_df = mega_df.drop_duplicates(subset = ['handles'])\n",
1216 | "mega_df.to_csv('mega2.csv')\n",
1217 | "mega_df.head()"
1218 | ]
1219 | },
1220 | {
1221 | "cell_type": "code",
1222 | "execution_count": 43,
1223 | "metadata": {},
1224 | "outputs": [
1225 | {
1226 | "data": {
1227 | "text/plain": [
1228 | "(183, 1)"
1229 | ]
1230 | },
1231 | "execution_count": 43,
1232 | "metadata": {},
1233 | "output_type": "execute_result"
1234 | }
1235 | ],
1236 | "source": [
1237 | "mega_df.shape"
1238 | ]
1239 | },
1240 | {
1241 | "cell_type": "markdown",
1242 | "metadata": {},
1243 | "source": [
1244 | "#### [Back to top](#More-Scraping)"
1245 | ]
1246 | },
1247 | {
1248 | "cell_type": "code",
1249 | "execution_count": null,
1250 | "metadata": {},
1251 | "outputs": [],
1252 | "source": []
1253 | }
1254 | ],
1255 | "metadata": {
1256 | "kernelspec": {
1257 | "display_name": "Python 3",
1258 | "language": "python",
1259 | "name": "python3"
1260 | },
1261 | "language_info": {
1262 | "codemirror_mode": {
1263 | "name": "ipython",
1264 | "version": 3
1265 | },
1266 | "file_extension": ".py",
1267 | "mimetype": "text/x-python",
1268 | "name": "python",
1269 | "nbconvert_exporter": "python",
1270 | "pygments_lexer": "ipython3",
1271 | "version": "3.7.4"
1272 | }
1273 | },
1274 | "nbformat": 4,
1275 | "nbformat_minor": 4
1276 | }
1277 |
--------------------------------------------------------------------------------
/super_bowl.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Super Bowl\n",
8 | "Concatenating all the data from the 3 scrapers. The dataframes were saved to CSVs.\n"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Site scrapers data"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/plain": [
36 | "(330, 2)"
37 | ]
38 | },
39 | "execution_count": 3,
40 | "metadata": {},
41 | "output_type": "execute_result"
42 | }
43 | ],
44 | "source": [
45 | "# from scraping\n",
46 | "mega_1 = pd.read_csv('mega1.csv')\n",
47 | "mega_2 = pd.read_csv('mega2.csv')\n",
48 | "mega_abs = pd.concat([mega_1, mega_2])\n",
49 | "mega_abs = mega_abs.drop_duplicates(subset = ['handles'])\n",
50 | "mega_abs.shape"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "### Tweeps scraper data"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 38,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/plain": [
68 | "(803, 7)"
69 | ]
70 | },
71 | "execution_count": 38,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "# from twitter. Keep only the users with popularity (followers - friends) >= 10k.\n",
78 | "_1 = pd.read_csv('starting users.csv')\n",
79 | "_2 = pd.read_csv('df_2 users.csv')\n",
80 | "_3 = pd.read_csv('df_3 users.csv')\n",
81 | "_4 = pd.read_csv('df_4 users.csv')\n",
82 | "_5 = pd.read_csv('df_5 users.csv')\n",
83 | "_6 = pd.read_csv('df_6 users.csv')\n",
84 | "_7 = pd.read_csv('df_7 users.csv')\n",
85 | "_8 = pd.read_csv('df_8 users.csv')\n",
86 | "_9 = pd.read_csv('df_9 users.csv')\n",
87 | "_0 = pd.read_csv('df_0 users.csv') \n",
88 | "_01 = pd.read_csv('df_01 users.csv') \n",
89 | "_02 = pd.read_csv('df_02 users.csv') \n",
90 | "_03 = pd.read_csv('df_03 users.csv') \n",
91 | "_04 = pd.read_csv('df_04 users.csv') \n",
92 | "_05 = pd.read_csv('df_05 users.csv') \n",
93 | " \n",
94 | "mega_df = pd.concat([_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _01, _02, _03, _04, _05])\n",
95 | "mega_df['popularity'] = mega_df['followers'] - mega_df['friends']\n",
96 | "mega_df = mega_df[mega_df.popularity >= 10000]\n",
97 | "mega_df= mega_df.drop_duplicates(subset = ['screen_name'])\n",
98 | "mega_df.shape"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "### SuperBowl"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 39,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/plain": [
116 | "(1066, 1)"
117 | ]
118 | },
119 | "execution_count": 39,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "names = pd.DataFrame(columns = ['handles'])\n",
126 | "names['handles'] = mega_df['screen_name']\n",
127 | "handles = pd.DataFrame(columns = ['handles'])\n",
128 | "handles['handles'] = mega_abs['handles']\n",
129 | "\n",
130 | "no_tags = names['handles'].to_list()\n",
131 | "def prepend(list, str): \n",
132 | " str += '{0}'\n",
133 | " list = [str.format(i) for i in list] \n",
134 | " return(list) \n",
135 | "tags = prepend(no_tags, '@')\n",
136 | "names['handles'] = tags\n",
137 | "names.head()\n",
138 | "\n",
139 | "\n",
140 | "super_bowl = pd.concat([names, handles])\n",
141 | "super_bowl = super_bowl.drop_duplicates(subset = 'handles')\n",
142 | "super_bowl.shape"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 40,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/html": [
153 | "\n",
154 | "\n",
167 | "
\n",
168 | " \n",
169 | " \n",
170 | " | \n",
171 | " handles | \n",
172 | "
\n",
173 | " \n",
174 | " \n",
175 | " \n",
176 | " 1 | \n",
177 | " @nanga_tp | \n",
178 | "
\n",
179 | " \n",
180 | " 2 | \n",
181 | " @KenInvest | \n",
182 | "
\n",
183 | " \n",
184 | " 12 | \n",
185 | " @chriskirwa | \n",
186 | "
\n",
187 | " \n",
188 | " 14 | \n",
189 | " @Thee_mavERIC | \n",
190 | "
\n",
191 | " \n",
192 | " 33 | \n",
193 | " @KTNNewsKE | \n",
194 | "
\n",
195 | " \n",
196 | "
\n",
197 | "
"
198 | ],
199 | "text/plain": [
200 | " handles\n",
201 | "1 @nanga_tp\n",
202 | "2 @KenInvest\n",
203 | "12 @chriskirwa\n",
204 | "14 @Thee_mavERIC\n",
205 | "33 @KTNNewsKE"
206 | ]
207 | },
208 | "execution_count": 40,
209 | "metadata": {},
210 | "output_type": "execute_result"
211 | }
212 | ],
213 | "source": [
214 | "super_bowl.to_csv('superbowl.csv')\n",
215 | "super_bowl.head()"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "[Back to top](#Super-Bowl)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": []
231 | }
232 | ],
233 | "metadata": {
234 | "kernelspec": {
235 | "display_name": "Python 3",
236 | "language": "python",
237 | "name": "python3"
238 | },
239 | "language_info": {
240 | "codemirror_mode": {
241 | "name": "ipython",
242 | "version": 3
243 | },
244 | "file_extension": ".py",
245 | "mimetype": "text/x-python",
246 | "name": "python",
247 | "nbconvert_exporter": "python",
248 | "pygments_lexer": "ipython3",
249 | "version": "3.7.4"
250 | }
251 | },
252 | "nbformat": 4,
253 | "nbformat_minor": 4
254 | }
255 |
--------------------------------------------------------------------------------
/tweeps_scraping_tweepy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tweeps scraping\n",
8 | "\n",
9 | "## Introduction\n",
10 | "Scraping Twitter handles of popular users from Twitter using tweepy. Geotags and hashtags are used to get Kenyan tweets, from which user IDs are extracted. The popular users (determined by followers and friends) are added to the dataframe and the less popular ones are dropped.\n",
11 | "\n",
12 | "The requests are split into batches so that errors are easier to pinpoint when they arise. \n",
13 | "\n",
14 | "## Table of Contents\n",
15 | "1. [Libraries](#Libraries)\n",
16 | "2. [Kenya's co-ordinates](#geo)\n",
17 | "3. [Geo Scraper](#Geo-Scraper)\n",
18 | "4. [Keyword Scraper](#Keyword-Scraper)\n",
19 | "5. [Mega DataFrame](#Mega-Df)\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Libraries"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 5,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "from requests import get\n",
36 | "from requests.exceptions import RequestException\n",
37 | "from contextlib import closing\n",
38 | "from bs4 import BeautifulSoup\n",
39 | "import pandas as pd\n",
40 | "import os, sys\n",
41 | "\n",
42 | "import fire\n",
43 | "\n",
44 | "import sys\n",
45 | "import os\n",
46 | "import json\n",
47 | "import matplotlib.pyplot as plt\n",
48 | "import re\n",
49 | "import string\n",
50 | "\n",
51 | "import matplotlib.dates as mdates\n",
52 | "import seaborn as sns\n",
53 | "\n",
54 | "# to view all columns\n",
55 | "pd.set_option(\"display.max.columns\", None)\n",
56 | "\n",
57 | "import tweepy\n",
58 | "from tweepy.streaming import StreamListener\n",
59 | "from tweepy import OAuthHandler\n",
60 | "from tweepy import Stream\n",
61 | "from tweepy import API\n",
62 | "from tweepy import Cursor\n",
63 | "from datetime import datetime, date, time, timedelta\n",
64 | "from collections import Counter\n",
65 | "import sys\n",
66 | "\n",
67 | "\n",
68 | "import preprocessor as p"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "### geo\n",
76 | " The first step is to get Kenya's geo coordinates from Twitter, then use them with the search method to fetch Kenyan tweets from Jan 1st this year. From the tweets, the users are extracted. The limit is set to 2k users/tweets, from which about 800 will be selected based on popularity."
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "consumer_key = os.environ.get('consumer_key')\n",
86 | "consumer_secret = os.environ.get('consumer_secret')\n",
87 | "access_token = os.environ.get('access_token')\n",
88 | "access_token_secret = os.environ.get('access_token_secret')\n",
89 | "\n",
90 | "auth = OAuthHandler(consumer_key, consumer_secret)\n",
91 | "auth.set_access_token(access_token, access_token_secret)\n",
92 | "auth_api = API(auth, wait_on_rate_limit=True)\n",
93 | "\n",
94 | "places = auth_api.geo_search(query=\"KENYA\", granularity=\"country\")\n",
95 | "place_id = places[0].id\n",
96 | "places"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### Geo Scraper"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 91,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/html": [
114 | "\n",
115 | "\n",
128 | "
\n",
129 | " \n",
130 | " \n",
131 | " | \n",
132 | " screen_name | \n",
133 | " followers | \n",
134 | " friends | \n",
135 | " description | \n",
136 | " tweets | \n",
137 | "
\n",
138 | " \n",
139 | " \n",
140 | " \n",
141 | " 0 | \n",
142 | " tyra_gracy | \n",
143 | " 333 | \n",
144 | " 317 | \n",
145 | " FAITH | \n",
146 | " 183 | \n",
147 | "
\n",
148 | " \n",
149 | " 1 | \n",
150 | " etadv | \n",
151 | " 4415 | \n",
152 | " 4285 | \n",
153 | " Advocate, Chairman LSK Nairobi Branch | \n",
154 | " 9745 | \n",
155 | "
\n",
156 | " \n",
157 | " 2 | \n",
158 | " LaRecetteNwar | \n",
159 | " 391 | \n",
160 | " 489 | \n",
161 | " french producer • credits : ISK • Bino • Jeune... | \n",
162 | " 5628 | \n",
163 | "
\n",
164 | " \n",
165 | " 3 | \n",
166 | " ParagraphKenya | \n",
167 | " 184 | \n",
168 | " 151 | \n",
169 | " Making News Interesting | \n",
170 | " 39594 | \n",
171 | "
\n",
172 | " \n",
173 | " 4 | \n",
174 | " BenjaminsOtenge | \n",
175 | " 1127 | \n",
176 | " 5002 | \n",
177 | " Male|Politik|Academician|\\n\\nA teacher by prof... | \n",
178 | " 12415 | \n",
179 | "
\n",
180 | " \n",
181 | "
\n",
182 | "
"
183 | ],
184 | "text/plain": [
185 | " screen_name followers friends \\\n",
186 | "0 tyra_gracy 333 317 \n",
187 | "1 etadv 4415 4285 \n",
188 | "2 LaRecetteNwar 391 489 \n",
189 | "3 ParagraphKenya 184 151 \n",
190 | "4 BenjaminsOtenge 1127 5002 \n",
191 | "\n",
192 | " description tweets \n",
193 | "0 FAITH 183 \n",
194 | "1 Advocate, Chairman LSK Nairobi Branch 9745 \n",
195 | "2 french producer • credits : ISK • Bino • Jeune... 5628 \n",
196 | "3 Making News Interesting 39594 \n",
197 | "4 Male|Politik|Academician|\\n\\nA teacher by prof... 12415 "
198 | ]
199 | },
200 | "execution_count": 91,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "tweets = tweepy.Cursor(auth_api.search,\n",
207 | " geocode=\"0.0236,37.9062,530km\", since_id = '2020-05-01').items(2000)\n",
208 | "users = []\n",
209 | "followers = []\n",
210 | "friends = []\n",
211 | "description = []\n",
212 | "tweet_count = []\n",
213 | "for status in tweets:\n",
214 | " name = status.user.screen_name\n",
215 | " desc = status.user.description\n",
216 | " number_of_tweets = status.user.statuses_count\n",
217 | " following = status.user.friends_count\n",
218 | " follower = status.user.followers_count\n",
219 | " users.append(name)\n",
220 | " followers.append(follower)\n",
221 | " friends.append(following)\n",
222 | " description.append(desc)\n",
223 | " tweet_count.append(number_of_tweets) \n",
224 | "\n",
225 | " \n",
226 | "data = {'screen_name':users, 'followers':followers, 'friends':friends, 'description':description, 'tweets':tweet_count}\n",
227 | "users_df = pd.DataFrame(data)\n",
228 | " \n",
229 | "# df['screen_name'] = users\n",
230 | "# df['location'] = location\n",
231 | " \n",
232 | "users_df.to_csv('starting users.csv')\n",
233 | "users_df.head()"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 93,
239 | "metadata": {},
240 | "outputs": [
241 | {
242 | "data": {
243 | "text/html": [
244 | "\n",
245 | "\n",
258 | "
\n",
259 | " \n",
260 | " \n",
261 | " | \n",
262 | " screen_name | \n",
263 | " followers | \n",
264 | " friends | \n",
265 | " description | \n",
266 | " tweets | \n",
267 | "
\n",
268 | " \n",
269 | " \n",
270 | " \n",
271 | " 0 | \n",
272 | " EmmaOltOlato | \n",
273 | " 4169 | \n",
274 | " 3991 | \n",
275 | " Fashion Model | King of the Iteso | Civil Eng... | \n",
276 | " 4259 | \n",
277 | "
\n",
278 | " \n",
279 | " 1 | \n",
280 | " AmbuchiJR | \n",
281 | " 45 | \n",
282 | " 71 | \n",
283 | " Writer, Photographer and Screenplay Director. ... | \n",
284 | " 37 | \n",
285 | "
\n",
286 | " \n",
287 | " 2 | \n",
288 | " mtieno_ | \n",
289 | " 233 | \n",
290 | " 425 | \n",
291 | " Tech Enthusiast | Born Sinner | A child of God... | \n",
292 | " 903 | \n",
293 | "
\n",
294 | " \n",
295 | " 3 | \n",
296 | " JonathanKaribu | \n",
297 | " 12253 | \n",
298 | " 6364 | \n",
299 | " God_fearing, Optimist,Always Faithful and humble | \n",
300 | " 61646 | \n",
301 | "
\n",
302 | " \n",
303 | " 4 | \n",
304 | " 2142bebe | \n",
305 | " 9 | \n",
306 | " 61 | \n",
307 | " | \n",
308 | " 3793 | \n",
309 | "
\n",
310 | " \n",
311 | "
\n",
312 | "
"
313 | ],
314 | "text/plain": [
315 | " screen_name followers friends \\\n",
316 | "0 EmmaOltOlato 4169 3991 \n",
317 | "1 AmbuchiJR 45 71 \n",
318 | "2 mtieno_ 233 425 \n",
319 | "3 JonathanKaribu 12253 6364 \n",
320 | "4 2142bebe 9 61 \n",
321 | "\n",
322 | " description tweets \n",
323 | "0 Fashion Model | King of the Iteso | Civil Eng... 4259 \n",
324 | "1 Writer, Photographer and Screenplay Director. ... 37 \n",
325 | "2 Tech Enthusiast | Born Sinner | A child of God... 903 \n",
326 | "3 God_fearing, Optimist,Always Faithful and humble 61646 \n",
327 | "4 3793 "
328 | ]
329 | },
330 | "execution_count": 93,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "tweets = tweepy.Cursor(auth_api.search,\n",
337 | " geocode=\"0.0236,37.9062,530km\", since_id = '2020-06-01').items(2000)\n",
338 | "users = []\n",
339 | "followers = []\n",
340 | "friends = []\n",
341 | "description = []\n",
342 | "tweet_count = []\n",
343 | "for status in tweets:\n",
344 | " name = status.user.screen_name\n",
345 | " desc = status.user.description\n",
346 | " number_of_tweets = status.user.statuses_count\n",
347 | " following = status.user.friends_count\n",
348 | " follower = status.user.followers_count\n",
349 | " users.append(name)\n",
350 | " followers.append(follower)\n",
351 | " friends.append(following)\n",
352 | " description.append(desc)\n",
353 | " tweet_count.append(number_of_tweets) \n",
354 | "\n",
355 | "data = {'screen_name':users, 'followers':followers, 'friends':friends, 'description':description, 'tweets':tweet_count}\n",
356 | "df_2 = pd.DataFrame(data)\n",
357 | " \n",
358 | "# df['screen_name'] = users\n",
359 | "# df['location'] = location\n",
360 | " \n",
361 | "df_2.to_csv('df_2 users.csv')\n",
362 | "df_2.head()"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 95,
368 | "metadata": {},
369 | "outputs": [
370 | {
371 | "data": {
372 | "text/html": [
373 | "\n",
374 | "\n",
387 | "
\n",
388 | " \n",
389 | " \n",
390 | " | \n",
391 | " screen_name | \n",
392 | " followers | \n",
393 | " friends | \n",
394 | " description | \n",
395 | " tweets | \n",
396 | "
\n",
397 | " \n",
398 | " \n",
399 | " \n",
400 | " 0 | \n",
401 | " purity_ng | \n",
402 | " 10225 | \n",
403 | " 137 | \n",
404 | " Loves good Music,lover of life/Daughter/Sister... | \n",
405 | " 46303 | \n",
406 | "
\n",
407 | " \n",
408 | " 1 | \n",
409 | " ngubesi | \n",
410 | " 825 | \n",
411 | " 5000 | \n",
412 | " | \n",
413 | " 4167 | \n",
414 | "
\n",
415 | " \n",
416 | " 2 | \n",
417 | " Jabuto_Jabuto | \n",
418 | " 2117 | \n",
419 | " 191 | \n",
420 | " Agribusiness Strategist | Agripreneur | Projec... | \n",
421 | " 59984 | \n",
422 | "
\n",
423 | " \n",
424 | " 3 | \n",
425 | " chief_sultan | \n",
426 | " 6966 | \n",
427 | " 5168 | \n",
428 | " 🅖🅞🅓'🅢 🅟🅛🅐🅝. | \n",
429 | " 42889 | \n",
430 | "
\n",
431 | " \n",
432 | " 4 | \n",
433 | " jdrizzy254 | \n",
434 | " 27031 | \n",
435 | " 13192 | \n",
436 | " God is above all \\nArsenal fan 🔴🔴🔴\\neconomist\\... | \n",
437 | " 43855 | \n",
438 | "
\n",
439 | " \n",
440 | "
\n",
441 | "
"
442 | ],
443 | "text/plain": [
444 | " screen_name followers friends \\\n",
445 | "0 purity_ng 10225 137 \n",
446 | "1 ngubesi 825 5000 \n",
447 | "2 Jabuto_Jabuto 2117 191 \n",
448 | "3 chief_sultan 6966 5168 \n",
449 | "4 jdrizzy254 27031 13192 \n",
450 | "\n",
451 | " description tweets \n",
452 | "0 Loves good Music,lover of life/Daughter/Sister... 46303 \n",
453 | "1 4167 \n",
454 | "2 Agribusiness Strategist | Agripreneur | Projec... 59984 \n",
455 | "3 🅖🅞🅓'🅢 🅟🅛🅐🅝. 42889 \n",
456 | "4 God is above all \\nArsenal fan 🔴🔴🔴\\neconomist\\... 43855 "
457 | ]
458 | },
459 | "execution_count": 95,
460 | "metadata": {},
461 | "output_type": "execute_result"
462 | }
463 | ],
464 | "source": [
465 | "tweets = tweepy.Cursor(auth_api.search,\n",
466 | " geocode=\"0.0236,37.9062,530km\", since_id = '2020-07-01').items(2000)\n",
467 | "users = []\n",
468 | "followers = []\n",
469 | "friends = []\n",
470 | "description = []\n",
471 | "tweet_count = []\n",
472 | "for status in tweets:\n",
473 | " name = status.user.screen_name\n",
474 | " desc = status.user.description\n",
475 | " number_of_tweets = status.user.statuses_count\n",
476 | " following = status.user.friends_count\n",
477 | " follower = status.user.followers_count\n",
478 | " users.append(name)\n",
479 | " followers.append(follower)\n",
480 | " friends.append(following)\n",
481 | " description.append(desc)\n",
482 | " tweet_count.append(number_of_tweets) \n",
483 | "\n",
484 | "data = {'screen_name':users, 'followers':followers, 'friends':friends, 'description':description, 'tweets':tweet_count}\n",
485 | "df_3 = pd.DataFrame(data)\n",
486 | " \n",
487 | "# df['screen_name'] = users\n",
488 | "# df['location'] = location\n",
489 | " \n",
490 | "df_3.to_csv('df_3 users.csv')\n",
491 | "df_3.head()"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 9,
497 | "metadata": {},
498 | "outputs": [
499 | {
500 | "data": {
501 | "text/html": [
502 | "\n",
503 | "\n",
516 | "
\n",
517 | " \n",
518 | " \n",
519 | " | \n",
520 | " screen_name | \n",
521 | " followers | \n",
522 | " friends | \n",
523 | " description | \n",
524 | " tweets | \n",
525 | "
\n",
526 | " \n",
527 | " \n",
528 | " \n",
529 | " 0 | \n",
530 | " juscloedenise | \n",
531 | " 7789 | \n",
532 | " 7524 | \n",
533 | " Not today | \n",
534 | " 15526 | \n",
535 | "
\n",
536 | " \n",
537 | " 1 | \n",
538 | " ErickssonOdhia2 | \n",
539 | " 427 | \n",
540 | " 770 | \n",
541 | " I'm a happy soul full of intrigue and colors\\n... | \n",
542 | " 9591 | \n",
543 | "
\n",
544 | " \n",
545 | " 2 | \n",
546 | " kidaha_alfred | \n",
547 | " 411 | \n",
548 | " 316 | \n",
549 | " A Communication Lecturer at Multimedia Univers... | \n",
550 | " 159 | \n",
551 | "
\n",
552 | " \n",
553 | " 3 | \n",
554 | " anselme_kwizera | \n",
555 | " 7 | \n",
556 | " 125 | \n",
557 | " Am proud to be Rwandan | \n",
558 | " 21 | \n",
559 | "
\n",
560 | " \n",
561 | " 4 | \n",
562 | " DanKweno | \n",
563 | " 9 | \n",
564 | " 70 | \n",
565 | " | \n",
566 | " 79 | \n",
567 | "
\n",
568 | " \n",
569 | "
\n",
570 | "
"
571 | ],
572 | "text/plain": [
573 | " screen_name followers friends \\\n",
574 | "0 juscloedenise 7789 7524 \n",
575 | "1 ErickssonOdhia2 427 770 \n",
576 | "2 kidaha_alfred 411 316 \n",
577 | "3 anselme_kwizera 7 125 \n",
578 | "4 DanKweno 9 70 \n",
579 | "\n",
580 | " description tweets \n",
581 | "0 Not today 15526 \n",
582 | "1 I'm a happy soul full of intrigue and colors\\n... 9591 \n",
583 | "2 A Communication Lecturer at Multimedia Univers... 159 \n",
584 | "3 Am proud to be Rwandan 21 \n",
585 | "4 79 "
586 | ]
587 | },
588 | "execution_count": 9,
589 | "metadata": {},
590 | "output_type": "execute_result"
591 | }
592 | ],
593 | "source": [
594 | "tweets = tweepy.Cursor(auth_api.search,\n",
595 | " geocode=\"0.0236,37.9062,530km\", since_id = '2020-04-01').items(2000)\n",
596 | "users = []\n",
597 | "followers = []\n",
598 | "friends = []\n",
599 | "description = []\n",
600 | "tweet_count = []\n",
601 | "for status in tweets:\n",
602 | " name = status.user.screen_name\n",
603 | " desc = status.user.description\n",
604 | " number_of_tweets = status.user.statuses_count\n",
605 | " following = status.user.friends_count\n",
606 | " follower = status.user.followers_count\n",
607 | " users.append(name)\n",
608 | " followers.append(follower)\n",
609 | " friends.append(following)\n",
610 | " description.append(desc)\n",
611 | " tweet_count.append(number_of_tweets) \n",
612 | "\n",
613 | "data = {'screen_name':users, 'followers':followers, 'friends':friends, 'description':description, 'tweets':tweet_count}\n",
614 | "df_01 = pd.DataFrame(data)\n",
615 | " \n",
616 | "# df['screen_name'] = users\n",
617 | "# df['location'] = location\n",
618 | " \n",
619 | "df_01.to_csv('df_01 users.csv')\n",
620 | "df_01.head()"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 12,
626 | "metadata": {},
627 | "outputs": [
628 | {
629 | "data": {
630 | "text/html": [
631 | "\n",
632 | "\n",
645 | "
\n",
646 | " \n",
647 | " \n",
648 | " | \n",
649 | " screen_name | \n",
650 | " followers | \n",
651 | " friends | \n",
652 | " description | \n",
653 | " tweets | \n",
654 | "
\n",
655 | " \n",
656 | " \n",
657 | " \n",
658 | " 0 | \n",
659 | " mr_Okabi | \n",
660 | " 656 | \n",
661 | " 493 | \n",
662 | " Happiness is a myth | Manchester United Fan | ... | \n",
663 | " 11872 | \n",
664 | "
\n",
665 | " \n",
666 | " 1 | \n",
667 | " SDEKenya | \n",
668 | " 53648 | \n",
669 | " 280 | \n",
670 | " We take Entertainment seriously! Ultimate Keny... | \n",
671 | " 144816 | \n",
672 | "
\n",
673 | " \n",
674 | " 2 | \n",
675 | " Catalyst_Darsil | \n",
676 | " 6006 | \n",
677 | " 1320 | \n",
678 | " Lunatic psycho maniac💯💯😂\\nSocial influencer\\nT... | \n",
679 | " 4535 | \n",
680 | "
\n",
681 | " \n",
682 | " 3 | \n",
683 | " EdgarbrwnBrwn | \n",
684 | " 2386 | \n",
685 | " 4999 | \n",
686 | " Addicted to Reggae #VybezRadioke /RaceWalking ... | \n",
687 | " 37867 | \n",
688 | "
\n",
689 | " \n",
690 | " 4 | \n",
691 | " NzauPriscilla | \n",
692 | " 879 | \n",
693 | " 935 | \n",
694 | " Finance and Banking (Mauritius 2019). Commerce... | \n",
695 | " 6540 | \n",
696 | "
\n",
697 | " \n",
698 | "
\n",
699 | "
"
700 | ],
701 | "text/plain": [
702 | " screen_name followers friends \\\n",
703 | "0 mr_Okabi 656 493 \n",
704 | "1 SDEKenya 53648 280 \n",
705 | "2 Catalyst_Darsil 6006 1320 \n",
706 | "3 EdgarbrwnBrwn 2386 4999 \n",
707 | "4 NzauPriscilla 879 935 \n",
708 | "\n",
709 | " description tweets \n",
710 | "0 Happiness is a myth | Manchester United Fan | ... 11872 \n",
711 | "1 We take Entertainment seriously! Ultimate Keny... 144816 \n",
712 | "2 Lunatic psycho maniac💯💯😂\\nSocial influencer\\nT... 4535 \n",
713 | "3 Addicted to Reggae #VybezRadioke /RaceWalking ... 37867 \n",
714 | "4 Finance and Banking (Mauritius 2019). Commerce... 6540 "
715 | ]
716 | },
717 | "execution_count": 12,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "# Author profile stats for up to 2000 tweets found within 530 km of (0.0236, 37.9062).\n",
724 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-03-01' was not a date\n",
725 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
726 | "tweets = tweepy.Cursor(\n",
727 | "    auth_api.search,\n",
728 | "    q='since:2020-03-01',\n",
729 | "    geocode='0.0236,37.9062,530km',\n",
730 | ").items(2000)\n",
731 | "\n",
732 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
733 | "authors = [status.user for status in tweets]\n",
734 | "users = [author.screen_name for author in authors]\n",
735 | "followers = [author.followers_count for author in authors]\n",
736 | "friends = [author.friends_count for author in authors]\n",
737 | "description = [author.description for author in authors]\n",
738 | "tweet_count = [author.statuses_count for author in authors]\n",
739 | "\n",
740 | "data = {\n",
741 | "    'screen_name': users,\n",
742 | "    'followers': followers,\n",
743 | "    'friends': friends,\n",
744 | "    'description': description,\n",
745 | "    'tweets': tweet_count,\n",
746 | "}\n",
747 | "df_02 = pd.DataFrame(data)\n",
748 | "df_02.to_csv('df_02 users.csv')\n",
749 | "df_02.head()"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 19,
755 | "metadata": {},
756 | "outputs": [
757 | {
758 | "data": {
759 | "text/html": [
760 | "\n",
761 | "\n",
774 | "
\n",
775 | " \n",
776 | " \n",
777 | " | \n",
778 | " screen_name | \n",
779 | " followers | \n",
780 | " friends | \n",
781 | " description | \n",
782 | " tweets | \n",
783 | "
\n",
784 | " \n",
785 | " \n",
786 | " \n",
787 | " 0 | \n",
788 | " VinnieBaite | \n",
789 | " 17620 | \n",
790 | " 854 | \n",
791 | " ●Twitter Professor ●Actor ●Film technologist ●... | \n",
792 | " 14549 | \n",
793 | "
\n",
794 | " \n",
795 | " 1 | \n",
796 | " _sisiosontogago | \n",
797 | " 789 | \n",
798 | " 269 | \n",
799 | " si mika ang katumbas ng habang-buhay | \n",
800 | " 38105 | \n",
801 | "
\n",
802 | " \n",
803 | " 2 | \n",
804 | " Dayliff | \n",
805 | " 42799 | \n",
806 | " 28808 | \n",
807 | " We are the leaders in Water & Energy solutions... | \n",
808 | " 6008 | \n",
809 | "
\n",
810 | " \n",
811 | " 3 | \n",
812 | " iam_jaymoe | \n",
813 | " 11919 | \n",
814 | " 674 | \n",
815 | " | \n",
816 | " 23368 | \n",
817 | "
\n",
818 | " \n",
819 | " 4 | \n",
820 | " wanjerililiank | \n",
821 | " 516 | \n",
822 | " 611 | \n",
823 | " born n brought up ni kiambu county. working in... | \n",
824 | " 1843 | \n",
825 | "
\n",
826 | " \n",
827 | "
\n",
828 | "
"
829 | ],
830 | "text/plain": [
831 | " screen_name followers friends \\\n",
832 | "0 VinnieBaite 17620 854 \n",
833 | "1 _sisiosontogago 789 269 \n",
834 | "2 Dayliff 42799 28808 \n",
835 | "3 iam_jaymoe 11919 674 \n",
836 | "4 wanjerililiank 516 611 \n",
837 | "\n",
838 | " description tweets \n",
839 | "0 ●Twitter Professor ●Actor ●Film technologist ●... 14549 \n",
840 | "1 si mika ang katumbas ng habang-buhay 38105 \n",
841 | "2 We are the leaders in Water & Energy solutions... 6008 \n",
842 | "3 23368 \n",
843 | "4 born n brought up ni kiambu county. working in... 1843 "
844 | ]
845 | },
846 | "execution_count": 19,
847 | "metadata": {},
848 | "output_type": "execute_result"
849 | }
850 | ],
851 | "source": [
852 | "# Author profile stats for up to 2000 tweets found within 530 km of (0.0236, 37.9062).\n",
853 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-02-01' was not a date\n",
854 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
855 | "tweets = tweepy.Cursor(\n",
856 | "    auth_api.search,\n",
857 | "    q='since:2020-02-01',\n",
858 | "    geocode='0.0236,37.9062,530km',\n",
859 | ").items(2000)\n",
860 | "\n",
861 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
862 | "authors = [status.user for status in tweets]\n",
863 | "users = [author.screen_name for author in authors]\n",
864 | "followers = [author.followers_count for author in authors]\n",
865 | "friends = [author.friends_count for author in authors]\n",
866 | "description = [author.description for author in authors]\n",
867 | "tweet_count = [author.statuses_count for author in authors]\n",
868 | "\n",
869 | "data = {\n",
870 | "    'screen_name': users,\n",
871 | "    'followers': followers,\n",
872 | "    'friends': friends,\n",
873 | "    'description': description,\n",
874 | "    'tweets': tweet_count,\n",
875 | "}\n",
876 | "df_03 = pd.DataFrame(data)\n",
877 | "df_03.to_csv('df_03 users.csv')\n",
878 | "df_03.head()"
879 | ]
880 | },
881 | {
882 | "cell_type": "code",
883 | "execution_count": 16,
884 | "metadata": {},
885 | "outputs": [
886 | {
887 | "data": {
888 | "text/html": [
889 | "\n",
890 | "\n",
903 | "
\n",
904 | " \n",
905 | " \n",
906 | " | \n",
907 | " screen_name | \n",
908 | " followers | \n",
909 | " friends | \n",
910 | " description | \n",
911 | " tweets | \n",
912 | "
\n",
913 | " \n",
914 | " \n",
915 | " \n",
916 | " 0 | \n",
917 | " Vikemu_Ke | \n",
918 | " 4232 | \n",
919 | " 3243 | \n",
920 | " 🌟\\nCfc⚽\\nMy Love For Football Is Underrated 😅\\... | \n",
921 | " 18796 | \n",
922 | "
\n",
923 | " \n",
924 | " 1 | \n",
925 | " GKILINDAH | \n",
926 | " 2 | \n",
927 | " 25 | \n",
928 | " | \n",
929 | " 40 | \n",
930 | "
\n",
931 | " \n",
932 | " 2 | \n",
933 | " SamuelOmerhi | \n",
934 | " 1064 | \n",
935 | " 1242 | \n",
936 | " THE NEXT BIG NAME‼️ | \n",
937 | " 830 | \n",
938 | "
\n",
939 | " \n",
940 | " 3 | \n",
941 | " Isebania_Finest | \n",
942 | " 3447 | \n",
943 | " 4993 | \n",
944 | " #Dm for business promotions and Ads. |We are h... | \n",
945 | " 4620 | \n",
946 | "
\n",
947 | " \n",
948 | " 4 | \n",
949 | " pizzainnke | \n",
950 | " 1458 | \n",
951 | " 92 | \n",
952 | " Giving you nothing but the best, yummy Pizza! | \n",
953 | " 1301 | \n",
954 | "
\n",
955 | " \n",
956 | "
\n",
957 | "
"
958 | ],
959 | "text/plain": [
960 | " screen_name followers friends \\\n",
961 | "0 Vikemu_Ke 4232 3243 \n",
962 | "1 GKILINDAH 2 25 \n",
963 | "2 SamuelOmerhi 1064 1242 \n",
964 | "3 Isebania_Finest 3447 4993 \n",
965 | "4 pizzainnke 1458 92 \n",
966 | "\n",
967 | " description tweets \n",
968 | "0 🌟\\nCfc⚽\\nMy Love For Football Is Underrated 😅\\... 18796 \n",
969 | "1 40 \n",
970 | "2 THE NEXT BIG NAME‼️ 830 \n",
971 | "3 #Dm for business promotions and Ads. |We are h... 4620 \n",
972 | "4 Giving you nothing but the best, yummy Pizza! 1301 "
973 | ]
974 | },
975 | "execution_count": 16,
976 | "metadata": {},
977 | "output_type": "execute_result"
978 | }
979 | ],
980 | "source": [
981 | "# Author profile stats for up to 2000 tweets found within 530 km of (0.0236, 37.9062).\n",
982 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-01-01' was not a date\n",
983 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
984 | "tweets = tweepy.Cursor(\n",
985 | "    auth_api.search,\n",
986 | "    q='since:2020-01-01',\n",
987 | "    geocode='0.0236,37.9062,530km',\n",
988 | ").items(2000)\n",
989 | "\n",
990 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
991 | "authors = [status.user for status in tweets]\n",
992 | "users = [author.screen_name for author in authors]\n",
993 | "followers = [author.followers_count for author in authors]\n",
994 | "friends = [author.friends_count for author in authors]\n",
995 | "description = [author.description for author in authors]\n",
996 | "tweet_count = [author.statuses_count for author in authors]\n",
997 | "\n",
998 | "data = {\n",
999 | "    'screen_name': users,\n",
1000 | "    'followers': followers,\n",
1001 | "    'friends': friends,\n",
1002 | "    'description': description,\n",
1003 | "    'tweets': tweet_count,\n",
1004 | "}\n",
1005 | "df_04 = pd.DataFrame(data)\n",
1006 | "df_04.to_csv('df_04 users.csv')\n",
1007 | "df_04.head()"
1008 | ]
1009 | },
1010 | {
1011 | "cell_type": "code",
1012 | "execution_count": 14,
1013 | "metadata": {},
1014 | "outputs": [
1015 | {
1016 | "data": {
1017 | "text/html": [
1018 | "\n",
1019 | "\n",
1032 | "
\n",
1033 | " \n",
1034 | " \n",
1035 | " | \n",
1036 | " screen_name | \n",
1037 | " followers | \n",
1038 | " friends | \n",
1039 | " description | \n",
1040 | " tweets | \n",
1041 | "
\n",
1042 | " \n",
1043 | " \n",
1044 | " \n",
1045 | " 0 | \n",
1046 | " BeckyKwagala | \n",
1047 | " 620 | \n",
1048 | " 599 | \n",
1049 | " Loves to cook. 🤗 Potential public speaker. Gun... | \n",
1050 | " 1274 | \n",
1051 | "
\n",
1052 | " \n",
1053 | " 1 | \n",
1054 | " Kubaimugo1 | \n",
1055 | " 159 | \n",
1056 | " 1132 | \n",
1057 | " Pauline njeris husband father of Wambui n Loui... | \n",
1058 | " 447 | \n",
1059 | "
\n",
1060 | " \n",
1061 | " 2 | \n",
1062 | " Kush_Kahima | \n",
1063 | " 2153 | \n",
1064 | " 1527 | \n",
1065 | " Student🌺 || Mark 10:27 || @Arsenal die hard ||... | \n",
1066 | " 2303 | \n",
1067 | "
\n",
1068 | " \n",
1069 | " 3 | \n",
1070 | " KarokiNjogu | \n",
1071 | " 1351 | \n",
1072 | " 1252 | \n",
1073 | " Contributing To Entropy Since 1998.\\n🇰🇪 ➡️ 🌍 | \n",
1074 | " 7693 | \n",
1075 | "
\n",
1076 | " \n",
1077 | " 4 | \n",
1078 | " Jr_LincolnKE | \n",
1079 | " 122464 | \n",
1080 | " 1592 | \n",
1081 | " Your Favourite Voice of Reason! Digital Market... | \n",
1082 | " 54243 | \n",
1083 | "
\n",
1084 | " \n",
1085 | "
\n",
1086 | "
"
1087 | ],
1088 | "text/plain": [
1089 | " screen_name followers friends \\\n",
1090 | "0 BeckyKwagala 620 599 \n",
1091 | "1 Kubaimugo1 159 1132 \n",
1092 | "2 Kush_Kahima 2153 1527 \n",
1093 | "3 KarokiNjogu 1351 1252 \n",
1094 | "4 Jr_LincolnKE 122464 1592 \n",
1095 | "\n",
1096 | " description tweets \n",
1097 | "0 Loves to cook. 🤗 Potential public speaker. Gun... 1274 \n",
1098 | "1 Pauline njeris husband father of Wambui n Loui... 447 \n",
1099 | "2 Student🌺 || Mark 10:27 || @Arsenal die hard ||... 2303 \n",
1100 | "3 Contributing To Entropy Since 1998.\\n🇰🇪 ➡️ 🌍 7693 \n",
1101 | "4 Your Favourite Voice of Reason! Digital Market... 54243 "
1102 | ]
1103 | },
1104 | "execution_count": 14,
1105 | "metadata": {},
1106 | "output_type": "execute_result"
1107 | }
1108 | ],
1109 | "source": [
1110 | "# Author profile stats for up to 2000 tweets found within 530 km of (0.0236, 37.9062).\n",
1111 | "# `since_id` expects a numeric tweet ID, so the former since_id='2019-12-01' was not a date\n",
1112 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
1113 | "tweets = tweepy.Cursor(\n",
1114 | "    auth_api.search,\n",
1115 | "    q='since:2019-12-01',\n",
1116 | "    geocode='0.0236,37.9062,530km',\n",
1117 | ").items(2000)\n",
1118 | "\n",
1119 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
1120 | "authors = [status.user for status in tweets]\n",
1121 | "users = [author.screen_name for author in authors]\n",
1122 | "followers = [author.followers_count for author in authors]\n",
1123 | "friends = [author.friends_count for author in authors]\n",
1124 | "description = [author.description for author in authors]\n",
1125 | "tweet_count = [author.statuses_count for author in authors]\n",
1126 | "\n",
1127 | "data = {\n",
1128 | "    'screen_name': users,\n",
1129 | "    'followers': followers,\n",
1130 | "    'friends': friends,\n",
1131 | "    'description': description,\n",
1132 | "    'tweets': tweet_count,\n",
1133 | "}\n",
1134 | "df_05 = pd.DataFrame(data)\n",
1135 | "df_05.to_csv('df_05 users.csv')\n",
1136 | "df_05.head()"
1137 | ]
1138 | },
1139 | {
1140 | "cell_type": "markdown",
1141 | "metadata": {},
1142 | "source": [
1143 | "#### Geo df"
1144 | ]
1145 | },
1146 | {
1147 | "cell_type": "code",
1148 | "execution_count": 139,
1149 | "metadata": {},
1150 | "outputs": [
1151 | {
1152 | "data": {
1153 | "text/plain": [
1154 | "(6000, 5)"
1155 | ]
1156 | },
1157 | "execution_count": 139,
1158 | "metadata": {},
1159 | "output_type": "execute_result"
1160 | }
1161 | ],
1162 | "source": [
1163 | "df = pd.concat([users_df, df_2, df_3], ignore_index=True)  # renumber rows 0..n-1 so labels from the three inputs cannot collide\n",
1164 | "df.shape"
1165 | ]
1166 | },
1167 | {
1168 | "cell_type": "code",
1169 | "execution_count": 140,
1170 | "metadata": {},
1171 | "outputs": [
1172 | {
1173 | "data": {
1174 | "text/plain": [
1175 | "(3162, 6)"
1176 | ]
1177 | },
1178 | "execution_count": 140,
1179 | "metadata": {},
1180 | "output_type": "execute_result"
1181 | }
1182 | ],
1183 | "source": [
1184 | "# Popularity = followers - friends; rank by it, then keep each user's first (highest-ranked) row.\n",
1185 | "df = df.assign(popularity=df['followers'] - df['friends']).sort_values(by='popularity', ascending=False)\n",
1186 | "df = df.drop_duplicates(subset='screen_name')\n",
1187 | "df.shape"
1188 | ]
1189 | },
1190 | {
1191 | "cell_type": "code",
1192 | "execution_count": 141,
1193 | "metadata": {},
1194 | "outputs": [
1195 | {
1196 | "data": {
1197 | "text/plain": [
1198 | "(62, 6)"
1199 | ]
1200 | },
1201 | "execution_count": 141,
1202 | "metadata": {},
1203 | "output_type": "execute_result"
1204 | }
1205 | ],
1206 | "source": [
1207 | "# Keep only the users whose popularity score is at least 50,000.\n",
1208 | "new_df = df.loc[df['popularity'].ge(50000)]\n",
1209 | "new_df.shape"
1210 | ]
1211 | },
1212 | {
1213 | "cell_type": "code",
1214 | "execution_count": 111,
1215 | "metadata": {},
1216 | "outputs": [
1217 | {
1218 | "data": {
1219 | "text/html": [
1220 | "\n",
1221 | "\n",
1234 | "
\n",
1235 | " \n",
1236 | " \n",
1237 | " | \n",
1238 | " screen_name | \n",
1239 | " followers | \n",
1240 | " friends | \n",
1241 | " description | \n",
1242 | " tweets | \n",
1243 | " popularity | \n",
1244 | "
\n",
1245 | " \n",
1246 | " \n",
1247 | " \n",
1248 | " 1639 | \n",
1249 | " citizentvkenya | \n",
1250 | " 2715770 | \n",
1251 | " 219 | \n",
1252 | " Kenya's Premier TV Station. Breaking News / En... | \n",
1253 | " 535035 | \n",
1254 | " 2715551 | \n",
1255 | "
\n",
1256 | " \n",
1257 | " 887 | \n",
1258 | " ntvkenya | \n",
1259 | " 2551044 | \n",
1260 | " 262 | \n",
1261 | " LIVE STREAM: https://t.co/BrEeOBOxu6 _________... | \n",
1262 | " 491465 | \n",
1263 | " 2550782 | \n",
1264 | "
\n",
1265 | " \n",
1266 | " 1684 | \n",
1267 | " dailynation | \n",
1268 | " 2142307 | \n",
1269 | " 262 | \n",
1270 | " Welcome to Daily Nation on Twitter, for instan... | \n",
1271 | " 553869 | \n",
1272 | " 2142045 | \n",
1273 | "
\n",
1274 | " \n",
1275 | " 1269 | \n",
1276 | " RobertAlai | \n",
1277 | " 1498640 | \n",
1278 | " 3353 | \n",
1279 | " The Equalizer | There’s no comfort in the trut... | \n",
1280 | " 244149 | \n",
1281 | " 1495287 | \n",
1282 | "
\n",
1283 | " \n",
1284 | " 1569 | \n",
1285 | " CKirubi | \n",
1286 | " 1432476 | \n",
1287 | " 1091 | \n",
1288 | " Media Owner | Entrepreneur | Mentor. A trailbl... | \n",
1289 | " 27725 | \n",
1290 | " 1431385 | \n",
1291 | "
\n",
1292 | " \n",
1293 | " ... | \n",
1294 | " ... | \n",
1295 | " ... | \n",
1296 | " ... | \n",
1297 | " ... | \n",
1298 | " ... | \n",
1299 | " ... | \n",
1300 | "
\n",
1301 | " \n",
1302 | " 247 | \n",
1303 | " DorahManya | \n",
1304 | " 32044 | \n",
1305 | " 367 | \n",
1306 | " Award winning Radio Presenter |God fearing |Yo... | \n",
1307 | " 189944 | \n",
1308 | " 31677 | \n",
1309 | "
\n",
1310 | " \n",
1311 | " 230 | \n",
1312 | " holydave | \n",
1313 | " 34236 | \n",
1314 | " 2798 | \n",
1315 | " PhD Student | Financial Modelling Expert | Art... | \n",
1316 | " 38324 | \n",
1317 | " 31438 | \n",
1318 | "
\n",
1319 | " \n",
1320 | " 1742 | \n",
1321 | " Lasha254 | \n",
1322 | " 54219 | \n",
1323 | " 23019 | \n",
1324 | " I'm not your ordinary B 🙄 Girls who know sport... | \n",
1325 | " 11965 | \n",
1326 | " 31200 | \n",
1327 | "
\n",
1328 | " \n",
1329 | " 1029 | \n",
1330 | " SKachima | \n",
1331 | " 34059 | \n",
1332 | " 3419 | \n",
1333 | " Simba SC ❤️ManUtd ❤️ Libra ⭐ Good 🎶🇹🇿 Psalm 23✌️ | \n",
1334 | " 33308 | \n",
1335 | " 30640 | \n",
1336 | "
\n",
1337 | " \n",
1338 | " 1329 | \n",
1339 | " discjockeyvin | \n",
1340 | " 30528 | \n",
1341 | " 356 | \n",
1342 | " PROFESSIONAL DJ (CLUB, MOBILE, TV & RADIO, CON... | \n",
1343 | " 3133 | \n",
1344 | " 30172 | \n",
1345 | "
\n",
1346 | " \n",
1347 | "
\n",
1348 | "
87 rows × 6 columns
\n",
1349 | "
"
1350 | ],
1351 | "text/plain": [
1352 | " screen_name followers friends \\\n",
1353 | "1639 citizentvkenya 2715770 219 \n",
1354 | "887 ntvkenya 2551044 262 \n",
1355 | "1684 dailynation 2142307 262 \n",
1356 | "1269 RobertAlai 1498640 3353 \n",
1357 | "1569 CKirubi 1432476 1091 \n",
1358 | "... ... ... ... \n",
1359 | "247 DorahManya 32044 367 \n",
1360 | "230 holydave 34236 2798 \n",
1361 | "1742 Lasha254 54219 23019 \n",
1362 | "1029 SKachima 34059 3419 \n",
1363 | "1329 discjockeyvin 30528 356 \n",
1364 | "\n",
1365 | " description tweets popularity \n",
1366 | "1639 Kenya's Premier TV Station. Breaking News / En... 535035 2715551 \n",
1367 | "887 LIVE STREAM: https://t.co/BrEeOBOxu6 _________... 491465 2550782 \n",
1368 | "1684 Welcome to Daily Nation on Twitter, for instan... 553869 2142045 \n",
1369 | "1269 The Equalizer | There’s no comfort in the trut... 244149 1495287 \n",
1370 | "1569 Media Owner | Entrepreneur | Mentor. A trailbl... 27725 1431385 \n",
1371 | "... ... ... ... \n",
1372 | "247 Award winning Radio Presenter |God fearing |Yo... 189944 31677 \n",
1373 | "230 PhD Student | Financial Modelling Expert | Art... 38324 31438 \n",
1374 | "1742 I'm not your ordinary B 🙄 Girls who know sport... 11965 31200 \n",
1375 | "1029 Simba SC ❤️ManUtd ❤️ Libra ⭐ Good 🎶🇹🇿 Psalm 23✌️ 33308 30640 \n",
1376 | "1329 PROFESSIONAL DJ (CLUB, MOBILE, TV & RADIO, CON... 3133 30172 \n",
1377 | "\n",
1378 | "[87 rows x 6 columns]"
1379 | ]
1380 | },
1381 | "execution_count": 111,
1382 | "metadata": {},
1383 | "output_type": "execute_result"
1384 | }
1385 | ],
1386 | "source": [
1387 | "new_df"
1388 | ]
1389 | },
1390 | {
1391 | "cell_type": "markdown",
1392 | "metadata": {},
1393 | "source": [
1394 | "### Keyword Scraper"
1395 | ]
1396 | },
1397 | {
1398 | "cell_type": "code",
1399 | "execution_count": 7,
1400 | "metadata": {},
1401 | "outputs": [
1402 | {
1403 | "data": {
1404 | "text/html": [
1405 | "\n",
1406 | "\n",
1419 | "
\n",
1420 | " \n",
1421 | " \n",
1422 | " | \n",
1423 | " screen_name | \n",
1424 | " followers | \n",
1425 | " friends | \n",
1426 | " description | \n",
1427 | " tweets | \n",
1428 | "
\n",
1429 | " \n",
1430 | " \n",
1431 | " \n",
1432 | " 0 | \n",
1433 | " valleysOfGod | \n",
1434 | " 1972 | \n",
1435 | " 1980 | \n",
1436 | " political scientist,,upcoming entrepreneur,,so... | \n",
1437 | " 1189 | \n",
1438 | "
\n",
1439 | " \n",
1440 | " 1 | \n",
1441 | " spinkingske | \n",
1442 | " 696 | \n",
1443 | " 117 | \n",
1444 | " The Bicycle is a simple Solution to some of th... | \n",
1445 | " 1944 | \n",
1446 | "
\n",
1447 | " \n",
1448 | " 2 | \n",
1449 | " DNyarere | \n",
1450 | " 7 | \n",
1451 | " 0 | \n",
1452 | " | \n",
1453 | " 601 | \n",
1454 | "
\n",
1455 | " \n",
1456 | " 3 | \n",
1457 | " luckydun | \n",
1458 | " 127 | \n",
1459 | " 270 | \n",
1460 | " | \n",
1461 | " 741 | \n",
1462 | "
\n",
1463 | " \n",
1464 | " 4 | \n",
1465 | " _matheka_a | \n",
1466 | " 587 | \n",
1467 | " 384 | \n",
1468 | " | \n",
1469 | " 8915 | \n",
1470 | "
\n",
1471 | " \n",
1472 | "
\n",
1473 | "
"
1474 | ],
1475 | "text/plain": [
1476 | " screen_name followers friends \\\n",
1477 | "0 valleysOfGod 1972 1980 \n",
1478 | "1 spinkingske 696 117 \n",
1479 | "2 DNyarere 7 0 \n",
1480 | "3 luckydun 127 270 \n",
1481 | "4 _matheka_a 587 384 \n",
1482 | "\n",
1483 | " description tweets \n",
1484 | "0 political scientist,,upcoming entrepreneur,,so... 1189 \n",
1485 | "1 The Bicycle is a simple Solution to some of th... 1944 \n",
1486 | "2 601 \n",
1487 | "3 741 \n",
1488 | "4 8915 "
1489 | ]
1490 | },
1491 | "execution_count": 7,
1492 | "metadata": {},
1493 | "output_type": "execute_result"
1494 | }
1495 | ],
1496 | "source": [
1497 | "# Author profile stats for up to 2000 tweets matching the keyword 'Kenya'.\n",
1498 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-07-01' was not a date\n",
1499 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
1500 | "tweets = tweepy.Cursor(\n",
1501 | "    auth_api.search,\n",
1502 | "    q='Kenya since:2020-07-01',\n",
1503 | ").items(2000)\n",
1504 | "\n",
1505 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
1506 | "authors = [status.user for status in tweets]\n",
1507 | "users = [author.screen_name for author in authors]\n",
1508 | "followers = [author.followers_count for author in authors]\n",
1509 | "friends = [author.friends_count for author in authors]\n",
1510 | "description = [author.description for author in authors]\n",
1511 | "tweet_count = [author.statuses_count for author in authors]\n",
1512 | "\n",
1513 | "data = {\n",
1514 | "    'screen_name': users,\n",
1515 | "    'followers': followers,\n",
1516 | "    'friends': friends,\n",
1517 | "    'description': description,\n",
1518 | "    'tweets': tweet_count,\n",
1519 | "}\n",
1520 | "df_4 = pd.DataFrame(data)\n",
1521 | "df_4.to_csv('df_4 users.csv')\n",
1522 | "df_4.head()"
1523 | ]
1524 | },
1525 | {
1526 | "cell_type": "code",
1527 | "execution_count": 114,
1528 | "metadata": {},
1529 | "outputs": [
1530 | {
1531 | "data": {
1532 | "text/html": [
1533 | "\n",
1534 | "\n",
1547 | "
\n",
1548 | " \n",
1549 | " \n",
1550 | " | \n",
1551 | " screen_name | \n",
1552 | " followers | \n",
1553 | " friends | \n",
1554 | " description | \n",
1555 | " tweets | \n",
1556 | "
\n",
1557 | " \n",
1558 | " \n",
1559 | " \n",
1560 | " 0 | \n",
1561 | " DStv_Kenya | \n",
1562 | " 341008 | \n",
1563 | " 12213 | \n",
1564 | " DStv is Kenya's top entertainment destination.... | \n",
1565 | " 317107 | \n",
1566 | "
\n",
1567 | " \n",
1568 | " 1 | \n",
1569 | " iam_warloque | \n",
1570 | " 412 | \n",
1571 | " 398 | \n",
1572 | " #LFC #30BG | \n",
1573 | " 366 | \n",
1574 | "
\n",
1575 | " \n",
1576 | " 2 | \n",
1577 | " its_vus | \n",
1578 | " 1692 | \n",
1579 | " 1446 | \n",
1580 | " | \n",
1581 | " 9253 | \n",
1582 | "
\n",
1583 | " \n",
1584 | " 3 | \n",
1585 | " 80daysbot | \n",
1586 | " 9 | \n",
1587 | " 0 | \n",
1588 | " 80 days of botting around the world :) | \n",
1589 | " 30836 | \n",
1590 | "
\n",
1591 | " \n",
1592 | " 4 | \n",
1593 | " TheKimutai | \n",
1594 | " 10702 | \n",
1595 | " 3692 | \n",
1596 | " Let go your earthly tether. Enter the void. Em... | \n",
1597 | " 206293 | \n",
1598 | "
\n",
1599 | " \n",
1600 | "
\n",
1601 | "
"
1602 | ],
1603 | "text/plain": [
1604 | " screen_name followers friends \\\n",
1605 | "0 DStv_Kenya 341008 12213 \n",
1606 | "1 iam_warloque 412 398 \n",
1607 | "2 its_vus 1692 1446 \n",
1608 | "3 80daysbot 9 0 \n",
1609 | "4 TheKimutai 10702 3692 \n",
1610 | "\n",
1611 | " description tweets \n",
1612 | "0 DStv is Kenya's top entertainment destination.... 317107 \n",
1613 | "1 #LFC #30BG 366 \n",
1614 | "2 9253 \n",
1615 | "3 80 days of botting around the world :) 30836 \n",
1616 | "4 Let go your earthly tether. Enter the void. Em... 206293 "
1617 | ]
1618 | },
1619 | "execution_count": 114,
1620 | "metadata": {},
1621 | "output_type": "execute_result"
1622 | }
1623 | ],
1624 | "source": [
1625 | "# Author profile stats for up to 2000 tweets matching the keyword 'Kenya'.\n",
1626 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-06-01' was not a date\n",
1627 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
1628 | "tweets = tweepy.Cursor(\n",
1629 | "    auth_api.search,\n",
1630 | "    q='Kenya since:2020-06-01',\n",
1631 | ").items(2000)\n",
1632 | "\n",
1633 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
1634 | "authors = [status.user for status in tweets]\n",
1635 | "users = [author.screen_name for author in authors]\n",
1636 | "followers = [author.followers_count for author in authors]\n",
1637 | "friends = [author.friends_count for author in authors]\n",
1638 | "description = [author.description for author in authors]\n",
1639 | "tweet_count = [author.statuses_count for author in authors]\n",
1640 | "\n",
1641 | "data = {\n",
1642 | "    'screen_name': users,\n",
1643 | "    'followers': followers,\n",
1644 | "    'friends': friends,\n",
1645 | "    'description': description,\n",
1646 | "    'tweets': tweet_count,\n",
1647 | "}\n",
1648 | "df_5 = pd.DataFrame(data)\n",
1649 | "df_5.to_csv('df_5 users.csv')\n",
1650 | "df_5.head()"
1651 | ]
1652 | },
1653 | {
1654 | "cell_type": "code",
1655 | "execution_count": 115,
1656 | "metadata": {},
1657 | "outputs": [
1658 | {
1659 | "data": {
1660 | "text/html": [
1661 | "\n",
1662 | "\n",
1675 | "
\n",
1676 | " \n",
1677 | " \n",
1678 | " | \n",
1679 | " screen_name | \n",
1680 | " followers | \n",
1681 | " friends | \n",
1682 | " description | \n",
1683 | " tweets | \n",
1684 | "
\n",
1685 | " \n",
1686 | " \n",
1687 | " \n",
1688 | " 0 | \n",
1689 | " KJeremiahN | \n",
1690 | " 2109 | \n",
1691 | " 660 | \n",
1692 | " Outgoing and easy to associate with...Any view... | \n",
1693 | " 109414 | \n",
1694 | "
\n",
1695 | " \n",
1696 | " 1 | \n",
1697 | " annitii | \n",
1698 | " 2402 | \n",
1699 | " 2510 | \n",
1700 | " FOLLOWING 🇰🇪 KENYANS ONLY | \n",
1701 | " 6483 | \n",
1702 | "
\n",
1703 | " \n",
1704 | " 2 | \n",
1705 | " DaimaGreenSpace | \n",
1706 | " 39 | \n",
1707 | " 47 | \n",
1708 | " Advocating for inclusive GreenSpaces for all | \n",
1709 | " 37 | \n",
1710 | "
\n",
1711 | " \n",
1712 | " 3 | \n",
1713 | " geonal | \n",
1714 | " 1335 | \n",
1715 | " 872 | \n",
1716 | " Eccl. 9:11. Husband. Dreamer. Father.\\nStudent... | \n",
1717 | " 49244 | \n",
1718 | "
\n",
1719 | " \n",
1720 | " 4 | \n",
1721 | " mohd_Hassan_Ali | \n",
1722 | " 388 | \n",
1723 | " 180 | \n",
1724 | " So humble for human beings & Slave of ALLAH | \n",
1725 | " 1344 | \n",
1726 | "
\n",
1727 | " \n",
1728 | "
\n",
1729 | "
"
1730 | ],
1731 | "text/plain": [
1732 | " screen_name followers friends \\\n",
1733 | "0 KJeremiahN 2109 660 \n",
1734 | "1 annitii 2402 2510 \n",
1735 | "2 DaimaGreenSpace 39 47 \n",
1736 | "3 geonal 1335 872 \n",
1737 | "4 mohd_Hassan_Ali 388 180 \n",
1738 | "\n",
1739 | " description tweets \n",
1740 | "0 Outgoing and easy to associate with...Any view... 109414 \n",
1741 | "1 FOLLOWING 🇰🇪 KENYANS ONLY 6483 \n",
1742 | "2 Advocating for inclusive GreenSpaces for all 37 \n",
1743 | "3 Eccl. 9:11. Husband. Dreamer. Father.\\nStudent... 49244 \n",
1744 | "4 So humble for human beings & Slave of ALLAH 1344 "
1745 | ]
1746 | },
1747 | "execution_count": 115,
1748 | "metadata": {},
1749 | "output_type": "execute_result"
1750 | }
1751 | ],
1752 | "source": [
1753 | "# Author profile stats for up to 2000 tweets matching the keyword 'Kenya'.\n",
1754 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-05-01' was not a date\n",
1755 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
1756 | "tweets = tweepy.Cursor(\n",
1757 | "    auth_api.search,\n",
1758 | "    q='Kenya since:2020-05-01',\n",
1759 | ").items(2000)\n",
1760 | "\n",
1761 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
1762 | "authors = [status.user for status in tweets]\n",
1763 | "users = [author.screen_name for author in authors]\n",
1764 | "followers = [author.followers_count for author in authors]\n",
1765 | "friends = [author.friends_count for author in authors]\n",
1766 | "description = [author.description for author in authors]\n",
1767 | "tweet_count = [author.statuses_count for author in authors]\n",
1768 | "\n",
1769 | "data = {\n",
1770 | "    'screen_name': users,\n",
1771 | "    'followers': followers,\n",
1772 | "    'friends': friends,\n",
1773 | "    'description': description,\n",
1774 | "    'tweets': tweet_count,\n",
1775 | "}\n",
1776 | "df_6 = pd.DataFrame(data)\n",
1777 | "df_6.to_csv('df_6 users.csv')\n",
1778 | "df_6.head()"
1779 | ]
1780 | },
1781 | {
1782 | "cell_type": "code",
1783 | "execution_count": 117,
1784 | "metadata": {},
1785 | "outputs": [
1786 | {
1787 | "data": {
1788 | "text/html": [
1789 | "\n",
1790 | "\n",
1803 | "
\n",
1804 | " \n",
1805 | " \n",
1806 | " | \n",
1807 | " screen_name | \n",
1808 | " followers | \n",
1809 | " friends | \n",
1810 | " description | \n",
1811 | " tweets | \n",
1812 | "
\n",
1813 | " \n",
1814 | " \n",
1815 | " \n",
1816 | " 0 | \n",
1817 | " _dazils_ | \n",
1818 | " 467 | \n",
1819 | " 165 | \n",
1820 | " Fine young man with an old man mind. | \n",
1821 | " 28108 | \n",
1822 | "
\n",
1823 | " \n",
1824 | " 1 | \n",
1825 | " iamjoseh_ | \n",
1826 | " 46577 | \n",
1827 | " 741 | \n",
1828 | " A Human. Being. my back up account 👉🏾 @iam_joseh | \n",
1829 | " 249788 | \n",
1830 | "
\n",
1831 | " \n",
1832 | " 2 | \n",
1833 | " Waambui | \n",
1834 | " 35726 | \n",
1835 | " 68 | \n",
1836 | " “The capacity to learn is a gift; The ability ... | \n",
1837 | " 5992 | \n",
1838 | "
\n",
1839 | " \n",
1840 | " 3 | \n",
1841 | " drewgash | \n",
1842 | " 59 | \n",
1843 | " 48 | \n",
1844 | " African socialist admirer. | \n",
1845 | " 4038 | \n",
1846 | "
\n",
1847 | " \n",
1848 | " 4 | \n",
1849 | " lewys_chris | \n",
1850 | " 94 | \n",
1851 | " 421 | \n",
1852 | " Director Ndege Tausi https://t.co/fzuLaCzFfj A... | \n",
1853 | " 777 | \n",
1854 | "
\n",
1855 | " \n",
1856 | "
\n",
1857 | "
"
1858 | ],
1859 | "text/plain": [
1860 | " screen_name followers friends \\\n",
1861 | "0 _dazils_ 467 165 \n",
1862 | "1 iamjoseh_ 46577 741 \n",
1863 | "2 Waambui 35726 68 \n",
1864 | "3 drewgash 59 48 \n",
1865 | "4 lewys_chris 94 421 \n",
1866 | "\n",
1867 | " description tweets \n",
1868 | "0 Fine young man with an old man mind. 28108 \n",
1869 | "1 A Human. Being. my back up account 👉🏾 @iam_joseh 249788 \n",
1870 | "2 “The capacity to learn is a gift; The ability ... 5992 \n",
1871 | "3 African socialist admirer. 4038 \n",
1872 | "4 Director Ndege Tausi https://t.co/fzuLaCzFfj A... 777 "
1873 | ]
1874 | },
1875 | "execution_count": 117,
1876 | "metadata": {},
1877 | "output_type": "execute_result"
1878 | }
1879 | ],
1880 | "source": [
1881 | "# Author profile stats for up to 2000 tweets matching the keyword 'Kenya'.\n",
1882 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-04-01' was not a date\n",
1883 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
1884 | "tweets = tweepy.Cursor(\n",
1885 | "    auth_api.search,\n",
1886 | "    q='Kenya since:2020-04-01',\n",
1887 | ").items(2000)\n",
1888 | "\n",
1889 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
1890 | "authors = [status.user for status in tweets]\n",
1891 | "users = [author.screen_name for author in authors]\n",
1892 | "followers = [author.followers_count for author in authors]\n",
1893 | "friends = [author.friends_count for author in authors]\n",
1894 | "description = [author.description for author in authors]\n",
1895 | "tweet_count = [author.statuses_count for author in authors]\n",
1896 | "\n",
1897 | "data = {\n",
1898 | "    'screen_name': users,\n",
1899 | "    'followers': followers,\n",
1900 | "    'friends': friends,\n",
1901 | "    'description': description,\n",
1902 | "    'tweets': tweet_count,\n",
1903 | "}\n",
1904 | "df_7 = pd.DataFrame(data)\n",
1905 | "df_7.to_csv('df_7 users.csv')\n",
1906 | "df_7.head()"
1907 | ]
1908 | },
1909 | {
1910 | "cell_type": "code",
1911 | "execution_count": 129,
1912 | "metadata": {},
1913 | "outputs": [
1914 | {
1915 | "data": {
1916 | "text/html": [
1917 | "\n",
1918 | "\n",
1931 | "
\n",
1932 | " \n",
1933 | " \n",
1934 | " | \n",
1935 | " screen_name | \n",
1936 | " followers | \n",
1937 | " friends | \n",
1938 | " description | \n",
1939 | " tweets | \n",
1940 | "
\n",
1941 | " \n",
1942 | " \n",
1943 | " \n",
1944 | " 0 | \n",
1945 | " AherezaJanice | \n",
1946 | " 1037 | \n",
1947 | " 1199 | \n",
1948 | " Environmentalist🍃🌿🍀\\nDancer,less have fun 😅\\nB... | \n",
1949 | " 303 | \n",
1950 | "
\n",
1951 | " \n",
1952 | " 1 | \n",
1953 | " _nt__tn_ | \n",
1954 | " 319 | \n",
1955 | " 299 | \n",
1956 | " tu81 ee26, music addiction, namneung × noey × ... | \n",
1957 | " 137550 | \n",
1958 | "
\n",
1959 | " \n",
1960 | " 2 | \n",
1961 | " HonOuko | \n",
1962 | " 303 | \n",
1963 | " 331 | \n",
1964 | " change is inevitable | \n",
1965 | " 522 | \n",
1966 | "
\n",
1967 | " \n",
1968 | " 3 | \n",
1969 | " split2tee | \n",
1970 | " 523 | \n",
1971 | " 521 | \n",
1972 | " Humanitarian at heart, educator and an ardent ... | \n",
1973 | " 4217 | \n",
1974 | "
\n",
1975 | " \n",
1976 | " 4 | \n",
1977 | " Wellington_ILO | \n",
1978 | " 819 | \n",
1979 | " 294 | \n",
1980 | " This is the Official Twitter Account for the I... | \n",
1981 | " 2308 | \n",
1982 | "
\n",
1983 | " \n",
1984 | "
\n",
1985 | "
"
1986 | ],
1987 | "text/plain": [
1988 | " screen_name followers friends \\\n",
1989 | "0 AherezaJanice 1037 1199 \n",
1990 | "1 _nt__tn_ 319 299 \n",
1991 | "2 HonOuko 303 331 \n",
1992 | "3 split2tee 523 521 \n",
1993 | "4 Wellington_ILO 819 294 \n",
1994 | "\n",
1995 | " description tweets \n",
1996 | "0 Environmentalist🍃🌿🍀\\nDancer,less have fun 😅\\nB... 303 \n",
1997 | "1 tu81 ee26, music addiction, namneung × noey × ... 137550 \n",
1998 | "2 change is inevitable 522 \n",
1999 | "3 Humanitarian at heart, educator and an ardent ... 4217 \n",
2000 | "4 This is the Official Twitter Account for the I... 2308 "
2001 | ]
2002 | },
2003 | "execution_count": 129,
2004 | "metadata": {},
2005 | "output_type": "execute_result"
2006 | }
2007 | ],
2008 | "source": [
2009 | "# Author profile stats for up to 2000 tweets matching the keyword 'Kenya'.\n",
2010 | "# `since_id` expects a numeric tweet ID, so the former since_id='2020-03-01' was not a date\n",
2011 | "# filter; the lower date bound is now given with the `since:` operator in the query.\n",
2012 | "tweets = tweepy.Cursor(\n",
2013 | "    auth_api.search,\n",
2014 | "    q='Kenya since:2020-03-01',\n",
2015 | ").items(2000)\n",
2016 | "\n",
2017 | "# One entry per returned tweet, so an account that tweeted several times repeats.\n",
2018 | "authors = [status.user for status in tweets]\n",
2019 | "users = [author.screen_name for author in authors]\n",
2020 | "followers = [author.followers_count for author in authors]\n",
2021 | "friends = [author.friends_count for author in authors]\n",
2022 | "description = [author.description for author in authors]\n",
2023 | "tweet_count = [author.statuses_count for author in authors]\n",
2024 | "\n",
2025 | "data = {\n",
2026 | "    'screen_name': users,\n",
2027 | "    'followers': followers,\n",
2028 | "    'friends': friends,\n",
2029 | "    'description': description,\n",
2030 | "    'tweets': tweet_count,\n",
2031 | "}\n",
2032 | "df_8 = pd.DataFrame(data)\n",
2033 | "df_8.to_csv('df_8 users.csv')\n",
2034 | "df_8.head()"
2035 | ]
2036 | },
2037 | {
2038 | "cell_type": "code",
2039 | "execution_count": 134,
2040 | "metadata": {},
2041 | "outputs": [
2042 | {
2043 | "data": {
2044 | "text/html": [
2045 | "\n",
2046 | "\n",
2059 | "
\n",
2060 | " \n",
2061 | " \n",
2062 | " | \n",
2063 | " screen_name | \n",
2064 | " followers | \n",
2065 | " friends | \n",
2066 | " description | \n",
2067 | " tweets | \n",
2068 | "
\n",
2069 | " \n",
2070 | " \n",
2071 | " \n",
2072 | " 0 | \n",
2073 | " Am0n_0ff | \n",
2074 | " 163 | \n",
2075 | " 149 | \n",
2076 | " Africa4Africa//\\n\\nAfrica should be equal trad... | \n",
2077 | " 6104 | \n",
2078 | "
\n",
2079 | " \n",
2080 | " 1 | \n",
2081 | " ItsArnoldOduor | \n",
2082 | " 3604 | \n",
2083 | " 3834 | \n",
2084 | " Start small & stay consistent..\\n\\n+254🇰🇪 | \n",
2085 | " 5480 | \n",
2086 | "
\n",
2087 | " \n",
2088 | " 2 | \n",
2089 | " voyegram | \n",
2090 | " 78 | \n",
2091 | " 243 | \n",
2092 | " Travels and Tours Company\\n\\nVOVEGRAM TOURS & ... | \n",
2093 | " 27 | \n",
2094 | "
\n",
2095 | " \n",
2096 | " 3 | \n",
2097 | " Scanoma | \n",
2098 | " 595 | \n",
2099 | " 392 | \n",
2100 | " 日々是ぜ〜ぜ〜 | \n",
2101 | " 41943 | \n",
2102 | "
\n",
2103 | " \n",
2104 | " 4 | \n",
2105 | " RoselyneNyabon4 | \n",
2106 | " 35 | \n",
2107 | " 58 | \n",
2108 | " | \n",
2109 | " 876 | \n",
2110 | "
\n",
2111 | " \n",
2112 | "
\n",
2113 | "
"
2114 | ],
2115 | "text/plain": [
2116 | " screen_name followers friends \\\n",
2117 | "0 Am0n_0ff 163 149 \n",
2118 | "1 ItsArnoldOduor 3604 3834 \n",
2119 | "2 voyegram 78 243 \n",
2120 | "3 Scanoma 595 392 \n",
2121 | "4 RoselyneNyabon4 35 58 \n",
2122 | "\n",
2123 | " description tweets \n",
2124 | "0 Africa4Africa//\\n\\nAfrica should be equal trad... 6104 \n",
2125 | "1 Start small & stay consistent..\\n\\n+254🇰🇪 5480 \n",
2126 | "2 Travels and Tours Company\\n\\nVOVEGRAM TOURS & ... 27 \n",
2127 | "3 日々是ぜ〜ぜ〜 41943 \n",
2128 | "4 876 "
2129 | ]
2130 | },
2131 | "execution_count": 134,
2132 | "metadata": {},
2133 | "output_type": "execute_result"
2134 | }
2135 | ],
2136 | "source": [
2137 | "# Search tweets mentioning Kenya and record the profile statistics of each tweet's author.\n",
2138 | "# `since_id` expects a tweet ID rather than a date, so the lower date bound is given with the\n",
2139 | "# `since:` search operator inside the query.\n",
2140 | "# NOTE(review): the standard search endpoint reportedly reaches back only about a week - confirm\n",
2141 | "# that this date is still inside that window when the cell is re-run.\n",
2142 | "tweets = tweepy.Cursor(auth_api.search, q = 'Kenya since:2020-02-01').items(2000)\n",
2143 | "\n",
2144 | "# One row per returned tweet; an author who tweeted twice appears twice.\n",
2145 | "records = []\n",
2146 | "for status in tweets:\n",
2147 | "    user = status.user\n",
2148 | "    records.append({\n",
2149 | "        'screen_name': user.screen_name,\n",
2150 | "        'followers': user.followers_count,\n",
2151 | "        'friends': user.friends_count,\n",
2152 | "        'description': user.description,\n",
2153 | "        'tweets': user.statuses_count,\n",
2154 | "    })\n",
2155 | "\n",
2156 | "# Name the columns explicitly so an empty search result still has the expected schema.\n",
2157 | "columns = ['screen_name', 'followers', 'friends', 'description', 'tweets']\n",
2158 | "df_9 = pd.DataFrame(records, columns = columns)\n",
2159 | "\n",
2160 | "# Save the batch to the working directory, then preview the first rows.\n",
2161 | "df_9.to_csv('df_9 users.csv')\n",
2162 | "df_9.head()"
2163 | ]
2164 | },
2165 | {
2166 | "cell_type": "code",
2167 | "execution_count": 132,
2168 | "metadata": {},
2169 | "outputs": [
2170 | {
2171 | "data": {
2172 | "text/html": [
2173 | "\n",
2174 | "\n",
2187 | "
\n",
2188 | " \n",
2189 | " \n",
2190 | " | \n",
2191 | " screen_name | \n",
2192 | " followers | \n",
2193 | " friends | \n",
2194 | " description | \n",
2195 | " tweets | \n",
2196 | "
\n",
2197 | " \n",
2198 | " \n",
2199 | " \n",
2200 | " 0 | \n",
2201 | " kenya_pu | \n",
2202 | " 371 | \n",
2203 | " 232 | \n",
2204 | " 現存する最古のシンメの深海に溺れる青担トニ担箱推し。妄想絵を描く酔払い。成人式は遥か昔 妄想... | \n",
2205 | " 58113 | \n",
2206 | "
\n",
2207 | " \n",
2208 | " 1 | \n",
2209 | " nanga_tp | \n",
2210 | " 18993 | \n",
2211 | " 2040 | \n",
2212 | " Luyotilo Holdings || Stay patient and trust yo... | \n",
2213 | " 37994 | \n",
2214 | "
\n",
2215 | " \n",
2216 | " 2 | \n",
2217 | " KenInvest | \n",
2218 | " 20513 | \n",
2219 | " 384 | \n",
2220 | " Kenya Investment Authority (KenInvest) is a Go... | \n",
2221 | " 7503 | \n",
2222 | "
\n",
2223 | " \n",
2224 | " 3 | \n",
2225 | " Njeriwaridi | \n",
2226 | " 334 | \n",
2227 | " 143 | \n",
2228 | " Investment. Impact. Travel.Environment\\n\\nWorr... | \n",
2229 | " 14381 | \n",
2230 | "
\n",
2231 | " \n",
2232 | " 4 | \n",
2233 | " ilegacy15 | \n",
2234 | " 54 | \n",
2235 | " 239 | \n",
2236 | " My truth might be a lie. | \n",
2237 | " 1199 | \n",
2238 | "
\n",
2239 | " \n",
2240 | "
\n",
2241 | "
"
2242 | ],
2243 | "text/plain": [
2244 | " screen_name followers friends \\\n",
2245 | "0 kenya_pu 371 232 \n",
2246 | "1 nanga_tp 18993 2040 \n",
2247 | "2 KenInvest 20513 384 \n",
2248 | "3 Njeriwaridi 334 143 \n",
2249 | "4 ilegacy15 54 239 \n",
2250 | "\n",
2251 | " description tweets \n",
2252 | "0 現存する最古のシンメの深海に溺れる青担トニ担箱推し。妄想絵を描く酔払い。成人式は遥か昔 妄想... 58113 \n",
2253 | "1 Luyotilo Holdings || Stay patient and trust yo... 37994 \n",
2254 | "2 Kenya Investment Authority (KenInvest) is a Go... 7503 \n",
2255 | "3 Investment. Impact. Travel.Environment\\n\\nWorr... 14381 \n",
2256 | "4 My truth might be a lie. 1199 "
2257 | ]
2258 | },
2259 | "execution_count": 132,
2260 | "metadata": {},
2261 | "output_type": "execute_result"
2262 | }
2263 | ],
2264 | "source": [
2265 | "# Search tweets mentioning Kenya and record the profile statistics of each tweet's author.\n",
2266 | "# `since_id` expects a tweet ID rather than a date, so the lower date bound is given with the\n",
2267 | "# `since:` search operator inside the query.\n",
2268 | "# NOTE(review): the standard search endpoint reportedly reaches back only about a week - confirm\n",
2269 | "# that this date is still inside that window when the cell is re-run.\n",
2270 | "tweets = tweepy.Cursor(auth_api.search, q = 'Kenya since:2020-01-01').items(2000)\n",
2271 | "\n",
2272 | "# One row per returned tweet; an author who tweeted twice appears twice.\n",
2273 | "records = []\n",
2274 | "for status in tweets:\n",
2275 | "    user = status.user\n",
2276 | "    records.append({\n",
2277 | "        'screen_name': user.screen_name,\n",
2278 | "        'followers': user.followers_count,\n",
2279 | "        'friends': user.friends_count,\n",
2280 | "        'description': user.description,\n",
2281 | "        'tweets': user.statuses_count,\n",
2282 | "    })\n",
2283 | "\n",
2284 | "# Name the columns explicitly so an empty search result still has the expected schema.\n",
2285 | "columns = ['screen_name', 'followers', 'friends', 'description', 'tweets']\n",
2286 | "df_0 = pd.DataFrame(records, columns = columns)\n",
2287 | "\n",
2288 | "# Save the batch to the working directory, then preview the first rows.\n",
2289 | "df_0.to_csv('df_0 users.csv')\n",
2290 | "df_0.head()"
2291 | ]
2292 | },
2293 | {
2294 | "cell_type": "markdown",
2295 | "metadata": {},
2296 | "source": [
2297 | "#### Remaining dataframes"
2298 | ]
2299 | },
2300 | {
2301 | "cell_type": "code",
2302 | "execution_count": 135,
2303 | "metadata": {},
2304 | "outputs": [
2305 | {
2306 | "data": {
2307 | "text/plain": [
2308 | "(14000, 5)"
2309 | ]
2310 | },
2311 | "execution_count": 135,
2312 | "metadata": {},
2313 | "output_type": "execute_result"
2314 | }
2315 | ],
2316 | "source": [
2317 | "df_k = pd.concat([df_4, df_5, df_6, df_7, df_8, df_9, df_0], axis = 0)  # stack the seven search batches row-wise\n",
2318 | "df_k.shape"
2319 | ]
2320 | },
2321 | {
2322 | "cell_type": "code",
2323 | "execution_count": 137,
2324 | "metadata": {},
2325 | "outputs": [
2326 | {
2327 | "data": {
2328 | "text/plain": [
2329 | "(116, 6)"
2330 | ]
2331 | },
2332 | "execution_count": 137,
2333 | "metadata": {},
2334 | "output_type": "execute_result"
2335 | }
2336 | ],
2337 | "source": [
2338 | "df_k['popularity'] = df_k['followers'].sub(df_k['friends'])  # followers minus accounts followed\n",
2339 | "is_popular = df_k['popularity'].ge(50000)  # rows at or above the 50000 cutoff\n",
2340 | "new_df_2 = df_k.loc[is_popular].drop_duplicates(subset =\"screen_name\")  # first row per handle\n",
2341 | "new_df_2.shape"
2342 | ]
2343 | },
2344 | {
2345 | "cell_type": "code",
2346 | "execution_count": 138,
2347 | "metadata": {},
2348 | "outputs": [
2349 | {
2350 | "data": {
2351 | "text/html": [
2352 | "\n",
2353 | "\n",
2366 | "
\n",
2367 | " \n",
2368 | " \n",
2369 | " | \n",
2370 | " screen_name | \n",
2371 | " followers | \n",
2372 | " friends | \n",
2373 | " description | \n",
2374 | " tweets | \n",
2375 | " popularity | \n",
2376 | "
\n",
2377 | " \n",
2378 | " \n",
2379 | " \n",
2380 | " 43 | \n",
2381 | " RoadAlertsKE | \n",
2382 | " 162406 | \n",
2383 | " 490 | \n",
2384 | " Official Twitter Account for Road Alerts KE. S... | \n",
2385 | " 123136 | \n",
2386 | " 161916 | \n",
2387 | "
\n",
2388 | " \n",
2389 | " 79 | \n",
2390 | " coldtusker | \n",
2391 | " 147140 | \n",
2392 | " 3716 | \n",
2393 | " Simply me... | \n",
2394 | " 361943 | \n",
2395 | " 143424 | \n",
2396 | "
\n",
2397 | " \n",
2398 | " 91 | \n",
2399 | " pressecitron | \n",
2400 | " 649614 | \n",
2401 | " 2160 | \n",
2402 | " Connectez-vous à l'innovation ! #Pressecitron1... | \n",
2403 | " 64786 | \n",
2404 | " 647454 | \n",
2405 | "
\n",
2406 | " \n",
2407 | " 105 | \n",
2408 | " ntvkenya | \n",
2409 | " 2551233 | \n",
2410 | " 262 | \n",
2411 | " LIVE STREAM: https://t.co/BrEeOBOxu6 _________... | \n",
2412 | " 491479 | \n",
2413 | " 2550971 | \n",
2414 | "
\n",
2415 | " \n",
2416 | " 110 | \n",
2417 | " CofekRebranded | \n",
2418 | " 139997 | \n",
2419 | " 19373 | \n",
2420 | " COFEK is Kenya and Africa's Premier Consumer P... | \n",
2421 | " 44135 | \n",
2422 | " 120624 | \n",
2423 | "
\n",
2424 | " \n",
2425 | " ... | \n",
2426 | " ... | \n",
2427 | " ... | \n",
2428 | " ... | \n",
2429 | " ... | \n",
2430 | " ... | \n",
2431 | " ... | \n",
2432 | "
\n",
2433 | " \n",
2434 | " 957 | \n",
2435 | " TheKenyanPost | \n",
2436 | " 51940 | \n",
2437 | " 418 | \n",
2438 | " News, Politics, Entertainment, Gossip, Feature... | \n",
2439 | " 326473 | \n",
2440 | " 51522 | \n",
2441 | "
\n",
2442 | " \n",
2443 | " 1146 | \n",
2444 | " KBCChannel1 | \n",
2445 | " 310081 | \n",
2446 | " 1750 | \n",
2447 | " Kenya's National Broadcaster, bringing you sto... | \n",
2448 | " 203064 | \n",
2449 | " 308331 | \n",
2450 | "
\n",
2451 | " \n",
2452 | " 1151 | \n",
2453 | " swahilitimes | \n",
2454 | " 581701 | \n",
2455 | " 0 | \n",
2456 | " Habari | Elimu | Burudani | swahilitimes@gmail... | \n",
2457 | " 53538 | \n",
2458 | " 581701 | \n",
2459 | "
\n",
2460 | " \n",
2461 | " 1189 | \n",
2462 | " WMutunga | \n",
2463 | " 1007081 | \n",
2464 | " 847 | \n",
2465 | " INSPIRING THE YOUTH TO CREATE BETTER TOMORROWS. | \n",
2466 | " 34382 | \n",
2467 | " 1006234 | \n",
2468 | "
\n",
2469 | " \n",
2470 | " 1217 | \n",
2471 | " ReutersAfrica | \n",
2472 | " 738848 | \n",
2473 | " 341 | \n",
2474 | " Reuters Africa provides a feed of our latest n... | \n",
2475 | " 39437 | \n",
2476 | " 738507 | \n",
2477 | "
\n",
2478 | " \n",
2479 | "
\n",
2480 | "
116 rows × 6 columns
\n",
2481 | "
"
2482 | ],
2483 | "text/plain": [
2484 | " screen_name followers friends \\\n",
2485 | "43 RoadAlertsKE 162406 490 \n",
2486 | "79 coldtusker 147140 3716 \n",
2487 | "91 pressecitron 649614 2160 \n",
2488 | "105 ntvkenya 2551233 262 \n",
2489 | "110 CofekRebranded 139997 19373 \n",
2490 | "... ... ... ... \n",
2491 | "957 TheKenyanPost 51940 418 \n",
2492 | "1146 KBCChannel1 310081 1750 \n",
2493 | "1151 swahilitimes 581701 0 \n",
2494 | "1189 WMutunga 1007081 847 \n",
2495 | "1217 ReutersAfrica 738848 341 \n",
2496 | "\n",
2497 | " description tweets popularity \n",
2498 | "43 Official Twitter Account for Road Alerts KE. S... 123136 161916 \n",
2499 | "79 Simply me... 361943 143424 \n",
2500 | "91 Connectez-vous à l'innovation ! #Pressecitron1... 64786 647454 \n",
2501 | "105 LIVE STREAM: https://t.co/BrEeOBOxu6 _________... 491479 2550971 \n",
2502 | "110 COFEK is Kenya and Africa's Premier Consumer P... 44135 120624 \n",
2503 | "... ... ... ... \n",
2504 | "957 News, Politics, Entertainment, Gossip, Feature... 326473 51522 \n",
2505 | "1146 Kenya's National Broadcaster, bringing you sto... 203064 308331 \n",
2506 | "1151 Habari | Elimu | Burudani | swahilitimes@gmail... 53538 581701 \n",
2507 | "1189 INSPIRING THE YOUTH TO CREATE BETTER TOMORROWS. 34382 1006234 \n",
2508 | "1217 Reuters Africa provides a feed of our latest n... 39437 738507 \n",
2509 | "\n",
2510 | "[116 rows x 6 columns]"
2511 | ]
2512 | },
2513 | "execution_count": 138,
2514 | "metadata": {},
2515 | "output_type": "execute_result"
2516 | }
2517 | ],
2518 | "source": [
2519 | "new_df_2"
2520 | ]
2521 | },
2522 | {
2523 | "cell_type": "markdown",
2524 | "metadata": {},
2525 | "source": [
2526 | "### Mega Df"
2527 | ]
2528 | },
2529 | {
2530 | "cell_type": "code",
2531 | "execution_count": 2,
2532 | "metadata": {},
2533 | "outputs": [],
2534 | "source": [
2535 | "# Combine both candidate tables, keep the first row per handle, and write the result to mega1.csv.\n",
2536 | "full_df = pd.concat([new_df, new_df_2]).drop_duplicates(subset =\"screen_name\")\n",
2537 | "full_df.to_csv('mega1.csv')"
2538 | ]
2539 | },
2540 | {
2541 | "cell_type": "markdown",
2542 | "metadata": {},
2543 | "source": [
2544 | "#### [Back to top](#Tweeps-scraping)"
2545 | ]
2546 | },
2547 | {
2548 | "cell_type": "code",
2549 | "execution_count": null,
2550 | "metadata": {},
2551 | "outputs": [],
2552 | "source": []
2553 | }
2554 | ],
2555 | "metadata": {
2556 | "kernelspec": {
2557 | "display_name": "Python 3",
2558 | "language": "python",
2559 | "name": "python3"
2560 | },
2561 | "language_info": {
2562 | "codemirror_mode": {
2563 | "name": "ipython",
2564 | "version": 3
2565 | },
2566 | "file_extension": ".py",
2567 | "mimetype": "text/x-python",
2568 | "name": "python",
2569 | "nbconvert_exporter": "python",
2570 | "pygments_lexer": "ipython3",
2571 | "version": "3.7.4"
2572 | }
2573 | },
2574 | "nbformat": 4,
2575 | "nbformat_minor": 4
2576 | }
2577 |
--------------------------------------------------------------------------------
/tweetsminig.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tweets Collection.\n",
8 | "Downloads up to 500 recent tweets for each handle in the [Super Bowl DataFrame](super_bowl.ipynb) and saves them to one CSV file per handle.\n"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 33,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import tweepy\n",
18 | "import csv\n",
19 | "import os\n",
20 | "import pandas as pd\n",
21 | "\n",
22 | "# Twitter API credentials are read from environment variables; os.environ.get gives None for any that is unset.\n",
23 | "credential_names = ('consumer_key', 'consumer_secret', 'access_token', 'access_token_secret')\n",
24 | "consumer_key, consumer_secret, access_token, access_token_secret = map(os.environ.get, credential_names)\n",
25 | "\n",
26 | "# Build the authenticated client that get_tweets uses.\n",
27 | "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
28 | "auth.set_access_token(access_token, access_token_secret)\n",
29 | "api = tweepy.API(auth)\n",
30 | "\n",
31 | "def get_tweets(screen_name, limit = 500):\n",
32 | "    \"\"\"Download a user's most recent tweets and write them to '<screen_name>_tweets.csv'.\n",
33 | "\n",
34 | "    screen_name: Twitter handle whose timeline is requested.\n",
35 | "    limit: maximum number of tweets to request; defaults to 500.\n",
36 | "    Returns None. Errors from the API call or from writing the file propagate to the caller.\n",
37 | "    \"\"\"\n",
38 | "    timeline = tweepy.Cursor(api.user_timeline, screen_name = screen_name).items(limit)\n",
39 | "    outtweets = [[tweet.id_str, tweet.created_at, tweet.text, tweet.favorite_count, tweet.retweet_count, tweet.entities, tweet.lang] for tweet in timeline]\n",
40 | "\n",
41 | "    # newline='' is what the csv module asks for (otherwise rows can gain extra line breaks on Windows);\n",
42 | "    # utf-8 keeps emoji and non-Latin tweet text writable whatever the platform default encoding is.\n",
43 | "    with open(f'{screen_name}_tweets.csv', 'w', newline = '', encoding = 'utf-8') as f:\n",
44 | "        writer = csv.writer(f)\n",
45 | "        writer.writerow([\"id\",\"created_at\",\"text\", \"favorites\", \"retweets\", \"entities\", \"language\"])\n",
46 | "        writer.writerows(outtweets)\n",
47 | "\n",
48 | "\n"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 46,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# Load the Super Bowl handles CSV and convert its 'handles' column to a list.\n",
58 | "# The default path is machine-specific; set the SUPERBOWL_CSV environment variable to override it.\n",
59 | "default_csv = '/home/ada/10academy/training /week2/monday/kernels/csvs/superbowl.csv'\n",
60 | "handles = pd.read_csv(os.environ.get('SUPERBOWL_CSV', default_csv))\n",
61 | "names = handles['handles'].to_list()\n",
62 | "\n",
63 | "# Divide the handles into consecutive groups of n.\n",
64 | "n = 100\n",
65 | "x = [names[i:i + n] for i in range(0, len(names), n)]\n",
66 | "\n",
67 | "# The download cells below use exactly batch_1 ... batch_11.\n",
68 | "# Missing groups become empty lists, because indexing x directly raises IndexError when there\n",
69 | "# are fewer than 11 groups. Surplus groups are reported, because no cell would download them.\n",
70 | "batch_total = 11\n",
71 | "if len(x) > batch_total:\n",
72 | "    print(f'Warning: {len(x) - batch_total} group(s) beyond batch_{batch_total} are not assigned to a cell')\n",
73 | "\n",
74 | "padded = (x + [[] for _ in range(batch_total)])[:batch_total]\n",
75 | "(batch_1, batch_2, batch_3, batch_4, batch_5, batch_6,\n",
76 | " batch_7, batch_8, batch_9, batch_10, batch_11) = padded"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 47,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "#1 - download the tweets of every handle in batch_1\n",
86 | "if __name__ == '__main__':\n",
87 | "    for name in batch_1:\n",
88 | "        try:\n",
89 | "            get_tweets(name)\n",
90 | "        except Exception as err:\n",
91 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
92 | "            print(f'Skipped {name}: {err!r}')\n"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 48,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "#2 - download the tweets of every handle in batch_2\n",
102 | "if __name__ == '__main__':\n",
103 | "    for name in batch_2:\n",
104 | "        try:\n",
105 | "            get_tweets(name)\n",
106 | "        except Exception as err:\n",
107 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
108 | "            print(f'Skipped {name}: {err!r}')\n"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 49,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "#3 - download the tweets of every handle in batch_3\n",
118 | "if __name__ == '__main__':\n",
119 | "    for name in batch_3:\n",
120 | "        try:\n",
121 | "            get_tweets(name)\n",
122 | "        except Exception as err:\n",
123 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
124 | "            print(f'Skipped {name}: {err!r}')\n"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 50,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "#4 - download the tweets of every handle in batch_4\n",
134 | "if __name__ == '__main__':\n",
135 | "    for name in batch_4:\n",
136 | "        try:\n",
137 | "            get_tweets(name)\n",
138 | "        except Exception as err:\n",
139 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
140 | "            print(f'Skipped {name}: {err!r}')"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 51,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "#5 - download the tweets of every handle in batch_5\n",
150 | "if __name__ == '__main__':\n",
151 | "    for name in batch_5:\n",
152 | "        try:\n",
153 | "            get_tweets(name)\n",
154 | "        except Exception as err:\n",
155 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
156 | "            print(f'Skipped {name}: {err!r}')"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 53,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "#6 - download the tweets of every handle in batch_6\n",
166 | "if __name__ == '__main__':\n",
167 | "    for name in batch_6:\n",
168 | "        try:\n",
169 | "            get_tweets(name)\n",
170 | "        except Exception as err:\n",
171 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
172 | "            print(f'Skipped {name}: {err!r}')"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "#7 - download the tweets of every handle in batch_7\n",
182 | "if __name__ == '__main__':\n",
183 | "    for name in batch_7:\n",
184 | "        try:\n",
185 | "            get_tweets(name)\n",
186 | "        except Exception as err:\n",
187 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
188 | "            print(f'Skipped {name}: {err!r}')"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "#8 - download the tweets of every handle in batch_8\n",
198 | "if __name__ == '__main__':\n",
199 | "    for name in batch_8:\n",
200 | "        try:\n",
201 | "            get_tweets(name)\n",
202 | "        except Exception as err:\n",
203 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
204 | "            print(f'Skipped {name}: {err!r}')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "#9 - download the tweets of every handle in batch_9\n",
214 | "if __name__ == '__main__':\n",
215 | "    for name in batch_9:\n",
216 | "        try:\n",
217 | "            get_tweets(name)\n",
218 | "        except Exception as err:\n",
219 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
220 | "            print(f'Skipped {name}: {err!r}')"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "#10 - download the tweets of every handle in batch_10\n",
230 | "if __name__ == '__main__':\n",
231 | "    for name in batch_10:\n",
232 | "        try:\n",
233 | "            get_tweets(name)\n",
234 | "        except Exception as err:\n",
235 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
236 | "            print(f'Skipped {name}: {err!r}')"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "#11 - download the tweets of every handle in batch_11\n",
246 | "if __name__ == '__main__':\n",
247 | "    for name in batch_11:\n",
248 | "        try:\n",
249 | "            get_tweets(name)\n",
250 | "        except Exception as err:\n",
251 | "            # Best effort: report which handle failed and why, then continue with the rest.\n",
252 | "            print(f'Skipped {name}: {err!r}')"
253 | ]
254 | }
255 | ],
256 | "metadata": {
257 | "kernelspec": {
258 | "display_name": "Python 3",
259 | "language": "python",
260 | "name": "python3"
261 | },
262 | "language_info": {
263 | "codemirror_mode": {
264 | "name": "ipython",
265 | "version": 3
266 | },
267 | "file_extension": ".py",
268 | "mimetype": "text/x-python",
269 | "name": "python",
270 | "nbconvert_exporter": "python",
271 | "pygments_lexer": "ipython3",
272 | "version": "3.7.4"
273 | }
274 | },
275 | "nbformat": 4,
276 | "nbformat_minor": 4
277 | }
278 |
--------------------------------------------------------------------------------