├── README.md
├── Trump_NYTimes_Analysis.ipynb
├── Trump_NYTimes_Presentation.pdf
└── Trump_NYtimes_Collect_Data.ipynb

/README.md:
--------------------------------------------------------------------------------
1 | # New York Times' Coverage of Trump
2 | 
3 | My project looks at the New York Times' coverage of Trump from when he announced his candidacy to August 2016 (the date of the project). I gathered the approximately 2,200 news articles from the Times that focused on Donald Trump and used topic modeling to map how the topics of the articles have evolved over the past year. I also wrote a [blog post](https://hookedondata.org/topic-modeling-the-new-york-times-and-trump/) documenting my analysis process, topic modeling, and findings.
4 | 
--------------------------------------------------------------------------------
/Trump_NYTimes_Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robinsones/NYTimes-and-Trump/3dd0581292b78e064f050f3cc5d256dd56576e4e/Trump_NYTimes_Presentation.pdf
--------------------------------------------------------------------------------
/Trump_NYtimes_Collect_Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Functions for getting article information"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "We want all articles from the beginning of June 2015 until now, gathered in two-week periods."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "all_dates = []  # (begin_date, end_date) pairs in YYYYMMDD format\n",
26 | "for month in [\"06\", \"07\", \"08\", \"09\", \"10\", \"11\", \"12\"]:\n",
27 | "    for date_pair in [(\"01\", \"15\"), (\"16\", \"31\")]:\n",
28 | "        date_range = (\"2015\" + month + date_pair[0], \"2015\" + month + date_pair[1])\n",
29 | "        all_dates.append(date_range)"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "for month in [\"01\", \"02\", \"03\", \"04\", \"05\", \"06\", \"07\", \"08\"]:\n",
41 | "    for date_pair in [(\"01\", \"15\"), (\"16\", \"31\")]:\n",
42 | "        date_range = (\"2016\" + month + date_pair[0], \"2016\" + month + date_pair[1])\n",
43 | "        all_dates.append(date_range)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "[('20150601', '20150615'),\n",
57 | " ('20150616', '20150631'),\n",
58 | " ('20150701', '20150715'),\n",
59 | " ('20150716', '20150731'),\n",
60 | " ('20150801', '20150815'),\n",
61 | " ('20150816', '20150831'),\n",
62 | " ('20150901', '20150915'),\n",
63 | " ('20150916', '20150931'),\n",
64 | " ('20151001', '20151015'),\n",
65 | " ('20151016', '20151031'),\n",
66 | " ('20151101', '20151115'),\n",
67 | " ('20151116', '20151131'),\n",
68 | " ('20151201', '20151215'),\n",
69 | " ('20151216', '20151231'),\n",
70 | " ('20160101', '20160115'),\n",
71 | " ('20160116', '20160131'),\n",
72 | " ('20160201', '20160215'),\n",
73 | " ('20160216', '20160231'),\n",
74 | " ('20160301', '20160315'),\n",
75 | " ('20160316', '20160331'),\n",
76 | " ('20160401', '20160415'),\n",
77 | " ('20160416', '20160431'),\n",
78 | " ('20160501', '20160515'),\n",
79 | " ('20160516', '20160531'),\n",
80 | " ('20160601', '20160615'),\n",
81 | " ('20160616', '20160631'),\n",
82 | " ('20160701', '20160715'),\n",
83 | " ('20160716', '20160731'),\n",
84 | " ('20160801', '20160815'),\n",
85 | " ('20160816', '20160831')]"
86 | ]
87 | },
88 | "execution_count": 3,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "all_dates"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "metadata": {
101 | "collapsed": true
102 | },
103 | "outputs": [],
104 | "source": [
105 | "import requests\nimport json\nimport pandas as pd\n\ndef get_articles(search_term, dates):\n",
106 | "    url = \"https://api.nytimes.com/svc/search/v2/articlesearch.json\"\n",
107 | "    api_infos = []\n",
108 | "    # get all urls\n",
109 | "    for start_date, end_date in dates: \n",
110 | "        for page in range(101):\n",
111 | "            queries = {\n",
112 | "                'api-key': XXX,\n",
113 | "                'q': search_term,\n",
114 | "                'begin_date': start_date, \n",
115 | "                'end_date': end_date, \n",
116 | "                'page' : page\n",
117 | "            }\n",
118 | "            req_t = requests.get(url, params = queries)\n",
119 | "            adict = json.loads(req_t.text)\n",
120 | "            doclist = adict['response']['docs']\n",
121 | "            for j in range(len(doclist)):\n",
122 | "                api_info = ([doclist[j][\"web_url\"], doclist[j][\"headline\"],\n",
123 | "                             doclist[j][\"section_name\"],\n",
124 | "                             doclist[j][\"word_count\"], doclist[j][\"type_of_material\"], \n",
125 | "                             doclist[j][\"pub_date\"]])\n",
126 | "                api_infos.append(api_info)\n",
127 | "    # transform data into dataset\n",
128 | "    api_dataset = pd.DataFrame(api_infos, columns=['Url','Headline', \n",
129 | "                                                   'Section_name', 'Word_Count', \n",
130 | "                                                   'Type', 'Pub_Date'])\n",
131 | "    return api_dataset"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 5,
137 | "metadata": {
138 | "collapsed": true
139 | },
140 | "outputs": [],
141 | "source": [
142 | "from newspaper import Article\n\ndef get_text(url_list):\n",
143 | "    articles = []\n",
144 | "    for address in url_list: \n",
145 | "        try:\n",
146 | "            article = Article(address)\n",
147 | "            article.download()\n",
148 | "            article.parse()\n",
149 | "            article.nlp()\n",
150 | "            article_info = ([article.publish_date, article.authors,\n",
151 | "                             article.summary, article.keywords, article.text,\n",
152 | "                             article.url])\n",
153 | "        except:\n",
154 | "            continue  # skip articles that fail to download or parse\n",
155 | "        \n",
156 | "        articles.append(article_info)\n",
157 | "    \n",
158 | "    newspaper_dataset = pd.DataFrame(articles, columns=['Date','Authors', 'Summary', 'Keywords', 'Text', \"Url\"])\n",
159 | "    newspaper_dataset[\"Text_Decode\"] = newspaper_dataset[\"Text\"].apply(lambda x: x.encode('utf-8'))\n",
160 | "    newspaper_dataset[\"Text_Clean\"] = newspaper_dataset[\"Text_Decode\"].apply(lambda x: x.replace(\"\\n\", \" \"))\n",
161 | "    newspaper_dataset = newspaper_dataset.drop('Text_Decode', 1)\n",
162 | "    return newspaper_dataset"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "## Gather Hillary and Trump Articles"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {
176 | "collapsed": true
177 | },
178 | "outputs": [],
179 | "source": [
180 | "HC_articles = get_articles(\"Hillary Clinton\", all_dates)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {
187 | "collapsed": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "Hillary_text = get_text(HC_articles[\"Url\"])"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "Trump_articles = get_articles(\"Donald Trump\", all_dates)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "Total_Trump = get_text(Trump_articles[\"Url\"])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Clean, Merge, and Pickle Datasets" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "#### Combine text and api datasets, get rid of duplicates and bad links, and add a column of Text without the \"\\n\"" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "Trump_merged = Trump_articles.merge(Total_Trump, on = \"Url\", how = \"inner\")\n", 239 | "good_links = Trump_merged[Trump_merged[\"Text\"] != \"NYTimes.com no longer supports Internet Explorer 9 or earlier. Please upgrade your browser.\"]\n", 240 | "Trump_no_duplicates = good_links.drop_duplicates(\"Text\")\n", 241 | "Trump_no_duplicates[\"Text_Decode\"] = Trump_no_duplicates[\"Text\"].apply(lambda x: x.encode('utf-8'))\n", 242 | "Trump_no_duplicates[\"Text_Clean\"] = Trump_no_duplicates[\"Text_Decode\"].apply(lambda x: x.replace(\"\\n\", \" \"))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "Hillary_merged = all_newspaper_articles.merge(HC_articles, on = \"Url\", how = \"inner\")\n", 254 | "Hillary_no_duplicates = Hillary_merged.drop_duplicates(\"Text\")\n", 255 | "Hillary_no_duplicates = Hillary_no_duplicates[Hillary_no_duplicates[\"Text\"] != \"NYTimes.com no longer supports Internet Explorer 9 or earlier. 
256 | "final_Hillary = Hillary_no_duplicates\n",
257 | "final_Hillary[\"Text_Decode\"] = final_Hillary[\"Text\"].apply(lambda x: x.encode('utf-8'))\n",
258 | "final_Hillary[\"Text_Clean\"] = final_Hillary[\"Text_Decode\"].apply(lambda x: x.replace(\"\\n\", \" \"))"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "collapsed": true
266 | },
267 | "outputs": [],
268 | "source": [
269 | "final_Hillary[\"Trump_subject\"] = 0  # 0 = Clinton article\n",
270 | "Trump_no_duplicates[\"Trump_subject\"] = 1  # 1 = Trump article\n",
271 | "Trump_Clinton_dataset = final_Hillary.append(Trump_no_duplicates)"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {
278 | "collapsed": true
279 | },
280 | "outputs": [],
281 | "source": [
282 | "import pickle\n\nwith open('Trump_Clinton_dataset.pkl', 'w') as picklefile:\n",
283 | "    pickle.dump(Trump_Clinton_dataset, picklefile)"
284 | ]
285 | }
286 | ],
287 | "metadata": {
288 | "kernelspec": {
289 | "display_name": "Python 2",
290 | "language": "python",
291 | "name": "python2"
292 | },
293 | "language_info": {
294 | "codemirror_mode": {
295 | "name": "ipython",
296 | "version": 2
297 | },
298 | "file_extension": ".py",
299 | "mimetype": "text/x-python",
300 | "name": "python",
301 | "nbconvert_exporter": "python",
302 | "pygments_lexer": "ipython2",
303 | "version": "2.7.11"
304 | }
305 | },
306 | "nbformat": 4,
307 | "nbformat_minor": 0
308 | }
309 | 
--------------------------------------------------------------------------------
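
For readers who want to check their API key before running the whole notebook, here is a minimal, self-contained sketch of the single-page request that `get_articles()` in the notebook issues. It is not part of the repository: `"YOUR_NYT_API_KEY"` is a placeholder for the key the notebook redacts as `XXX`, and the query values are just one of the half-month windows from `all_dates`.

```python
# Minimal sketch of one NYT Article Search request (placeholder API key).
import json
import requests

url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
params = {
    "api-key": "YOUR_NYT_API_KEY",  # placeholder; the notebook redacts this as XXX
    "q": "Donald Trump",
    "begin_date": "20150601",
    "end_date": "20150615",
    "page": 0,
}

response = requests.get(url, params=params)
docs = json.loads(response.text)["response"]["docs"]

# Each doc carries the fields the notebook keeps: web_url, headline,
# section_name, word_count, type_of_material, pub_date.
for doc in docs:
    print(doc["pub_date"] + "  " + doc["web_url"])
```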