├── README.md
├── Trump_NYTimes_Analysis.ipynb
├── Trump_NYTimes_Presentation.pdf
└── Trump_NYtimes_Collect_Data.ipynb

/README.md:
--------------------------------------------------------------------------------
1 | # New York Times' Coverage of Trump
2 | 
3 | My project looks at the New York Times' coverage of Trump from when he announced his candidacy to August 2016 (the date of the project). I gathered the approximately 2,200 news articles from the Times that focused on Donald Trump and used topic modeling to map how the topics of the articles have evolved over the past year. I also wrote a [blog post](https://hookedondata.org/topic-modeling-the-new-york-times-and-trump/) documenting my analysis process, topic modeling, and findings.
4 | 
--------------------------------------------------------------------------------
/Trump_NYTimes_Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robinsones/NYTimes-and-Trump/3dd0581292b78e064f050f3cc5d256dd56576e4e/Trump_NYTimes_Presentation.pdf
--------------------------------------------------------------------------------
/Trump_NYtimes_Collect_Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Functions for getting article information"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "We want all articles from the beginning of June 2015 until now, gathered in two-week periods."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "all_dates = []  # (begin_date, end_date) pairs in YYYYMMDD format\n",
26 | "for month in [\"06\", \"07\", \"08\", \"09\", \"10\", \"11\", \"12\"]:\n",
27 | "    for date_pair in [(\"01\", \"15\"), (\"16\", \"31\")]:\n",
28 | "        date_range = (\"2015\" + month + date_pair[0], \"2015\" + month + date_pair[1])\n",
29 | "        all_dates.append(date_range)"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "for month in [\"01\", \"02\", \"03\", \"04\", \"05\", \"06\", \"07\", \"08\"]:\n",
41 | "    for date_pair in [(\"01\", \"15\"), (\"16\", \"31\")]:\n",
42 | "        date_range = (\"2016\" + month + date_pair[0], \"2016\" + month + date_pair[1])\n",
43 | "        all_dates.append(date_range)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "[('20150601', '20150615'),\n",
57 | " ('20150616', '20150631'),\n",
58 | " ('20150701', '20150715'),\n",
59 | " ('20150716', '20150731'),\n",
60 | " ('20150801', '20150815'),\n",
61 | " ('20150816', '20150831'),\n",
62 | " ('20150901', '20150915'),\n",
63 | " ('20150916', '20150931'),\n",
64 | " ('20151001', '20151015'),\n",
65 | " ('20151016', '20151031'),\n",
66 | " ('20151101', '20151115'),\n",
67 | " ('20151116', '20151131'),\n",
68 | " ('20151201', '20151215'),\n",
69 | " ('20151216', '20151231'),\n",
70 | " ('20160101', '20160115'),\n",
71 | " ('20160116', '20160131'),\n",
72 | " ('20160201', '20160215'),\n",
73 | " ('20160216', '20160231'),\n",
74 | " ('20160301', '20160315'),\n",
75 | " ('20160316', '20160331'),\n",
76 | " ('20160401', '20160415'),\n",
77 | " ('20160416', '20160431'),\n",
78 | " ('20160501', '20160515'),\n",
79 | " ('20160516', '20160531'),\n",
80 | " ('20160601', '20160615'),\n",
81 | " ('20160616', '20160631'),\n",
82 | " ('20160701', '20160715'),\n",
83 | " ('20160716', '20160731'),\n",
84 | " ('20160801', '20160815'),\n",
85 | " ('20160816', '20160831')]"
86 | ]
87 | },
88 | "execution_count": 3,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "all_dates"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "metadata": {
101 | "collapsed": true
102 | },
103 | "outputs": [],
104 | "source": [
105 | "import requests\nimport json\nimport pandas as pd\n\ndef get_articles(search_term, dates):\n",
106 | "    url = \"https://api.nytimes.com/svc/search/v2/articlesearch.json\"\n",
107 | "    api_infos = []\n",
108 | "    # get all urls\n",
109 | "    for start_date, end_date in dates: \n",
110 | "        for page in range(101):\n",
111 | "            queries = {\n",
112 | "                'api-key': XXX,\n",
113 | "                'q': search_term,\n",
114 | "                'begin_date': start_date, \n",
115 | "                'end_date': end_date, \n",
116 | "                'page' : page\n",
117 | "            }\n",
118 | "            req_t = requests.get(url, params = queries)\n",
119 | "            adict = json.loads(req_t.text)\n",
120 | "            doclist = adict['response']['docs']\n",
121 | "            for j in range(len(doclist)):\n",
122 | "                api_info = ([doclist[j][\"web_url\"], doclist[j][\"headline\"],\n",
123 | "                             doclist[j][\"section_name\"],\n",
124 | "                             doclist[j][\"word_count\"], doclist[j][\"type_of_material\"], \n",
125 | "                             doclist[j][\"pub_date\"]])\n",
126 | "                api_infos.append(api_info)\n",
127 | "    # transform data into dataset\n",
128 | "    api_dataset = pd.DataFrame(api_infos, columns=['Url','Headline', \n",
129 | "                                                   'Section_name', 'Word_Count', \n",
130 | "                                                   'Type', 'Pub_Date'])\n",
131 | "    return api_dataset"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 5,
137 | "metadata": {
138 | "collapsed": true
139 | },
140 | "outputs": [],
141 | "source": [
142 | "from newspaper import Article\n\ndef get_text(url_list):\n",
143 | "    articles = []\n",
144 | "    for address in url_list: \n",
145 | "        try:\n",
146 | "            article = Article(address)\n",
147 | "            article.download()\n",
148 | "            article.parse()\n",
149 | "            article.nlp()\n",
150 | "            article_info = ([article.publish_date, article.authors,\n",
151 | "                             article.summary, article.keywords, article.text,\n",
152 | "                             article.url])\n",
153 | "        except:\n",
154 | "            continue  # skip articles that fail to download or parse\n",
155 | "        \n",
156 | "        articles.append(article_info)\n",
157 | "    \n",
158 | "    newspaper_dataset = pd.DataFrame(articles, columns=['Date','Authors', 'Summary', 'Keywords', 'Text', \"Url\"])\n",
159 | "    newspaper_dataset[\"Text_Decode\"] = newspaper_dataset[\"Text\"].apply(lambda x: x.encode('utf-8'))\n",
160 | "    newspaper_dataset[\"Text_Clean\"] = newspaper_dataset[\"Text_Decode\"].apply(lambda x: x.replace(\"\\n\", \" \"))\n",
161 | "    newspaper_dataset = newspaper_dataset.drop('Text_Decode', 1)\n",
162 | "    return newspaper_dataset"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "## Gather Hillary and Trump Articles"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {
176 | "collapsed": true
177 | },
178 | "outputs": [],
179 | "source": [
180 | "HC_articles = get_articles(\"Hillary Clinton\", all_dates)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {
187 | "collapsed": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "Hillary_text = get_text(HC_articles[\"Url\"])"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "Trump_articles = get_articles(\"Donald Trump\", all_dates)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "Total_Trump = get_text(Trump_articles[\"Url\"])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Clean, Merge, and Pickle Datasets" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "#### Combine text and api datasets, get rid of duplicates and bad links, and add a column of Text without the \"\\n\"" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "Trump_merged = Trump_articles.merge(Total_Trump, on = \"Url\", how = \"inner\")\n", 239 | "good_links = Trump_merged[Trump_merged[\"Text\"] != \"NYTimes.com no longer supports Internet Explorer 9 or earlier. Please upgrade your browser.\"]\n", 240 | "Trump_no_duplicates = good_links.drop_duplicates(\"Text\")\n", 241 | "Trump_no_duplicates[\"Text_Decode\"] = Trump_no_duplicates[\"Text\"].apply(lambda x: x.encode('utf-8'))\n", 242 | "Trump_no_duplicates[\"Text_Clean\"] = Trump_no_duplicates[\"Text_Decode\"].apply(lambda x: x.replace(\"\\n\", \" \"))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "Hillary_merged = all_newspaper_articles.merge(HC_articles, on = \"Url\", how = \"inner\")\n", 254 | "Hillary_no_duplicates = Hillary_merged.drop_duplicates(\"Text\")\n", 255 | "Hillary_no_duplicates = Hillary_no_duplicates[Hillary_no_duplicates[\"Text\"] != \"NYTimes.com no longer supports Internet Explorer 9 or earlier. 
256 | "final_Hillary = Hillary_no_duplicates\n",
257 | "final_Hillary[\"Text_Decode\"] = final_Hillary[\"Text\"].apply(lambda x: x.encode('utf-8'))\n",
258 | "final_Hillary[\"Text_Clean\"] = final_Hillary[\"Text_Decode\"].apply(lambda x: x.replace(\"\\n\", \" \"))"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "collapsed": true
266 | },
267 | "outputs": [],
268 | "source": [
269 | "final_Hillary[\"Trump_subject\"] = 0  # 0 = Clinton article\n",
270 | "Trump_no_duplicates[\"Trump_subject\"] = 1  # 1 = Trump article\n",
271 | "Trump_Clinton_dataset = final_Hillary.append(Trump_no_duplicates)"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {
278 | "collapsed": true
279 | },
280 | "outputs": [],
281 | "source": [
282 | "import pickle\n\nwith open('Trump_Clinton_dataset.pkl', 'w') as picklefile:\n",
283 | "    pickle.dump(Trump_Clinton_dataset, picklefile)"
284 | ]
285 | }
286 | ],
287 | "metadata": {
288 | "kernelspec": {
289 | "display_name": "Python 2",
290 | "language": "python",
291 | "name": "python2"
292 | },
293 | "language_info": {
294 | "codemirror_mode": {
295 | "name": "ipython",
296 | "version": 2
297 | },
298 | "file_extension": ".py",
299 | "mimetype": "text/x-python",
300 | "name": "python",
301 | "nbconvert_exporter": "python",
302 | "pygments_lexer": "ipython2",
303 | "version": "2.7.11"
304 | }
305 | },
306 | "nbformat": 4,
307 | "nbformat_minor": 0
308 | }
309 | 
--------------------------------------------------------------------------------
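
For readers who want to check their API key before running the whole notebook, here is a minimal, self-contained sketch of the single-page request that `get_articles()` in the notebook issues. It is not part of the repository: `"YOUR_NYT_API_KEY"` is a placeholder for the key the notebook redacts as `XXX`, and the query values are just one of the half-month windows from `all_dates`.

```python
# Minimal sketch of one NYT Article Search request (placeholder API key).
import json
import requests

url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
params = {
    "api-key": "YOUR_NYT_API_KEY",  # placeholder; the notebook redacts this as XXX
    "q": "Donald Trump",
    "begin_date": "20150601",
    "end_date": "20150615",
    "page": 0,
}

response = requests.get(url, params=params)
docs = json.loads(response.text)["response"]["docs"]

# Each doc carries the fields the notebook keeps: web_url, headline,
# section_name, word_count, type_of_material, pub_date.
for doc in docs:
    print(doc["pub_date"] + "  " + doc["web_url"])
```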