├── .DS_Store ├── Articles_Dataframe_Formatting.py ├── Creating_Visualizations.py ├── Hyperparameter_Testing ├── hyperparameter_functions.py └── hyperparameter_testing.py ├── PreProcessing ├── CustomTFIDF.py ├── NERTokenizer.py ├── __pycache__ │ ├── CustomTFIDF.cpython-37.pyc │ └── NERTokenizer.cpython-37.pyc └── julian_matrix.py ├── README.md ├── SuccessMetrics.py ├── Success_Rates.md ├── Visualizations ├── .DS_Store ├── 500_dendogram_hac.png ├── Article_Centers.png ├── NewsfeedArticleClustering.pdf ├── cumulative_score_hyperparameters.png ├── date_distributions.png ├── distance_heatmap.png ├── f_score_hyperparameters.png ├── s_score_hyperparameters.png ├── svd_cluster_centers_example.png └── svd_colored_clusters.png ├── clustering.py ├── data ├── .DS_Store └── article_classifications.csv └── exploring_entities.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/.DS_Store -------------------------------------------------------------------------------- /Articles_Dataframe_Formatting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import glob 4 | import csv 5 | """ 6 | Takes as input all original Kaggle CSVs, and filters the concatenated_df to only where the date and url are present. 7 | """ 8 | 9 | path = "/Users/parkerglenn/Desktop/DataScienceSets" 10 | 11 | 12 | list_ = [] 13 | all_files = glob.glob(path + "/*.csv") 14 | frame = pd.DataFrame() 15 | 16 | 17 | df_from_each_file = (pd.read_csv(f) for f in all_files) 18 | concatenated_df = pd.concat(df_from_each_file, ignore_index = True) 19 | 20 | dates = concatenated_df['date'] 21 | 22 | url = concatenated_df['url'] 23 | 24 | df1 = concatenated_df[~concatenated_df['date'].isna()] 25 | df = df1[~df1['url'].isna()] 26 | df = df.reset_index() 27 | df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True) 28 | 29 | df = df.sort_values(by=['date']) 30 | 31 | df.to_csv("all_good_articles.csv") 32 | -------------------------------------------------------------------------------- /Creating_Visualizations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Mar 19 10:13:38 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | from sklearn.decomposition import PCA 9 | import matplotlib.pylab as plt 10 | import seaborn as sns 11 | from sklearn.decomposition import TruncatedSVD 12 | from matplotlib import pyplot 13 | import mpld3 14 | from mpld3 import display 15 | from sklearn.metrics.pairwise import cosine_similarity 16 | from sklearn.cluster import KMeans 17 | import pylab 18 | 19 | 20 | ############################################################################### 21 | ##################VISUALIZATION################################################ 22 | ############################################################################### 23 | 24 | #Date Distribution 25 | out = pd.cut(df["joined"], bins = [20000000,20020000,20040000,20060000,20080000,20100000,20120000,20140000,20160000,20180000], include_lowest = True) 26 | ax = out.value_counts(sort = False).plot.bar(rot = 0, color = "b", figsize = (20,10)) 27 | ax.set_ylim(bottom = 0, top = 450) 28 | ax.set_xticklabels(x for x in ["2000 to 2002","2002 to 2004","2004 to 2006","2006 to 2008","2008 to 2010","2010 to 2012","2012 to 2014","2014 to 2016","2016 to 
2018"]) 29 | plt.xlabel("Date Range", fontsize = 18) 30 | plt.ylabel("Frequency", fontsize = 18) 31 | plt.title("Date Distribution", fontsize = 25) 32 | for i in ax.patches: 33 | ax.text(i.get_x() + .10, 5, str(i.get_height()), fontsize = 20, color = "black") 34 | 35 | 36 | #Distance Heatmap 37 | dist = 1 - cosine_similarity(matrix) 38 | 39 | cmap = pyplot.cm.cubehelix 40 | dimensions = (20,20) 41 | fig, ax = pyplot.subplots(figsize=dimensions) 42 | sns.heatmap(dist, vmin = 0, vmax = 1, cmap = cmap).set_title("Tfidf Distances Between Articles", fontsize = 15) 43 | 44 | """ 45 | Notice hot spot around the 625:630 line. 46 | Those article titles: 47 | ['Dem Debate Blogging #1', 48 | 'Dem Debate Blogging #2', 49 | 'Dem Debate Blogging #3', 50 | 'Dem Debate Blogging #4', 51 | 'Dem Debate Blogging #5', 52 | 'Dem Debate Blogging #6'] 53 | 54 | 55 | Around 90:160: large circle of relatively similar articles. 56 | Reason: Iowa Caucuses. 57 | """ 58 | 59 | 60 | #Tfidf Matrix, in 2D SVD scatterplot 61 | svd = TruncatedSVD(n_components=2).fit(matrix) 62 | data2D = svd.transform(matrix) 63 | plt.title("Truncated SVD, 2 Components") 64 | colors = rng.rand(1000) 65 | plt.scatter(data2D[:,0], data2D[:,1], marker = "o", c = colors, cmap = "YlGnBu", s = 10) 66 | 67 | 68 | ######With clusters assigned as colors######## 69 | data2D = svd.transform(matrix) 70 | kmeans = KMeans(n_clusters = 520) 71 | kmeans.fit(data2D) 72 | y_kmeans = kmeans.predict(data2D) 73 | y_pred = kmeans.labels_.tolist() 74 | 75 | success(kmeans,y_pred,matrix) 76 | 77 | articles = {"title": titles, "date": new_df["date"], "cluster": y_pred, "content": new_df["content"], "event": events[:1000]} 78 | frame = pd.DataFrame(articles, index = [y_pred] , columns = ['title', 'date', 'cluster', 'content', "event"]) 79 | frame['cluster'].value_counts() 80 | 81 | 82 | """Creates scalable points for cluster centers found within y_true. 83 | The size of the center is dependent on how many events occur withing that specific cluster. 
""" 84 | 85 | centers = kmeans.cluster_centers_ 86 | fig, ax = plt.subplots(figsize = (14,8)) 87 | np.random.seed(0) 88 | threshold = 4 89 | for cluster, center in enumerate(centers): 90 | cluster+=1 91 | # Only maps cluster if it has more than "threshold" events in it 92 | if cluster in y_true and len(frame.loc[cluster]["event"].values.tolist()) > threshold: 93 | #Gets event name that the cluster center represents 94 | event = events[y_true.index(cluster)] 95 | #s scaled based on number of events in cluster 96 | ax.plot(center[0], center[1], markersize = float(len(frame.loc[cluster]["event"].values.tolist())), marker = "o"); 97 | plt.annotate(event, (center[0],center[1])) 98 | ax.set_title('Cluster Centers with Predominant Event', size=14) 99 | plt.show() 100 | 101 | mpld3.show(fig) 102 | 103 | # mpld3.save_html(fig, "Cluster_Centers.html") 104 | 105 | 106 | 107 | """Zoomed out plot with colors""" 108 | plt.title("Truncated SVD with Colored Clusters") 109 | plt.scatter(data2D[:, 0], data2D[:, 1], c=y_kmeans, cmap = "tab20", s=30) 110 | """Example plot, zoomed in to visualize cluster centers""" 111 | 112 | 113 | 114 | 115 | fig, ax = plt.subplots(figsize = (14,8)) 116 | np.random.seed(0) 117 | ax.plot(data2D[:, 0], data2D[:, 1], 118 | 'or', ms=10, alpha=0.2) 119 | ax.set_title('Truncated SVD with Cluster Assignments', size=14) 120 | ax.grid(color='lightgray', alpha=0.7) 121 | for i, txt in enumerate(events): 122 | print(i) 123 | plt.annotate(txt + ", " + str(y_pred[i]), (data2D[:, 0][i], data2D[:, 1][i])) 124 | mpld3.show(fig) 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | centers = kmeans.cluster_centers_ 133 | plt.scatter(centers[:, 0], centers[:, 1], c='black', s=30, alpha=0.5, marker = 'x'); 134 | pyplot.axis(ymax = 0, ymin =-.25 , xmax = .2, xmin = .14) 135 | plt.title("Truncated SVD with Cluster Centers") 136 | pyplot.axis(ymax = 0, ymin =-.25 , xmax = .2, xmin = .14) 137 | plt.scatter(data2D[:, 0], data2D[:, 1], c=y_kmeans, cmap = "tab20", s=30) 138 | 139 | 140 | 141 | 142 | ######Interactive ScatterPlot with SVD###### 143 | svd = TruncatedSVD(n_components=2).fit(matrix) 144 | data2D = svd.transform(matrix) 145 | 146 | fig, ax = plt.subplots(figsize = (14,8)) 147 | np.random.seed(0) 148 | ax.plot(data2D[:, 0], data2D[:, 1], 149 | 'or', ms=10, alpha=0.2) 150 | ax.set_title('Truncated SVD with Cluster Assignments', size=14) 151 | ax.grid(color='lightgray', alpha=0.7) 152 | for i, txt in enumerate(events): 153 | print(i) 154 | plt.annotate(txt + ", " + str(y_pred[i]), (data2D[:, 0][i], data2D[:, 1][i])) 155 | mpld3.show(fig) 156 | #mpld3.save_html(fig, "Truncated_SVD_D3.html") 157 | 158 | 159 | 160 | 161 | #####Interactive ScatterPlot with Dense Matrix and PCA###### 162 | """Probably not the method to use. 
SVD seems better since it takes the sparse matrix "tfidf_matrix" directly.""" 163 | x = tfidf_matrix.todense() 164 | coords = PCA(n_components=2).fit_transform(x) 165 | fig, ax = plt.subplots(figsize = (14,8)) 166 | 167 | np.random.seed(0) 168 | ax.plot(coords[:, 0], coords[:, 1], 169 | 'or', ms=10, alpha=0.2) 170 | ax.set_title('PCA with Cluster Assignments', size=14) 171 | ax.grid(color='lightgray', alpha=0.7) 172 | for i, txt in enumerate(events): 173 | plt.annotate(txt, (coords[:, 0][i], coords[:, 1][i])) 174 | mpld3.show(fig) 175 | 176 | 177 | 178 | """Dendogram Making""" 179 | fig = pylab.figure(figsize=(100,70)) 180 | children = hac.children_ 181 | distance = np.arange(children.shape[0]) 182 | no_of_observations = np.arange(2, children.shape[0]+2) 183 | linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float) 184 | dendrogram(linkage_matrix, labels = (events), truncate_mode = "level", leaf_font_size = 8) 185 | fig.show 186 | 187 | 188 | 189 | 190 | """ScatterPlot""" 191 | coords = PCA(n_components=2).fit_transform(dense_matrix) 192 | fig, ax = plt.subplots(figsize = (14,8)) 193 | np.random.seed(0) 194 | ax.plot(coords[:, 0], coords[:, 1], 195 | 'or', ms=10, alpha=0.2) 196 | ax.set_title('Truncated SVD with Cluster Assignments', size=14) 197 | ax.grid(color='lightgray', alpha=0.7) 198 | for i, txt in enumerate(events): 199 | print(i) 200 | plt.annotate(txt + ", " + str(y_pred[i]), (coords[:, 0][i], coords[:, 1][i])) 201 | mpld3.show(fig) 202 | 203 | 204 | 205 | ############################################################# 206 | ################Hyperparameter Testing####################### 207 | ############################################################# 208 | 209 | 210 | hyper_params = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/Hyperparam_testing/Hyper_Params.csv") 211 | hyper_params = hyper_params.reset_index() 212 | 213 | f_scores = [x for x in hyper_params["f_score"]] 214 | s_scores = [x for x in hyper_params["s_score"]] 215 | person_rate = [x for x in hyper_params["person_rate"]] 216 | ents_rate = [x for x in hyper_params["ents_rate"]] 217 | 218 | 219 | 220 | """F1 Score distribution across hyperparameters 221 | Best F1 Score: ents_rate = 1.63, person_rate = 2.57, f_score = .927 on 412 articles. BUT s_score is a measly .0788. 222 | """ 223 | f_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.f_score}) 224 | f_data_pivoted = f_data.pivot("Person Rate","Entity Rate","Z") 225 | ax1 = sns.heatmap(f_data_pivoted, cmap = plt.cm.hot) 226 | cbar = ax1.collections[0].colorbar 227 | cbar.set_label('F1 Score', labelpad=15) 228 | ax1.invert_yaxis() 229 | plt.show() 230 | 231 | """Silhouette Score distribution across hyperparameters 232 | More direct correlation here than in F1 Score. Most notably, as the rates increase, so does S Score. This is due to 233 | the fact that the rating of words in articles becomes more radical; articles that are different from each other, i.e. share no 234 | entities, are now much more distant than before. Even more specifically, when the Entity Rate and Person Rate are dissimilar, the 235 | S Score is the highest. 
Again, the same radical rating phenomena: By limiting the amount of weighted features, those that are weighted 236 | make the article more of an outlier than before.""" 237 | 238 | s_data = pd.DataFrame({"Person Weighting": [round(x,2) for x in person_rate], "Non-Person Weighting":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score}) 239 | s_data_pivoted = s_data.pivot("Person Weighting","Non-Person Weighting","Z") 240 | ax2 = sns.heatmap(s_data_pivoted, cmap = plt.cm.hot) 241 | cbar = ax2.collections[0].colorbar 242 | cbar.set_label('Silhouette Score', labelpad=15) 243 | ax2.invert_yaxis() 244 | plt.show() 245 | 246 | 247 | 248 | """Cumulative Total 249 | Highest: ents_rate = 6.368, person_rate = 2.263, f_score = 0.922, s_score = 0.093 250 | 251 | """ 252 | sf_data = pd.DataFrame({"Person Weighting": [round(x,2) for x in person_rate], "Entity Weighting":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score + hyper_params.f_score}) 253 | sf_data_pivoted = sf_data.pivot("Person Weighting","Entity Weighting","Z") 254 | ax3 = sns.heatmap(sf_data_pivoted, cmap = plt.cm.hot) 255 | cbar = ax3.collections[0].colorbar 256 | cbar.set_label('Composite Score', labelpad=15) 257 | ax3.invert_yaxis() 258 | plt.show() 259 | 260 | """Little thing to find best cumulative score and rates that achieved it.""" 261 | hightot = 0 262 | for tup in enumerate(hyper_params["f_score"]): 263 | tot = tup[1] + hyper_params.loc[tup[0]]["s_score"] 264 | if hightot < tot: 265 | hightot = tot 266 | best_scores = (hyper_params.loc[tup[0]]["ents_rate"],hyper_params.loc[tup[0]]["person_rate"]) 267 | best_scores 268 | -------------------------------------------------------------------------------- /Hyperparameter_Testing/hyperparameter_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Feb 22 15:31:02 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | 9 | def tokenize_and_stem_NER(corpus): 10 | global tokenized_corpus 11 | tokenized_corpus = [] 12 | good_ents = ["PERSON", "GPE","ORG", "LOC", "EVENT", "FAC"] 13 | continue_tags = ["B-","I-"] 14 | end_tags = ["L-","U-"] 15 | for text in corpus: 16 | toks = [] 17 | iobs = [i.ent_iob_ for i in nlp(text)] 18 | biluos = list(iob_to_biluo(iobs)) 19 | index = -1 20 | #Named entities variable 21 | ne = "" 22 | for tok in nlp(text): 23 | index += 1 24 | if biluos[index] in continue_tags and str(tok.ent_type_) in good_ents: 25 | #Checks if empty token 26 | #For some reason tok.whitespace_ doesn't include double token entities 27 | #like "JENNIFER LAWRENCE" 28 | if str(tok).split() != [] and str(tok.ent_type_) != "PERSON": 29 | ne += " " + str(tok).upper() 30 | elif str(tok).split() != [] and str(tok.ent_type_) == "PERSON": 31 | ne += " " +str(tok).title() 32 | elif biluos[index] in end_tags and str(tok.ent_type_) in good_ents: 33 | if str(tok).split() != [] and str(tok.ent_type_) != "PERSON": 34 | ne += " " + str(tok).upper() 35 | toks.append(ne.lstrip()) 36 | ne = " " 37 | elif str(tok).split() != [] and str(tok.ent_type_) == "PERSON": 38 | ne += " " + str(tok).title() 39 | ne = ne.replace("’S", "") 40 | toks.append(ne.lstrip()) 41 | ne = " " 42 | ne = " " 43 | #If token is just a boring old word 44 | else: 45 | if tok.is_punct == False and str(tok).lower() not in list(stop_words.ENGLISH_STOP_WORDS): 46 | toks.append(stemmer.stem(str(tok))) 47 | tokenized_corpus.append(toks) 48 | 49 | 50 | 51 | def do_tfidf(ents_rate, person_rate): 52 | 53 | def 
TF_dict(article): 54 | article_tf = {} 55 | for word in article: 56 | if word in article_tf: 57 | article_tf[word] += 1 58 | else: 59 | article_tf[word] = 1 60 | for word in article_tf: 61 | """Manipulate word.isupper() to account for entity weighting.""" 62 | if word.isupper(): 63 | occurences = article_tf[word] 64 | article_tf[word] = (occurences / len(article)) * ents_rate 65 | #word.istitle() applies to PERSON tags 66 | elif word.istitle(): 67 | occurences = article_tf[word] 68 | article_tf[word] = (occurences / len(article)) * person_rate 69 | else: 70 | occurences = article_tf[word] 71 | article_tf[word] = (occurences / len(article)) 72 | return article_tf 73 | 74 | TF = [TF_dict(article) for article in tokenized_corpus] 75 | 76 | 77 | 78 | def Count_dict(): 79 | countDict = {} 80 | for article in TF: 81 | found_words = [] 82 | for word in article: 83 | if word in countDict and word not in found_words: 84 | countDict[word] += 1 85 | found_words.append(word) 86 | elif word not in found_words: 87 | countDict[word] = 1 88 | found_words.append(word) 89 | return countDict 90 | 91 | countDict = Count_dict() 92 | 93 | import operator 94 | sortCount = sorted(countDict.items(), key=operator.itemgetter(1), reverse = True) 95 | 96 | 97 | def IDF_dict(): 98 | import math 99 | idfDict = {} 100 | for word in countDict: 101 | #len(corpus) is 1000, the total number of documents 102 | #countDict[word] is the number of articles the word appears in 103 | """ 104 | From Sci-Kit code: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py 105 | 'smooth_idf: If ``smooth_idf=True`` (the default), the constant "1" is added to the 106 | numerator and denominator of the idf as if an extra document was seen 107 | containing every term in the collection exactly once, which prevents 108 | zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1.' 109 | 110 | 'The effect of adding "1" to 111 | the idf in the equation above is that terms with zero idf, i.e., terms 112 | that occur in all documents in a training set, will not be entirely 113 | ignored.' 114 | 115 | min_df: 'When building the vocabulary ignore terms that have a document 116 | frequency strictly lower than the given threshold. This value is also 117 | called cut-off in the literature.' 118 | 119 | max_df: 'When building the vocabulary ignore terms that have a document 120 | frequency strictly higher than the given threshold (corpus-specific 121 | stop words).' 122 | 123 | norm: (default='l2') Each output row will have unit norm, either: 124 | * 'l2': Sum of squares of vector elements is 1. The cosine 125 | similarity between two vectors is their dot product when l2 norm has 126 | been applied. 
127 | """ 128 | #Implements min_df and max_df 129 | min_df = 0 130 | max_df = 1.0 131 | if countDict[word] > min_df and (countDict[word] / len(corpus)) < max_df: 132 | idfDict[word] = math.log((1 + len(corpus)) / (1 + countDict[word])) + 1 133 | else: 134 | idfDict[word] = 0 135 | return idfDict 136 | 137 | idfDict = IDF_dict() 138 | 139 | 140 | 141 | def TFIDF_list(article): 142 | article_tfidf = {} 143 | for word in article: 144 | #article[word] is the TF score for that word in the given article 145 | article_tfidf[word] = article[word] * idfDict[word] 146 | return article_tfidf 147 | 148 | 149 | 150 | tfidf = [TFIDF_list(article) for article in TF] 151 | 152 | 153 | 154 | 155 | from sklearn import preprocessing 156 | terms = sorted(countDict.keys()) 157 | def compute_TFIDF_matrix(article): 158 | article_matrix = [0.0] * len(terms) 159 | for i, word in enumerate(terms): 160 | #Stores tfidf value of unique word in terms 161 | #if the word is in the article 162 | if word in article: 163 | #article[word] is the word's tfidf score 164 | article_matrix[i] = article[word] 165 | return article_matrix 166 | 167 | 168 | 169 | tfidf_matrix = [compute_TFIDF_matrix(article) for article in tfidf] 170 | 171 | #Normalized with the default l2 setting 172 | tfidf_matrix = preprocessing.normalize(tfidf_matrix, norm = 'l2') 173 | 174 | return tfidf_matrix 175 | 176 | ################################################################ 177 | ####################HAC######################################### 178 | ################################################################ 179 | 180 | def HAC(matrix): 181 | from sklearn.cluster import AgglomerativeClustering 182 | from scipy.cluster.hierarchy import dendrogram 183 | hac = AgglomerativeClustering(n_clusters=500, affinity = "euclidean") 184 | hac.fit_predict(tfidf_matrix) 185 | y_pred = list(hac.labels_) 186 | success(hac, y_pred, tfidf_matrix) 187 | 188 | def success(model, clusters, matrix): 189 | 190 | import os 191 | import pandas as pd 192 | import codecs 193 | 194 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 195 | 196 | data = codecs.open('/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv', encoding = 'utf-8') 197 | data_with_labels = codecs.open("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_Classification26.csv") 198 | 199 | df = pd.read_csv(data) 200 | labels_df= pd.read_csv(data_with_labels) 201 | 202 | #Deletes unnecessary columns 203 | df = df.drop(df.columns[:12], axis = 1) 204 | #Sets manageable range for working data set 205 | new_df = df[5000:6000] 206 | #Gets info in list form to be later called in kmeans part 207 | 208 | corpus = [] 209 | for text in new_df['content']: 210 | corpus.append(text) 211 | 212 | titles = [] 213 | for title in new_df["title"]: 214 | titles.append(str(title)) 215 | #labels_df starts at df[5000] so we're good on the matching of labels to content 216 | events = [] 217 | for event in labels_df["Event"][:1000]: 218 | events.append(str(event)) 219 | 220 | 221 | articles = {"title": titles, "date": new_df["date"], "cluster": clusters, "content": new_df["content"], "event": events[:1000]} 222 | frame = pd.DataFrame(articles, index = [clusters] , columns = ['title', 'date', 'cluster', 'content', "event"]) 223 | frame['cluster'].value_counts() 224 | 225 | """ 226 | BELOW THIS CREATES DICT OF CLUSTERS AND PREDOMINANT EVENT 227 | 228 | If multiple events occur the same amount of times in a single cluster, 229 | the ratio function is 
invoked to choose the event holding the most relative 230 | significance. If one ratio is not greater than the others (ex. a cluster 231 | composed of 5 one-off events) then the cluster is disregared (labelled "nan"). 232 | 233 | If the cluster only contains one event, it is assumed at this stage that it is 234 | the main cluster for the event. 235 | 236 | BUGS: 237 | If the cluster contains only "nan" events, it will not show up in y_trueDict 238 | (ex. Cluster 113 is not shown, consisting of {'nan': 2} ) 239 | """ 240 | from collections import Counter 241 | all_events = [] 242 | #This fixes quirk where the same cluster was iterated over multiple times 243 | clusters_we_saw = [] 244 | for cluster in clusters: 245 | if cluster not in clusters_we_saw: 246 | clusters_we_saw.append(cluster) 247 | for event in frame.loc[cluster]["event"].values.tolist(): 248 | if event != "nan" and event != "useless": 249 | all_events.append(event) 250 | event_occurences = dict(Counter(all_events)) 251 | 252 | all_clusters = [] 253 | for cluster in clusters: 254 | if cluster not in all_clusters: 255 | all_clusters.append(cluster) 256 | 257 | y_trueDict = {} 258 | #This range needs to be changed depending on the cluster model 259 | for i in range(0,len(all_clusters)): 260 | ratios = [] 261 | counts = [] 262 | cluster_event = [] 263 | ratio = 0 264 | 265 | #Counts occurence per cluster of event 266 | for event in frame.loc[i]["event"].values.tolist(): 267 | if event != "nan" and event != "useless": 268 | counts.append(event) 269 | counts = Counter(counts) 270 | 271 | 272 | if len(counts) > 1: 273 | score_1 = list(counts.most_common()[0])[1] 274 | score_2 = list(counts.most_common()[1])[1] 275 | #Check to see if there are multiple events with same frequency 276 | if score_1 == score_2: 277 | #Gets all events with same frequency 278 | tied_events = [k for k,v in dict(counts).items() if v == score_1] 279 | for event in tied_events: 280 | #Gets the ratio of an occurence for an event in a cluster 281 | #For example, if an event happens only once, it's ratio will be 1 282 | #But if "iowa_caucuses" is used 100 times and only 20 times in a specific cluster, 283 | #its ratio is .2 284 | new_ratio = score_1 / int(event_occurences[event]) 285 | ratios.append(new_ratio) 286 | if new_ratio > ratio: 287 | cluster_event = event 288 | ratio = new_ratio 289 | #If result is an empty list, all ratios are unique. If not, there 290 | #are repititions and the data point is thrown out. 291 | if list(set([x for x in ratios if ratios.count(x) > 1])) != []: 292 | y_trueDict[i] = "nan" 293 | break 294 | 295 | #Dumb try and except sees if ytrueDict[i] is already set to something ("nan") 296 | try: 297 | y_trueDict[i] 298 | except: 299 | counts = dict(counts) 300 | #Makes sure there's still the occurence in cluster attached to the cluster_event 301 | y_trueDict[i] = [cluster_event, counts[cluster_event]] 302 | 303 | 304 | #If there is one obviously right event, i.e. score_1 != score_2 305 | else: 306 | y_trueDict[i] = list(counts.most_common()[0]) 307 | 308 | #Catches the instance of only one item per cluster, i.e. 
len(counts) !> 1 309 | elif len(counts) == 1: 310 | y_trueDict[i] =list(counts.most_common()[0]) 311 | 312 | 313 | #Re-analyzes y_trueDict, applying ratio again so there's one objectively "right" cluster per event 314 | a = [] 315 | for k in y_trueDict: 316 | a.append(y_trueDict[k][0]) 317 | a = dict(Counter(a)) 318 | 319 | #Sees where the same event label is applied to multiple clusters 320 | duplicates = [] 321 | for g in a: 322 | if a[g] > 1 and g != "n" and g != "unknown": 323 | duplicates.append(g) 324 | 325 | 326 | #Creates dup_eventsPLUSratio, where the duplicate events are stored by cluster number 327 | #with their ratio 328 | dup_eventsPLUSratio = {} 329 | for key in y_trueDict: 330 | if y_trueDict[key][0] in duplicates: 331 | event = y_trueDict[key][0] 332 | ratio = int(y_trueDict[key][1]) / int(event_occurences[event]) 333 | eventPLUSratio = [] 334 | eventPLUSratio.append(event) 335 | eventPLUSratio.append(ratio) 336 | dup_eventsPLUSratio[key] = eventPLUSratio 337 | dup_eventsPLUSratio 338 | 339 | #Dives into dup_eventsPLUSratio to see what cluster is more approrpiate for event 340 | for duplicate in duplicates: 341 | ratios = [] 342 | for key in dup_eventsPLUSratio: 343 | if dup_eventsPLUSratio[key][0] == duplicate: 344 | ratios.append(dup_eventsPLUSratio[key][1]) 345 | sort=sorted(ratios,reverse=True) 346 | highest = sort[0] 347 | theGood_one = [duplicate, highest] 348 | for key in dup_eventsPLUSratio: 349 | if dup_eventsPLUSratio[key][0] == duplicate: 350 | if dup_eventsPLUSratio[key] != theGood_one or highest == sort[1]: 351 | y_trueDict[key] = "nan" 352 | #If after all that there's still a tie between the top two ratios, 353 | #(like in hail_caesar_movie where its split 2 and 2 between clusters) 354 | #its given a "nan" label 355 | #COULD BE CHANGED TO FIT A WHILE LOOP THAT THEN FINDS score_2 AND 356 | #RELABELS CLUSTER TO SECOND MOST POPULAR EVENT IF THAT EVENT IS NOT 357 | #ALREADY ASSIGNED A CLUSTER 358 | 359 | #Gets y_true, the correct cluster assignments for each event 360 | bad_labels = ["useless","nan","unkown"] 361 | y_true = [] 362 | for event in events[:1000]: 363 | find = False 364 | for key in y_trueDict: 365 | #Used to see if there is a distinct cluster for that event 366 | #FIXED BUG: probably still some duplicates in y_trueDict somehow, bc output len is 10005 367 | #maybe the "unknown" or "useless" stuff? 368 | if y_trueDict[key][0] == event and y_trueDict[key][0] not in bad_labels: 369 | y_true.append(key) 370 | find = True 371 | if find == False: 372 | #Arbitrary value that's not going to return a match in t score 373 | y_true.append("nan") 374 | 375 | 376 | #Gets y_pred, the cluster where each individual event was actually clustered 377 | y_pred = [] 378 | for cluster_assignment in frame["cluster"]: 379 | y_pred.append(cluster_assignment) 380 | 381 | #checks how events actually match up with definitively defined cluster 382 | num = 0 383 | for i in y_true: 384 | if i != "nan": 385 | num += 1 386 | print("Working with " + str(num) + " samples based on a corpus of " + str(len(corpus)) + " documents: ") 387 | print() 388 | 389 | #Re-Aligns two lists to only include good values (those not equalling "nan") 390 | filtered_y_true = [] 391 | filtered_y_pred = [] 392 | 393 | for place in range(len(y_true)): 394 | if y_true[place] != "nan": 395 | filtered_y_true.append(y_true[place]) 396 | filtered_y_pred.append(y_pred[place]) 397 | 398 | 399 | 400 | 401 | 402 | """F1 score is the harmonic average of precision and recall. 
""" 403 | 404 | from sklearn.metrics import f1_score 405 | print("The F1 score for the model is " + str(f1_score(y_true = filtered_y_true, y_pred = filtered_y_pred, average = "micro"))) 406 | print() 407 | #500_no_ngrams F1 score micro: 0.8785046728971962 (also works off the most samples) 408 | #350_3_ngrams F1 score micro: 0.8718861209964412 (but goes off 281 samples rather than 303 in no ngrams) 409 | #700_no_ngrams F1 score micro: 0.8638392857142858 410 | #350_no_ngrams F1 score micro: 0.8576158940397351 411 | #300_3_ngrams F1 score micro: 0.8294573643410853 412 | 413 | """ Silhouette values lies in the range of [-1, 1]. A value of +1 indicates that the sample is far away 414 | from its neighboring cluster and very close to the cluster its assigned. Similarly, value of -1 415 | indicates that the point is close to its neighboring cluster than to the cluster its assigned. 416 | And, a value of 0 means its at the boundary of the distance between the two cluster. Value of +1 417 | is ideal and -1 is least preferred. Hence, higher the value better is the cluster configuration. """ 418 | 419 | from sklearn.metrics import silhouette_score 420 | print("The sillhouette score for the model is " + str(silhouette_score(matrix, y_pred))) 421 | print() 422 | print() 423 | print() 424 | #500_no_ngrams: 0.07096239881264323 425 | #350_no_ngrams: 0.06777628195061947 426 | #700_no_ngrams: 0.06251251395097632 427 | #350_3_ngrams: 0.04969413068018369 428 | #300_3_ngrams: 0.04857286650243616 429 | 430 | global s_score 431 | global f_score 432 | global num2 433 | 434 | s_score = silhouette_score(matrix, y_pred) 435 | f_score = f1_score(y_true = filtered_y_true, y_pred = filtered_y_pred, average = "micro") 436 | num2 = 0 437 | for i in y_true: 438 | if i != "nan": 439 | num2 += 1 -------------------------------------------------------------------------------- /Hyperparameter_Testing/hyperparameter_testing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Feb 16 12:00:18 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | import os 9 | import pandas as pd 10 | import numpy as np 11 | import sklearn.manifold 12 | import matplotlib.pyplot as plt 13 | import nltk 14 | import regex as re 15 | import re 16 | import codecs 17 | import csv 18 | import glob 19 | import multiprocessing 20 | from nltk.corpus import stopwords 21 | from nltk.stem.snowball import SnowballStemmer 22 | from nltk.tokenize import word_tokenize, sent_tokenize 23 | stemmer = SnowballStemmer("english") 24 | import math 25 | from sklearn.feature_extraction.text import TfidfVectorizer 26 | from sklearn.cluster import KMeans 27 | import seaborn as sns 28 | import sklearn 29 | 30 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 31 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 32 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_Classification122.csv") 33 | #Deletes unnecessary columns 34 | df = df.drop(df.columns[:12], axis = 1) 35 | #Sets manageable range for working data set 36 | new_df = df[5000:6000] 37 | #Gets info in list form to be later called in kmeans part 38 | 39 | corpus = [] 40 | for text in new_df['content']: 41 | corpus.append(text) 42 | 43 | titles = [] 44 | for title in new_df["title"]: 45 | titles.append(str(title)) 46 | #labels_df starts at df[5000] so we're good on the matching of 
labels to content 47 | events = [] 48 | for event in labels_df["Event"][:1000]: 49 | events.append(str(event)) 50 | 51 | import spacy 52 | from spacy import gold 53 | from spacy.gold import iob_to_biluo 54 | nlp = spacy.load('en_core_web_md', disable=['parser','tagger','textcat']) 55 | nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP) 56 | english_stopwords = stopwords.words('english') 57 | from sklearn.feature_extraction import stop_words 58 | 59 | ############################################################################## 60 | ###################HYPER-PARAMETER TESTING#################################### 61 | ############################################################################## 62 | from hyper_parameter_functions import tokenize_and_stem_NER, do_tfidf, HAC, success 63 | 64 | hyper_params = pd.DataFrame(columns = ["ents_rate", "person_rate","f_score","s_score","samples used"]) 65 | 66 | for person_rate in np.linspace(1,7,20): 67 | for ents_rate in np.linspace(1,7,20): 68 | tfidf_matrix = do_tfidf(ents_rate, person_rate) 69 | HAC(tfidf_matrix) 70 | cols = [pd.Series([ents_rate,person_rate,f_score,s_score,num2],index=hyper_params.columns)] 71 | hyper_params = hyper_params.append(cols) 72 | 73 | hyper_params.plot.scatter(x="f_score", y = "s_score") 74 | 75 | 76 | f_scores = [x for x in hyper_params["f_score"]] 77 | s_scores = [x for x in hyper_params["s_score"]] 78 | person_rate = [x for x in hyper_params["person_rate"]] 79 | ents_rate = [x for x in hyper_params["ents_rate"]] 80 | 81 | cmap = pyplot.cm.cubehelix 82 | dimensions = (20,20) 83 | fig, ax = pyplot.subplots(figsize=dimensions) 84 | sns.heatmap(dist, vmin = 0, vmax = 1, cmap = cmap).set_title("Tfidf Distances Between Articles", fontsize = 15) 85 | 86 | 87 | 88 | import seaborn as sns 89 | 90 | """F1 Score distribution across hyperparameters 91 | Best F1 Score: ents_rate = 1.63, person_rate = 2.57, f_score = .927 on 412 articles. BUT s_score is a measly .0788. 92 | """ 93 | f_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.f_score}) 94 | f_data_pivoted = f_data.pivot("Person Rate","Entity Rate","Z") 95 | ax1 = sns.heatmap(f_data_pivoted, cmap = plt.cm.hot) 96 | cbar = ax1.collections[0].colorbar 97 | cbar.set_label('F1 Score', labelpad=15) 98 | ax1.invert_yaxis() 99 | plt.show() 100 | 101 | """Silhouette Score distribution across hyperparameters 102 | 103 | More direct correlation here than in F1 Score. Most notably, as the rates increase, so does S Score. This is due to 104 | the fact that the rating of words in articles becomes more radical; articles that are different from each other, i.e. share no 105 | entities, are now much more distant than before. Even more specifically, when the Entity Rate and Person Rate are dissimilar, the 106 | S Score is the highest. 
Again, the same radical rating phenomena: By limiting the amount of weighted features, those that are weighted 107 | make the article more of an outlier than before.""" 108 | s_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score}) 109 | s_data_pivoted = s_data.pivot("Person Rate","Entity Rate","Z") 110 | ax2 = sns.heatmap(s_data_pivoted, cmap = plt.cm.hot) 111 | cbar = ax2.collections[0].colorbar 112 | cbar.set_label('Silhouette Score', labelpad=15) 113 | ax2.invert_yaxis() 114 | plt.show() 115 | 116 | 117 | 118 | """Cumulative Total 119 | Highest: ents_rate = 6.368, person_rate = 2.263, f_score = 0.922, s_score = 0.093 120 | 121 | """ 122 | sf_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score + hyper_params.f_score}) 123 | sf_data_pivoted = sf_data.pivot("Person Rate","Entity Rate","Z") 124 | ax3 = sns.heatmap(sf_data_pivoted, cmap = plt.cm.hot) 125 | cbar = ax3.collections[0].colorbar 126 | cbar.set_label('Composite Score', labelpad=15) 127 | ax3.invert_yaxis() 128 | plt.show() 129 | 130 | """Little thing to find best cumulative score and rates that achieved it.""" 131 | hightot = 0 132 | for tup in enumerate(hyper_params["f_score"]): 133 | tot = tup[1] + hyper_params.loc[tup[0]]["s_score"] 134 | if hightot < tot: 135 | hightot = tot 136 | best_scores = (hyper_params.loc[tup[0]]["ents_rate"],hyper_params.loc[tup[0]]["person_rate"]) 137 | best_scores 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /PreProcessing/CustomTFIDF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 4 16:12:21 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | import sklearn 9 | class CustomTFIDF(sklearn.base.TransformerMixin): 10 | 11 | def __init__(self, person_rate = 1, 12 | ents_rate = 1, max_df = 1.0, min_df = 0, date_weight = .1, julian = False, df = False): 13 | self._person_rate = person_rate 14 | self._ents_rate = ents_rate 15 | self._min_df = min_df 16 | self._max_df = max_df 17 | self._date_weight = date_weight 18 | self._julian = julian 19 | self._df = df 20 | 21 | 22 | def fit(self, X, *_): 23 | return self 24 | 25 | 26 | def TF_dict(self, article): 27 | article_tf = {} 28 | for word in article: 29 | if word in article_tf: 30 | article_tf[word] += 1 31 | else: 32 | article_tf[word] = 1 33 | for word in article_tf: 34 | """Manipulate word.startswith() to account for entity weighting.""" 35 | #word.startswith("*") applies to PERSON tags 36 | if word.startswith("*"): 37 | occurences = article_tf[word] 38 | article_tf[word] = (occurences / len(article)) * self._person_rate 39 | #word.startswith("&") applies to NON-PERSON tags 40 | elif word.startswith("&"): 41 | occurences = article_tf[word] 42 | article_tf[word] = (occurences / len(article)) * self._ents_rate 43 | else: 44 | occurences = article_tf[word] 45 | article_tf[word] = (occurences / len(article)) 46 | return article_tf 47 | 48 | 49 | def Count_dict(self): 50 | countDict = {} 51 | for article in self._TF: 52 | found_words = [] 53 | for word in article: 54 | if word in countDict and word not in found_words: 55 | countDict[word] += 1 56 | found_words.append(word) 57 | elif word not in found_words: 58 | countDict[word] = 1 59 | found_words.append(word) 60 | return countDict 61 | 62 | 63 | 64 | 65 | 
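    # Note on the idf formula implemented below: IDF_dict mirrors sklearn's
    # smoothed idf, idf(t) = ln((1 + N) / (1 + df(t))) + 1, where N is the number
    # of documents and df(t) is the number of documents containing term t.
    # For example, with N = 1000 documents, a term appearing in 10 of them gets
    # idf = ln(1001 / 11) + 1 ≈ 5.51, while a term appearing in nearly every
    # document gets an idf close to 1 (down-weighted, but not ignored).
    # Terms failing the min_df / max_df checks below are zeroed out instead.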
def IDF_dict(self, X): 66 | import math 67 | idfDict = {} 68 | for word in self._countDict: 69 | #len(corpus) is 1000, the total number of documents for this project 70 | #countDict[word] is the number of articles the word appears in 71 | """ 72 | From Sci-Kit code: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py 73 | 'smooth_idf: If ``smooth_idf=True`` (the default), the constant "1" is added to the 74 | numerator and denominator of the idf as if an extra document was seen 75 | containing every term in the collection exactly once, which prevents 76 | zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1.' 77 | 78 | 'The effect of adding "1" to 79 | the idf in the equation above is that terms with zero idf, i.e., terms 80 | that occur in all documents in a training set, will not be entirely 81 | ignored.' 82 | 83 | min_df: 'When building the vocabulary ignore terms that have a document 84 | frequency strictly lower than the given threshold. This value is also 85 | called cut-off in the literature.' 86 | 87 | max_df: 'When building the vocabulary ignore terms that have a document 88 | frequency strictly higher than the given threshold (corpus-specific 89 | stop words).' 90 | 91 | norm: (default='l2') Each output row will have unit norm, either: 92 | * 'l2': Sum of squares of vector elements is 1. The cosine 93 | similarity between two vectors is their dot product when l2 norm has 94 | been applied. 95 | """ 96 | #Implements min_df and max_df 97 | if self._countDict[word] > self._min_df and (self._countDict[word] / self._amount) < self._max_df: 98 | idfDict[word] = math.log((1 + self._amount) / (1 + self._countDict[word])) + 1 99 | else: 100 | idfDict[word] = 0 101 | return idfDict 102 | 103 | 104 | def TFIDF_list(self, article): 105 | article_tfidf = {} 106 | for word in article: 107 | #article[word] is the TF score for that word in the given article 108 | article_tfidf[word] = article[word] * self._idfDict[word] 109 | return article_tfidf 110 | 111 | 112 | 113 | def compute_TFIDF_matrix(self, article): 114 | terms = sorted(self._countDict.keys()) 115 | article_matrix = [0.0] * len(terms) 116 | for i, word in enumerate(terms): 117 | #Stores tfidf value of unique word in terms 118 | #if the word is in the article 119 | if word in article: 120 | #article[word] is the word's tfidf score 121 | article_matrix[i] = article[word] 122 | return article_matrix 123 | 124 | 125 | def makeJulian(self,X): 126 | """X must be a df with a "date" column in '%Y-%m-%d' format. 127 | 128 | This takes a while to run. 
129 | """ 130 | import datetime 131 | fmt = '%Y-%m-%d' 132 | 133 | 134 | import julian 135 | 136 | julian_lst = [] 137 | for date in X["date"]: 138 | dt = datetime.datetime.strptime(date,fmt) 139 | julian_lst.append(julian.to_jd(dt + datetime.timedelta(hours=12), fmt = "jd")) 140 | 141 | #Find amount of unique dates 142 | #Set arbitrary value (maybe 1, do hyperparameting testing again) to index of that date 143 | unique = list(set(julian_lst)) 144 | 145 | #Just to have easy access to indexes, ultimately to decide which place in the feature 146 | #matrix corresponds to which date 147 | unique_dict = {} 148 | for place, date in enumerate(unique): 149 | unique_dict[date] = place 150 | 151 | jul_matrix = [] 152 | for place, date in enumerate(julian_lst): 153 | #mini_matrix is the matrix for the individual article 154 | mini_matrix = [0.0] * len(julian_lst) 155 | for num in range(-4,4): 156 | if num == 0: 157 | mini_matrix[unique_dict[date]] = self._date_weight 158 | else: 159 | if (unique_dict[date] + num) > -1: 160 | #Deterioation function as dates get further away from target 161 | #Since dates within a proximity of about 4 seem to indicate some significance in similarity 162 | #Can change, right now it's date_weight divided by absolute value of num 163 | mini_matrix[unique_dict[date] + num] = (self._date_weight / (abs(num)+.5)) 164 | jul_matrix.append(mini_matrix) 165 | return jul_matrix 166 | 167 | 168 | 169 | 170 | def transform(self, X, *_): 171 | self._amount = len(X) 172 | from sklearn import preprocessing 173 | self._TF = [self.TF_dict(article) for article in X] 174 | self._countDict = self.Count_dict() 175 | self._idfDict = self.IDF_dict(X) 176 | self._tfidf = [self.TFIDF_list(article) for article in self._TF] 177 | self._tfidf_matrix = [self.compute_TFIDF_matrix(article) for article in self._tfidf] 178 | self._tfidf_matrix = preprocessing.normalize(self._tfidf_matrix, norm = 'l2') 179 | #Decides whether or not to add date component 180 | if self._julian == True: 181 | import scipy 182 | from scipy.sparse import hstack 183 | self._jul_list = self.makeJulian(self._df) 184 | self._jul_matrix = scipy.sparse.csr_matrix(self._jul_list) 185 | self._combo_matrix = hstack([self._jul_matrix, self._tfidf_matrix]).toarray() 186 | return self._combo_matrix 187 | else: 188 | return self._tfidf_matrix 189 | -------------------------------------------------------------------------------- /PreProcessing/NERTokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 4 14:59:22 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | 9 | 10 | """ 11 | Known errors: 12 | Trump is sometimes tagged as an ORG 13 | U.S. 
is sometimes tagged as a PERSON 14 | """ 15 | import sklearn.base 16 | class NERTokenizer(sklearn.base.TransformerMixin): 17 | """If 'tag' is True, Person entities .startswith("*") and other entities deemed "good" .startswith("&")""" 18 | def __init__(self, tag = False): 19 | self._tag = tag 20 | 21 | def fit(self, X, *_): 22 | return self 23 | 24 | def transform(self, X, *_): 25 | from nltk.corpus import stopwords 26 | from nltk.stem.snowball import SnowballStemmer 27 | stemmer = SnowballStemmer("english") 28 | 29 | import spacy 30 | from spacy.gold import iob_to_biluo 31 | nlp = spacy.load('en_core_web_md', disable=['parser','tagger','textcat']) 32 | from spacy.attrs import ORTH 33 | nlp.tokenizer.add_special_case("I'm", [{ORTH: "I'm"}]) 34 | nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP) 35 | 36 | english_stopwords = stopwords.words('english') 37 | english_stopwords.append("i'm") 38 | 39 | tokenized_corpus = [] 40 | good_ents = ["PERSON","GPE","ORG", "LOC", "EVENT", "FAC"] 41 | continue_tags = ["B-","I-"] 42 | end_tags = ["L-","U-"] 43 | 44 | 45 | 46 | for text in X: 47 | toks = [] 48 | iobs = [i.ent_iob_ for i in nlp(text)] 49 | biluos = list(iob_to_biluo(iobs)) 50 | #Named entities variable 51 | ne = "" 52 | for index, tok in enumerate(nlp(text)): 53 | if biluos[index] in continue_tags and str(tok.ent_type_) in good_ents: 54 | #str(tok).split() != [] Checks if empty token 55 | #For some reason tok.whitespace_ doesn't include double token entities 56 | #like "JENNIFER LAWRENCE" 57 | if not self._tag: 58 | ne += " " + str(tok).lower() 59 | elif self._tag and str(tok).split() != []: 60 | #Entity is the beginning of an entity set 61 | if biluos[index] == "B-": 62 | if str(tok.ent_type_) != "PERSON": 63 | ne += " &" + str(tok).lower() 64 | elif str(tok.ent_type_) == "PERSON": 65 | ne += " *" + str(tok).lower() 66 | else: 67 | if str(tok.ent_type_) != "PERSON": 68 | ne += " " + str(tok).lower() 69 | elif str(tok.ent_type_) == "PERSON": 70 | ne += " " + str(tok).lower() 71 | elif biluos[index] in end_tags and str(tok.ent_type_) in good_ents: 72 | if not self._tag: 73 | ne += " " + str(tok).lower() 74 | toks.append(ne.lstrip()) 75 | ne = " " 76 | elif self._tag and str(tok).split() != []: 77 | #Entity is just a single unit 78 | if biluos[index] == "U-": 79 | if str(tok.ent_type_) != "PERSON": 80 | ne += " &" + str(tok).lower() 81 | toks.append(ne.lstrip()) 82 | ne = " " 83 | elif str(tok.ent_type_) == "PERSON": 84 | ne += " *" + str(tok).lower() 85 | ne.replace("*’m", "") 86 | toks.append(ne.lstrip()) 87 | ne = " " 88 | else: 89 | ne += " " + str(tok).lower() 90 | # so that possesive tags are not stored with the '’s' 91 | ne = ne.replace("’s", "") 92 | toks.append(ne.lstrip()) 93 | ne = " " 94 | #If token is just a boring old word 95 | else: 96 | if not tok.is_punct and not tok.is_space and str(tok).lower() not in english_stopwords: 97 | toks.append(stemmer.stem(str(tok))) 98 | tokenized_corpus.append(toks) 99 | return tokenized_corpus 100 | -------------------------------------------------------------------------------- /PreProcessing/__pycache__/CustomTFIDF.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/PreProcessing/__pycache__/CustomTFIDF.cpython-37.pyc -------------------------------------------------------------------------------- 
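The two transformers above are designed to be chained: `NERTokenizer(tag=True)` marks PERSON entities with a leading `*` and other kept entities with a leading `&`, which is exactly what `CustomTFIDF`'s `person_rate` / `ents_rate` weighting keys off. Below is a minimal usage sketch, not part of the repo: the toy corpus, the cluster count, and the import paths are illustrative assumptions, and the 4 / 1.3 weights are the values quoted in the README.

```python
# Hypothetical usage sketch: chain NERTokenizer and CustomTFIDF, then cluster.
# Assumes it is run from inside PreProcessing/ so both modules import directly.
from sklearn.pipeline import Pipeline
from sklearn.cluster import AgglomerativeClustering

from NERTokenizer import NERTokenizer
from CustomTFIDF import CustomTFIDF

# Toy corpus (illustrative only).
corpus = [
    "The United Nations ban pineapple on pizza, but Bill Gates intends to fight back.",
    "Bill Gates defends his pizza topping stance at a United Nations press event.",
    "The Iowa Caucus results surprised pollsters across the country.",
]

pipeline = Pipeline([
    # tag=True prefixes PERSON tokens with "*" and other kept entities with "&".
    ("tokenize", NERTokenizer(tag=True)),
    # ents_rate / person_rate are the entity weights described in the README.
    ("tfidf", CustomTFIDF(ents_rate=4, person_rate=1.3, julian=False)),
])

tfidf_matrix = pipeline.fit_transform(corpus)

# The normalized rows can be fed to any clusterer, e.g. the HAC model used here.
hac = AgglomerativeClustering(n_clusters=2)
print(hac.fit_predict(tfidf_matrix))
```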
/PreProcessing/__pycache__/NERTokenizer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/PreProcessing/__pycache__/NERTokenizer.cpython-37.pyc -------------------------------------------------------------------------------- /PreProcessing/julian_matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[60]: 5 | import os 6 | import pandas as pd 7 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 8 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 9 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_Classification26.csv") 10 | #Deletes unnecessary columns 11 | df = df.drop(df.columns[:12], axis = 1) 12 | #Sets manageable range for working data set 13 | new_df = df[5000:6000] 14 | 15 | 16 | # In[63]: 17 | 18 | 19 | import datetime 20 | fmt = '%Y-%m-%d' 21 | s = "2016-02-01" 22 | s = datetime.datetime.strptime(s,fmt) 23 | 24 | 25 | # In[64]: 26 | 27 | 28 | new_df["dt"]=False 29 | for i in enumerate(new_df["date"]): 30 | new_df["dt"].iloc[i[0]] = datetime.datetime.strptime(i[1],fmt) 31 | 32 | 33 | # In[65]: 34 | 35 | 36 | import julian 37 | new_df["julian"] = False 38 | for i in enumerate(new_df["dt"]): 39 | jd = julian.to_jd(i[1] + datetime.timedelta(hours=12), fmt = "jd") 40 | new_df["julian"].iloc[i[0]] = jd 41 | 42 | 43 | # In[75]: 44 | 45 | 46 | #Find amount of unique dates 47 | #Set arbitrary value (maybe 1, do hyperparameting testing again) to index of that date 48 | unique = [] 49 | for i in new_df["julian"]: 50 | unique.append(i) 51 | unique = set(unique) 52 | print(len(unique)) 53 | 54 | 55 | # In[80]: 56 | 57 | 58 | unique_dict = {} 59 | for place, date in enumerate(unique): 60 | unique_dict[date] = place 61 | 62 | 63 | 64 | # In[96]: 65 | 66 | 67 | import scipy 68 | jul_matrix = [] 69 | for place, date in enumerate(new_df["julian"]): 70 | mini_matrix = [0.0] * len(new_df) 71 | #Change the .1 value to something appropriate 72 | #Use hyperparemeter testing 73 | mini_matrix[unique_dict[date]] = 0.1 74 | jul_matrix.append(mini_matrix) 75 | 76 | 77 | jul_matrix 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # newsfeed-nlp 2 | 3 | [Link to project poster](Visualizations/NewsfeedArticleClustering.pdf) 4 | 5 | ## Abstract 6 | This unsupervised learning project allows the average news consumer to experience a stream-lined information acquisition process, free of repetition. Working in Python and using a Kaggle dataset (https://www.kaggle.com/snapcrack/all-the-news) of 85,000 news articles, we extract significance from the texts by utilizing a modified TFIDF-Vectorizer to pre-process the data. We experiment with various clustering techniques (Kmeans, HAC, and Birch), paired with various success metrics to gauge effectiveness of the news consolidation. Visualizations are created using Seaborn and Matplotlib, along with D3 for ease of exploration. Spacy is used as the primary NLP. 
7 | 
8 | ## Motivation
9 | In a political climate of intensely polarizing takes on the latest scandals, international relations, and national issues, it can be difficult to make sense of all the data. With the rise of social media, the ability to publish has been democratized and repetition in the newsfeed runs rampant. By grouping news stories together by topic, not only is the newsfeed browsing process streamlined, but the resulting clusters provide differing perspectives on the same story.
10 | 
11 | # Methodology - Data Cleaning
12 | Below is a snippet of the original data (https://www.kaggle.com/snapcrack/all-the-news), with the 'content' column omitted.
13 | 
14 | | id | title | publication | author | date | year | month | url |
15 | |----|-------|-------------|--------|------|------|-------|-----|
16 | | 151908 | Alton Sterling’s son: ’Everyone needs to protest the right way, with peace’ | Guardian | Jessica Glenza | 7/13/16 | 2016 | 7 | https://www.theguardian.com/us-news/2016/jul/13/alton-sterling-son-cameron-protesters-baton-rouge |
17 | | 151909 | Shakespeare’s first four folios sell at auction for almost £2.5m | Guardian | | 5/25/16 | 2016 | 5 | https://www.theguardian.com/culture/2016/may/25/shakespeares-first-four-folios-sell-at-auction-for-almost-25m |
18 | | 151910 | My grandmother’s death saved me from a life of debt | Guardian | Robert Pendry | 10/31/16 | 2016 | 10 | https://www.theguardian.com/commentisfree/2016/oct/31/grandmothers-death-saved-me-life-of-debt |
19 | | 151911 | I feared my life lacked meaning. Cancer pushed me to find some | Guardian | Bradford Frost | 11/26/16 | 2016 | 11 | https://www.theguardian.com/commentisfree/2016/nov/26/cancer-diagnosis-existential-life-accomplishments-meaning |
20 | | 151912 | Texas man serving life sentence innocent of double murder, judge says | Guardian | | 8/20/16 | 2016 | 8 | https://www.theguardian.com/us-news/2016/aug/20/texas-life-sentence-innocence-dna-richard-bryan-kussmaul |
21 | 
22 | 
23 | 
24 | The original dataset was cleansed to keep only the rows that contained both a URL and a date. This cleaning resulted in a working dataframe of 82,920 articles.
25 | 
26 | # Methodology - Analysis
27 | 
28 | ## Data Exploration
29 | We began by exploring the distribution of certain notable factors of the data, such as date and publisher. There are 10 unique publishers in our subset of the data, with an interesting distribution in publishing date:
30 | 
31 | ![Date Distribution](/Visualizations/date_distributions.png)
32 | 
33 | 
34 | For the purposes of our project, a document similarity-based task, it seemed most appropriate to run our clustering tests on a chronologically compact section of the data. Since the texts we are working with are all news stories, multiple distinct stories about the same event are far more likely to appear when the articles are close together in time; the chance of an article about the 2016 Iowa Caucus being published 7 months after it took place is slim.
35 | 
36 | We then [labelled each data point](data/article_classifications.csv) with a succinct event label describing the event it portrays. One such example is found below (with abbreviated content), which yielded the label "daallo_airlines_explosion."
37 | 38 | | event | | content | date | id | month | publication | month | url | year | 39 | |---------------------------|--|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
40 | | daallo_airlines_explosion |  | MOGADISHU, Somalia (AP) — An explosion that blew a hole in a jetliner shortly after takeoff and left one man missing was believed to have been caused by a bomb, the pilot said Wednesday, describing how the crew calmed frightened passengers as smoke enveloped the cabin before he brought the plane back to Mogadishu's airport for an emergency landing. Daallo Airlines said all passengers except one got off the plane safely. It previously... | 2/3/16 | 95647 | 2 | Talking Points Memo | Flight Lands Safely After Suspected Bomb Blew Hole In Plane | https://web.archive.org/web/20160204014156/http://talkingpointsmemo.com/world-news/daallo-airlines-explosion-plane-lands-safely | 2016 |
41 | 
42 | 
43 | This labelling was completed for 1,000 of the articles, to be used for our clustering and success metrics.
44 | 
45 | 
46 | ## Pre-Processing
47 | TFIDF is used as the primary pre-processing method, with some adjustments to account for entity weighting. First, a special tokenizer was created which takes into consideration the entity type of each token. If the token is not an entity, the stem of the token is taken using nltk's [snowball stemmer](http://www.nltk.org/howto/stem.html). Stemming refers to the process of reducing an inflected (or sometimes derived) word to its base form, even if that form is not identical to the word's morphological root.
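For example, here is a minimal sketch of just the stemming step in isolation (the project's tokenizer in [NERTokenizer.py](PreProcessing/NERTokenizer.py) combines this with the entity tagging described next):

```python
# Minimal illustration of snowball stemming on its own, assuming nltk is installed.
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

# Non-entity tokens are reduced to their stems before vectorization.
print([stemmer.stem(w) for w in ["intends", "fights", "pineapple"]])
# -> ['intend', 'fight', 'pineappl']
```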
If the token is a location, organization, or event, an ampersand is inserted in front of the token. If the token is a person, an asterisk is inserted at the start of the token. This was achieved using [spaCy's entity recognition](https://spacy.io/usage/linguistic-features).
48 | 
49 | ```python
50 | sent = ["The United Nations ban pineapple on pizza, but Bill Gates intends to fight back."]
51 | tokens = tokenize_and_stem_NER(sent)
52 | print(tokens)
53 | >>>> ['&the united nations', 'ban', 'pineappl', 'pizza', '*bill gates', 'intend', 'fight']
54 | ```
55 | 
56 | Then, to give extra significance to these entities, their TFIDF scores were manipulated; specifically, when creating the TF_dict in [CustomTFIDF.py](PreProcessing/CustomTFIDF.py), the non-person entity scores were multiplied by a factor of 4, and the person entity scores by a factor of 1.3.
57 | 
58 | ## Clustering
59 | As noted before, the three clustering methods we utilized in this project were Kmeans, HAC, and KNN. Ultimately, after many iterations and logging of [success rates](Success_Rates.md), [HAC (Hierarchical Agglomerative Clustering)](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html) proved to be the most successful method.
60 | 
61 | Below is a graphical, 2D representation of the multidimensional TFIDF matrix and the resulting clusters (as clustered by Kmeans). The annotation next to each data point is the "predominant event", as explained in the Defining Success section below, and the size of each point is directly related to the number of articles in that cluster.
62 | 
63 | ![Article Centers](/Visualizations/Article_Centers.png)
64 | 
65 | 
66 | ## Defining Success
67 | ### [F1 Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
68 | The F1 score is defined as the harmonic mean of precision and recall. Since this is an unsupervised learning project, the definition of "y_true" was not inherently obvious. When looking at a cluster of, say, {'zika_std': 1, 'robert_finicum_shooting': 3}, it is difficult to tell from the labels alone which event is a false positive. In response to this ambiguity, a success algorithm was written to assign as many clusters as possible to one unique, predominant event.
69 | 
70 | This predominant event was found using not only the frequency of events, but also each event's "ratio" relative to the total occurrences of that event across the dataset. First, the predominant event was defined simply as the event that occurred most often within a specific cluster. For example, if *Cluster A* contains {'zika_std': 1, 'robert_finicum_shooting': 3}, the predominant event would be 'robert_finicum_shooting'.
71 | 
72 | Second, the ratio of an event is invoked. Defining predominance merely by the quantity of events results in doubling-up: multiple clusters are assigned the same event. If *Cluster B* yields {'santorum_drops_out': 1, 'robert_finicum_shooting': 2}, it would also receive 'robert_finicum_shooting' as the predominant label. But perhaps the label "santorum_drops_out" is used only 4 times across the whole dataset, while the label "robert_finicum_shooting" is used a total of 10 times. Comparing the ratios of santorum_drops_out (1/4 = .25) and robert_finicum_shooting (2/10 = .2), the label santorum_drops_out is more significant in this particular cluster.
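The sketch below illustrates this frequency-then-ratio selection in simplified form. It is not the exact implementation in [SuccessMetrics.py](SuccessMetrics.py): the function and variable names are illustrative only, and the full algorithm additionally resolves cases where the same event is assigned to multiple clusters.

```python
from collections import Counter

def predominant_event(cluster_events, dataset_totals):
    """Illustrative only: pick a cluster's predominant event by raw count,
    breaking ties with the event's ratio (count in this cluster / count across dataset)."""
    counts = Counter(e for e in cluster_events if e not in ("nan", "useless"))
    if not counts:
        return None
    top_count = max(counts.values())
    tied = [e for e, c in counts.items() if c == top_count]
    # Tie-break: prefer the event whose occurrences are most concentrated in this cluster.
    return max(tied, key=lambda e: counts[e] / dataset_totals[e])

# Hypothetical cluster where both events appear twice:
dataset_totals = {"santorum_drops_out": 4, "robert_finicum_shooting": 10}
cluster_c = ["santorum_drops_out"] * 2 + ["robert_finicum_shooting"] * 2
# Ratios are 2/4 = .5 vs 2/10 = .2, so the rarer event wins the tie.
print(predominant_event(cluster_c, dataset_totals))  # santorum_drops_out
```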
73 | 
74 | If there is a tie between two clusters in the ratio of their predominant event, those clusters are dismissed when calculating the F1 Score. For our purposes, the F1 Score is not intended to be a perfect measure, but merely a gauge by which some improvement can be noticed as we move through different clustering models.
75 | 
76 | ### [Silhouette Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html)
77 | While the F1 Score defines success based on the labels being assigned, the Silhouette Score uses the intrinsic properties of the clusters themselves, with no weight given to the meaning of labels.
78 | 
79 | A value of +1 indicates that a sample is far away from its neighboring cluster and very close to the cluster it is assigned to. Conversely, a value of -1 indicates that the sample is closer to its neighboring cluster than to its assigned cluster, and a value of 0 means the sample lies on the boundary between the two clusters. A value of +1 is ideal and -1 is least preferred; hence, the higher the value, the better the cluster configuration.
80 | 
81 | ## Summary
82 | Our 500-cluster HAC model clusters the 1,000 pre-processed news articles with an *F1-Score of 0.922* and a *Silhouette Score of 0.093*.
83 | 
84 | A question that came up quite frequently was how to define "success" within the project; specifically, how broad we intended our definition of an "event" to be. Were we content with an article about Russian politics being clustered with the Russian Olympic ban? Ultimately, I decided to take a more granular approach to events, and labelled those two example events as "russian_politics" and "russian_olympic_ban" accordingly.
85 | 
86 | Another difficulty in the scope of the project was navigating intersections between broad topics. Is the article "Donald Trump speaks about Hurricane Matthew" about Donald Trump or Hurricane Matthew?
87 | 
88 | Overall, I am extremely pleased with how the project turned out and all that I learned about topic clustering and NLP.
89 | 
--------------------------------------------------------------------------------
/SuccessMetrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Feb 6 11:16:12 2019
5 | 
6 | @author: parkerglenn
7 | """
8 | 
9 | ##############################################################################
10 | ##################SUCCESS RATES###############################################
11 | ##############################################################################
12 | """
13 | Takes as input the clustering model, the predicted cluster assignments (in list form), and the
14 | tfidf_matrix that was clustered.
15 | """ 16 | def success(model, clusters, matrix): 17 | import os 18 | import pandas as pd 19 | 20 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 21 | 22 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 23 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_ClassificationFINAL.csv") 24 | 25 | #Deletes unnecessary columns 26 | df = df.drop(df.columns[:12], axis = 1) 27 | #Sets manageable range for working data set 28 | new_df = df[5000:6000] 29 | #Gets info in list form to be later called in kmeans part 30 | 31 | corpus = [] 32 | for text in new_df['content']: 33 | corpus.append(text) 34 | 35 | titles = [] 36 | for title in new_df["title"]: 37 | titles.append(str(title)) 38 | #labels_df starts at df[5000] so we're good on the matching of labels to content 39 | events = [] 40 | for event in labels_df["Event"][:1000]: 41 | events.append(str(event)) 42 | 43 | 44 | articles = {"title": titles, "date": new_df["date"], "cluster": clusters, "content": new_df["content"], "event": events[:len(clusters)]} 45 | frame = pd.DataFrame(articles, index = [clusters] , columns = ['title', 'date', 'cluster', 'content', "event"]) 46 | 47 | """ 48 | BELOW THIS CREATES DICT OF CLUSTERS AND PREDOMINANT EVENT 49 | 50 | If multiple events occur the same amount of times in a single cluster, 51 | the ratio function is invoked to choose the event holding the most relative 52 | significance. If one ratio is not greater than the others (ex. a cluster 53 | composed of 5 one-off events) then the cluster is disregared (labelled "nan"). 54 | 55 | If the cluster only contains one event, it is assumed at this stage that it is 56 | the main cluster for the event. 57 | 58 | BUGS: 59 | If the cluster contains only "nan" events, it will not show up in y_trueDict 60 | (ex. 
Cluster 113 is not shown, consisting of {'nan': 2} ) 61 | """ 62 | 63 | from collections import Counter 64 | all_events = [] 65 | #This fixes quirk where the same cluster was iterated over multiple times 66 | clusters_we_saw = [] 67 | for cluster in clusters: 68 | if cluster not in clusters_we_saw: 69 | clusters_we_saw.append(cluster) 70 | for event in frame.loc[cluster]["event"].values.tolist(): 71 | if event != "nan" and event != "useless": 72 | all_events.append(event) 73 | event_occurences = dict(Counter(all_events)) 74 | 75 | #Gets number of unique clusters 76 | all_clusters = set(clusters) 77 | 78 | 79 | y_trueDict = {} 80 | for i in range(0,len(all_clusters)): 81 | ratios = [] 82 | counts = [] 83 | ratio = 0 84 | 85 | #Counts occurence per cluster of event 86 | for event in frame.loc[i]["event"].values.tolist(): 87 | if event != "nan" and event != "useless": 88 | counts.append(event) 89 | counts = Counter(counts) 90 | 91 | 92 | if len(counts) > 1: 93 | score_1 = list(counts.most_common()[0])[1] 94 | score_2 = list(counts.most_common()[1])[1] 95 | #Check to see if there are multiple events with same frequency 96 | if score_1 == score_2: 97 | #Gets all events with same frequency 98 | tied_events = [k for k,v in dict(counts).items() if v == score_1] 99 | for event in tied_events: 100 | #Gets the ratio of an occurence for an event in a cluster 101 | #For example, if an event happens only once, it's ratio will be 1 102 | #But if "iowa_caucuses" is used 100 times and only 20 times in a specific cluster, 103 | #its ratio is .2 104 | new_ratio = score_1 / int(event_occurences[event]) 105 | ratios.append(new_ratio) 106 | if new_ratio > ratio: 107 | cluster_event = event 108 | ratio = new_ratio 109 | #If result is an empty list, all ratios are unique. If not, there 110 | #are repititions and the data point is thrown out. 111 | if list(set([x for x in ratios if ratios.count(x) > 1])) != []: 112 | y_trueDict[i] = "nan" 113 | #Dumb try and except sees if ytrueDict[i] is already set to something ("nan") 114 | try: 115 | y_trueDict[i] 116 | except: 117 | counts = dict(counts) 118 | #Makes sure there's still the occurence in cluster attached to the cluster_event 119 | y_trueDict[i] = [cluster_event, counts[cluster_event]] 120 | #If there is one obviously right event, i.e. score_1 != score_2 121 | else: 122 | y_trueDict[i] = list(counts.most_common()[0]) 123 | 124 | #Catches the instance of only one item per cluster, i.e. 
len(counts) !> 1 125 | elif len(counts) == 1: 126 | y_trueDict[i] =list(counts.most_common()[0]) 127 | 128 | 129 | #Re-analyzes y_trueDict, applying ratio again so there's one objectively "right" cluster per event 130 | a = [] 131 | for k ,v in y_trueDict.items(): 132 | a.append(v[0]) 133 | a = dict(Counter(a)) 134 | 135 | #Sees where the same event label is applied to multiple clusters 136 | duplicates = [] 137 | for g in a: 138 | if a[g] > 1 and g != "n" and g != "unknown": 139 | duplicates.append(g) 140 | 141 | 142 | #Creates dup_eventsPLUSratio, where the duplicate events are stored by cluster number 143 | #with their ratio 144 | dup_eventsPLUSratio = {} 145 | for key, value in y_trueDict.items(): 146 | if value[0] in duplicates: 147 | event = value[0] 148 | ratio = int(y_trueDict[key][1]) / int(event_occurences[event]) 149 | eventPLUSratio = [] 150 | eventPLUSratio.append(event) 151 | eventPLUSratio.append(ratio) 152 | dup_eventsPLUSratio[key] = eventPLUSratio 153 | 154 | #Dives into dup_eventsPLUSratio to see what cluster is more approrpiate for event 155 | for duplicate in duplicates: 156 | ratios = [] 157 | for key,value in dup_eventsPLUSratio.items(): 158 | event = value[0] 159 | event_ratio = value[1] 160 | if event == duplicate: 161 | ratios.append(event_ratio) 162 | sort=sorted(ratios,reverse=True) 163 | highest = sort[0] 164 | theGood_one = [duplicate, highest] 165 | for key in dup_eventsPLUSratio: 166 | if event == duplicate: 167 | if value != theGood_one or highest == sort[1]: 168 | y_trueDict[key] = "nan" 169 | #If after all that there's still a tie between the top two ratios, 170 | #(like in hail_caesar_movie where its split 2 and 2 between clusters) 171 | #its given a "nan" label 172 | #COULD BE CHANGED TO FIT A WHILE LOOP THAT THEN FINDS score_2 AND 173 | #RELABELS CLUSTER TO SECOND MOST POPULAR EVENT IF THAT EVENT IS NOT 174 | #ALREADY ASSIGNED A CLUSTER 175 | 176 | #Gets y_true, the correct cluster assignments for each event 177 | bad_labels = ["useless","nan","unkown"] 178 | y_true = [] 179 | for event in events[:1000]: 180 | find = False 181 | for key, value in y_trueDict.items(): 182 | #Used to see if there is a distinct cluster for that event 183 | #FIXED BUG: probably still some duplicates in y_trueDict somehow, bc output len is 10005 184 | #maybe the "unknown" or "useless" stuff? 185 | if value[0] == event and value[0] not in bad_labels: 186 | y_true.append(key) 187 | find = True 188 | if find == False: 189 | #Arbitrary value that's not going to return a match in t score 190 | y_true.append("nan") 191 | 192 | 193 | #Gets y_pred, the cluster where each individual event was actually clustered 194 | y_pred = [cluster_assignment for cluster_assignment in frame["cluster"] ] 195 | 196 | #checks how events actually match up with definitively defined cluster 197 | num = 0 198 | for i in y_true: 199 | if i != "nan": 200 | num += 1 201 | print("Working with " + str(num) + " samples based on a spread of " + str(len(all_clusters)) + " clusters: ") 202 | print() 203 | 204 | #Re-Aligns two lists to only include good values (those not equalling "nan") 205 | filtered_y_true = [] 206 | filtered_y_pred = [] 207 | 208 | for place in range(len(y_true)): 209 | if y_true[place] != "nan": 210 | filtered_y_true.append(y_true[place]) 211 | filtered_y_pred.append(y_pred[place]) 212 | 213 | 214 | 215 | """F1 score is the harmonic average of precision and recall. 
""" 216 | 217 | from sklearn.metrics import f1_score 218 | print("The F1 score for the model is " + str(f1_score(y_true = filtered_y_true, y_pred = filtered_y_pred, average = "micro"))) 219 | print() 220 | #500_no_ngrams F1 score micro: 0.8785046728971962 (also works off the most samples) 221 | #350_3_ngrams F1 score micro: 0.8718861209964412 (but goes off 281 samples rather than 303 in no ngrams) 222 | #700_no_ngrams F1 score micro: 0.8638392857142858 223 | #350_no_ngrams F1 score micro: 0.8576158940397351 224 | #300_3_ngrams F1 score micro: 0.8294573643410853 225 | 226 | """ Silhouette values lies in the range of [-1, 1]. A value of +1 indicates that the sample is far away 227 | from its neighboring cluster and very close to the cluster its assigned. Similarly, value of -1 228 | indicates that the point is close to its neighboring cluster than to the cluster its assigned. 229 | And, a value of 0 means its at the boundary of the distance between the two cluster. Value of +1 230 | is ideal and -1 is least preferred. Hence, higher the value better is the cluster configuration. """ 231 | 232 | from sklearn.metrics import silhouette_score 233 | print("The sillhouette score for the model is " + str(silhouette_score(matrix, y_pred))) 234 | print() 235 | print() 236 | print() 237 | #500_no_ngrams: 0.07096239881264323 238 | #350_no_ngrams: 0.06777628195061947 239 | #700_no_ngrams: 0.06251251395097632 240 | #350_3_ngrams: 0.04969413068018369 241 | #300_3_ngrams: 0.04857286650243616 242 | -------------------------------------------------------------------------------- /Success_Rates.md: -------------------------------------------------------------------------------- 1 | ## Kmeans 2 | #### 500_no_ngrams 3 | * F1 score: 0.8785046728971962 4 | * Sillhouette Score: 0.07096239881264323 5 | 6 | #### 350_no_ngrams 7 | * F1 score: 0.8576158940397351 8 | * Sillhouette Score: 0.06777628195061947 9 | 10 | #### 350_no_ngrams_ENTS 11 | * F1 score: 0.850609756097561 12 | * Sillhouette Score: 0.06327439811090264 13 | 14 | #### 350_3_ngrams 15 | * F1 score: 0.8718861209964412 16 | * Sillhouette Score: 0.04969413068018369 17 | 18 | #### 300_3_ngrams 19 | * F1 score: 0.8294573643410853 20 | * Sillhouette Score: 0.04857286650243616 21 | 22 | #### 700_no_ngrams 23 | * F1 score: 0.8638392857142858 24 | * Sillhouette Score: 0.06251251395097632 25 | 26 | 27 | 28 | ## HAC 29 | 30 | #### 350_euclidean_HAC 31 | * F1 score: 0.8359621451104101 32 | * Sillhouette Score: 0.08998077808781355 33 | 34 | #### 500_euclidean_HAC 35 | * F1 score: 0.8997613365155133 36 | * Sillhouette Score: 0.08817834578438288 37 | 38 | 39 | good_ents = ["PERSON","GPE","ORG", "LOC", "EVENT", "FAC"] 40 | General trend: As the entity weighting increases, the sillhouette score also increases, usually at the expense of F1 score. The intense weighting of entities produces more spread out, tight-knit clusters. 
41 | 42 | 43 | #### 500_euclidean_HAC_ENTS_*6_PERSON_*1.3 44 | * F1 score: 0.8974358974358975 45 | * Sillhouette Score: 0.09440994024020795 46 | 47 | #### 500_euclidean_HAC_ENTS_*4_PERSON_*1.3 48 | * F1 score: 0.9067357512953368 49 | * Sillhouette Score: 0.09050069997225527 50 | 51 | #### 500_euclidean_HAC_ENTS_*5 52 | * F1 score: 0.8691588785046729 53 | * Sillhouette Score: 0.09316827956149414 54 | 55 | #### 500_euclidean_HAC_ENTS_*4 56 | * F1 score: 0.8721461187214612 57 | * Sillhouette Score: 0.09069735742843224 58 | 59 | #### 500_euclidean_HAC_ENTS_*3 60 | * F1 score: 0.8899297423887589 61 | * Sillhouette Score: 0.08761547351995444 62 | 63 | #### 500_euclidean_HAC_ENTS_*2 64 | * F1 score: 0.8792710706150342 65 | * Sillhouette Score: 0.08451414378144817 66 | 67 | #### 350_euclidean_HAC_ENTS_*2 68 | * F1 score: 0.8178807947019867 69 | * Sillhouette Score: 0.08456507985134794 70 | -------------------------------------------------------------------------------- /Visualizations/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/.DS_Store -------------------------------------------------------------------------------- /Visualizations/500_dendogram_hac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/500_dendogram_hac.png -------------------------------------------------------------------------------- /Visualizations/Article_Centers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/Article_Centers.png -------------------------------------------------------------------------------- /Visualizations/NewsfeedArticleClustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/NewsfeedArticleClustering.pdf -------------------------------------------------------------------------------- /Visualizations/cumulative_score_hyperparameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/cumulative_score_hyperparameters.png -------------------------------------------------------------------------------- /Visualizations/date_distributions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/date_distributions.png -------------------------------------------------------------------------------- /Visualizations/distance_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/distance_heatmap.png -------------------------------------------------------------------------------- /Visualizations/f_score_hyperparameters.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/f_score_hyperparameters.png -------------------------------------------------------------------------------- /Visualizations/s_score_hyperparameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/s_score_hyperparameters.png -------------------------------------------------------------------------------- /Visualizations/svd_cluster_centers_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/svd_cluster_centers_example.png -------------------------------------------------------------------------------- /Visualizations/svd_colored_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/svd_colored_clusters.png -------------------------------------------------------------------------------- /clustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Apr 13 15:50:28 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | import os 9 | import pandas as pd 10 | from nltk.stem.snowball import SnowballStemmer 11 | from PreProcessing.NERTokenizer import NERTokenizer 12 | from PreProcessing.CustomTFIDF import CustomTFIDF 13 | from SuccessMetrics import success 14 | 15 | """ 16 | Creating relevant classes 17 | """ 18 | NerTok = NERTokenizer(tag=True) 19 | Vectorizer = CustomTFIDF(ents_rate = 6.368, person_rate = 2.263, julian = False) 20 | stemmer = SnowballStemmer("english") 21 | 22 | """ 23 | Cleaning DF 24 | """ 25 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 26 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 27 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_ClassificationFINAL.csv") 28 | #Deletes unnecessary columns 29 | df = df.drop(df.columns[:12], axis = 1) 30 | #Sets manageable range for working data set 31 | new_df = df[5000:6000] 32 | #Gets info in list form to be later called in kmeans part 33 | corpus = new_df['content'].tolist() 34 | titles = new_df["title"].tolist() 35 | #labels_df starts at df[5000] so we're good on the matching of labels to content 36 | events = labels_df["events"].tolist()[:1000] 37 | links = new_df["url"].tolist() 38 | 39 | """ 40 | Creating matrix 41 | """ 42 | toks = NerTok.transform(corpus) 43 | matrix= Vectorizer.transform(toks) 44 | 45 | """ 46 | Clustering and measuring success. 
47 | """ 48 | ######################################################### 49 | ####################BIRCH################################ 50 | ######################################################### 51 | from sklearn.cluster import Birch 52 | brc = Birch(n_clusters = 520) 53 | brc.fit(matrix) 54 | 55 | y_pred = brc.labels_.tolist() 56 | success(brc, y_pred, matrix) 57 | 58 | 59 | ######################################################### 60 | ####################HAC################################## 61 | ######################################################### 62 | from sklearn.cluster import AgglomerativeClustering 63 | hac = AgglomerativeClustering(n_clusters=520, affinity = "euclidean") 64 | hac.fit(matrix) 65 | #dense_matrix = tfidf_matrix.todense() 66 | 67 | #from sklearn.externals import joblib 68 | #Saves the model you just made 69 | #joblib.dump(hac, '350_euc_HAC_ENTS.pkl') 70 | #hac = joblib.load("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/HAC_Cluster_Models/350_euc_HAC.pkl") 71 | 72 | y_pred = hac.labels_.tolist() 73 | success(hac, y_pred, matrix) 74 | 75 | 76 | ######################################################### 77 | ####################KEMANS############################### 78 | ######################################################### 79 | from sklearn.cluster import KMeans 80 | num_clusters = 520 81 | km = KMeans(n_clusters = num_clusters) 82 | km.fit(matrix) 83 | 84 | y_pred = km.labels_.tolist() 85 | success(km, y_pred, matrix) 86 | 87 | 88 | 89 | 90 | 91 | ######################################################### 92 | ###############KMEANS CLUSTER EXPLORING################## 93 | ######################################################### 94 | def tokenize_and_stem(text): 95 | tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 96 | filtered_tokens = [] 97 | for token in tokens: 98 | if re.search('[a-zA-Z]', token): 99 | filtered_tokens.append(token) 100 | stems = [stemmer.stem(t) for t in filtered_tokens] 101 | return stems 102 | 103 | def tokenize_only(text): 104 | tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 105 | filtered_tokens = [] 106 | for token in tokens: 107 | if re.search('[a-zA-Z]', token): 108 | filtered_tokens.append(token) 109 | return filtered_tokens 110 | 111 | 112 | totalvocab_stemmed = [] 113 | totalvocab_tokenized = [] 114 | for i in corpus: 115 | allwords_stemmed = tokenize_and_stem(i) 116 | totalvocab_stemmed.extend(allwords_stemmed) 117 | allwords_tokenized = tokenize_only(i) 118 | totalvocab_tokenized.extend(allwords_tokenized) 119 | 120 | #Let's you search with stemmed word to see original format of word 121 | vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed) 122 | print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame') 123 | 124 | 125 | articles = {"title": titles, "date": new_df["date"], "cluster": clusters, "content": new_df["content"], "event": events[:1000]} 126 | frame = pd.DataFrame(articles, index = [clusters] , columns = ['title', 'date', 'cluster', 'content', "event"]) 127 | frame['cluster'].value_counts() 128 | 129 | order_centroids = km.cluster_centers_.argsort()[:, ::-1] 130 | 131 | from collections import Counter 132 | #Creates a count dict (success) to see how many instances of the same event are clustered together 133 | for i in clusters[:100]: 134 | print("Cluster %d words:" % i, end='') 135 | for ind in order_centroids[i, :6]: #replace 6 with n words per cluster 136 | 
print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',') 137 | print() 138 | print() 139 | print() 140 | counts = [] 141 | for event in frame.loc[i]["event"].values.tolist(): 142 | counts.append(event) 143 | counts = dict(Counter(counts)) 144 | print(counts) 145 | print() 146 | print() 147 | 148 | 149 | #Allows you to zoom in on a specific cluster, see what words make that cluster unique 150 | for i in clusters: 151 | if i == 244: #Change 2 to the cluster 152 | print("Cluster %d words:" % i, end='') 153 | for ind in order_centroids[i, :5]: #replace 20 with n words per cluster 154 | print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',') 155 | counts = [] 156 | for event in frame.ix[i]["event"].values.tolist(): 157 | counts.append(event) 158 | counts = dict(Counter(counts)) 159 | print(counts) 160 | print() 161 | print() 162 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/data/.DS_Store -------------------------------------------------------------------------------- /exploring_entities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Apr 8 13:27:43 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | 9 | 10 | 11 | """ 12 | WITH NO ENTITY MANIPULATION: 13 | 97.9% of non-PERSON ents occur in at least one other cluster. 14 | 51.5% of PERSON ents occur in at least one other cluster. 15 | 16 | 17 | 18 | WITH ENTITY MANIPULATION: 19 | 96.2% of non-PERSON ents occur in at least one other cluster. 20 | 52.1% of PERSON ents occur in at least one other cluster. 21 | 22 | 23 | Average of 51.832 PERSONs per cluster 24 | Average of 64.94 non-PERSONs per cluster 25 | 26 | 1795 PERSON tags only appear once. 27 | 6.9% of all PERSON tags are not helpful in determining cluster boundaries. 28 | 1683 NON_PERSON tags only appear once. 29 | 5.0% of all NON_PERSON tags are not helpful in determining cluster boundaries. 30 | 31 | Non-People entities occured 6,554 more times than people entities. 32 | 33 | 34 | 35 | "GPE": Countries, cities, states 36 | "ORG": Organizations 37 | "LOC": Non-GPE locations 38 | "EVENT": Named hurricanes, sports events, etc. 39 | "FAC": Buildings, airports, highways, bridges 40 | "PERSON": All people, including fictional 41 | """ 42 | 43 | from collections import Counter 44 | ent_corpus = [] 45 | unique_ents = [] 46 | for article in toks: 47 | article_ents = ([tok for tok in article if tok.startswith("*") == True or tok.startswith("&") == True and tok != "*’m"]) 48 | for tok in article: 49 | if tok not in unique_ents: 50 | unique_ents.append(tok) 51 | ent_corpus.append(article_ents) 52 | 53 | # Creates cluster_ents, a dict where the occurences of ents per cluster are counted. 
54 | cluster_ents = {} 55 | for place, cluster in enumerate(y_pred): 56 | cluster_ents[cluster] = cluster_ents.get(cluster,ent_corpus[place]) + ent_corpus[place] 57 | 58 | 59 | cluster_ents_count = {} 60 | for cluster in cluster_ents: 61 | cluster_ents_count[cluster] = dict(Counter(cluster_ents[cluster])) 62 | 63 | # Creates dict for the amount of times an entity is used across distinct clusters 64 | dup_clusters1 = {} 65 | for base_ent in unique_ents: 66 | for cluster, ents in cluster_ents.items(): 67 | if base_ent in ents: 68 | dup_clusters1[base_ent] = dup_clusters1.get(base_ent, -1) + 1 69 | dup_clusters = {k:v for k,v in dup_clusters1.items() if v != 0} 70 | 71 | 72 | """ 73 | How many one-off person ents/non person ents are there? 74 | 75 | If more one-off person ents, this explains s score. 76 | """ 77 | 78 | ent_occurences = {} 79 | for cluster in cluster_ents_count: 80 | for ent, value in cluster_ents_count[cluster].items(): 81 | try: 82 | ent_occurences[ent] += value 83 | except: 84 | ent_occurences[ent] = value 85 | 86 | 87 | one_off_persons = 0 88 | one_off_notpersons = 0 89 | delp = [] 90 | delnp = [] 91 | for k, v in ent_occurences.items(): 92 | if k.startswith("*") and v == 1: 93 | one_off_persons += 1 94 | delp.append(k) 95 | elif k.startswith("&") and v == 1: 96 | one_off_notpersons += 1 97 | delnp.append(k) 98 | one_off_persons 99 | one_off_notpersons 100 | 101 | 102 | event_ents = {} 103 | for article, toks in enumerate(ent_corpus): 104 | event_ents[events[article]] = event_ents.get(events[article], toks) + toks 105 | for event,toks in event_ents.items(): 106 | event_ents[event] = dict(Counter(toks)) 107 | 108 | 109 | 110 | distribution = pd.DataFrame(columns = ["event","ent","type","ratio"]) 111 | place = -1 112 | for event, counts in event_ents.items(): 113 | for ent, value in counts.items(): 114 | place += 1 115 | if ent.startswith("*") and ent not in delp: 116 | distribution.loc[place] = [event, ent, "PERSON", (value / ent_occurences[ent])] 117 | if ent.startswith("&") and ent not in delnp: 118 | distribution.loc[place] = [event, ent, "NON_PERSON", (value / ent_occurences[ent])] 119 | 120 | 121 | people_dist = distribution.loc[distribution["type"] == "PERSON"] 122 | not_people_dist = distribution.loc[distribution["type"] == "NON_PERSON"] 123 | 124 | 125 | 126 | pval = 0 127 | for value in people_dist["ratio"]: 128 | if value > .7: 129 | pval +=1 130 | 131 | npval = 0 132 | for value in not_people_dist["ratio"]: 133 | if value > .7: 134 | npval +=1 135 | 136 | 137 | 138 | 139 | 140 | import matplotlib.pyplot as plt 141 | import seaborn as sns 142 | fig, (ax1,ax2) = plt.subplots(ncols = 2) 143 | fig.subplots_adjust(wspace = 0.01) 144 | sns.heatmap(distribution, cmap = "rocket", ax = ax, cbar = False) 145 | 146 | sns.heatmap(distribution.loc[distribution["type"] == "PERSON"]) 147 | 148 | 149 | 150 | 151 | 152 | 153 | """ 154 | Entity Usage Across Distinct Clusters 155 | 156 | """ 157 | ents = pd.DataFrame(columns = ["ent","occurence"]) 158 | people = pd.DataFrame(columns = ["ent","occurence"]) 159 | p = [] 160 | n = [] 161 | p1 = [] 162 | n1 = [] 163 | for k in dup_clusters: 164 | if k.startswith("&"): 165 | n.append(k) 166 | n1.append(dup_clusters[k]) 167 | elif k.startswith("*"): 168 | p.append(k) 169 | p1.append(dup_clusters[k]) 170 | 171 | 172 | ents["ent"] = n 173 | ents["occurence"] = n1 174 | ents["type"] = "ent" 175 | people["ent"] = p 176 | people["occurence"] = p1 177 | people["type"] = "person" 178 | ents = ents.sort_values(by = "occurence", ascending = 
False) 179 | people = people.sort_values(by = "occurence", ascending = False) 180 | 181 | 182 | people_2plt = people[1:6] 183 | ents_2plt = ents[:5] 184 | fig, ax = plt.subplots(1,2, sharey = True, figsize = (15,8)) 185 | fig.suptitle("Entity Usage Across Distinct Clusters", fontsize=14) 186 | sns.set(style = "darkgrid") 187 | sns.barplot(x = "ent", y = "occurence", hue = "type", data = people_2plt, ax = ax[0], palette=["C0"]) 188 | sns.barplot(x = "ent", y = "occurence", hue = "type", data = ents_2plt,ax = ax[1], palette=["C1"]) 189 | for a in ax: 190 | a.set_xlabel('Entity') 191 | a.set_ylabel('Occurences') 192 | fig.show() 193 | 194 | 195 | ents = pd.DataFrame(columns = ["ent","occurence"]) 196 | people = pd.DataFrame(columns = ["ent","occurence"]) 197 | p = [] 198 | n = [] 199 | p1 = [] 200 | n1 = [] 201 | for k, v in ent_occurences.items(): 202 | if k.startswith("&"): 203 | n.append(k) 204 | n1.append(v) 205 | elif k.startswith("*"): 206 | p.append(k) 207 | p1.append(v) 208 | ents["ent"] = n 209 | ents["occurence"] = n1 210 | ents["type"] = "Non-Person" 211 | people["ent"] = p 212 | people["occurence"] = p1 213 | people["type"] = "Person" 214 | ents = ents.sort_values(by = "occurence", ascending = False) 215 | people = people.sort_values(by = "occurence", ascending = False) 216 | 217 | 218 | from matplotlib.colors import ListedColormap 219 | 220 | color1 = ["#13b23b"] 221 | color2 = ["#ffa100"] 222 | people_2plt = people[:5] 223 | ents_2plt = ents[:5] 224 | fig, ax = plt.subplots(1,2, sharey = True, figsize = (15,8)) 225 | #fig.suptitle("Entity Usage Across Articles", fontsize=25) 226 | sns.set(style = "darkgrid", font_scale = 1) 227 | one = sns.barplot(x = "ent", y = "occurence", hue = "type", data = people_2plt, ax = ax[0], palette=color1) 228 | two = sns.barplot(x = "ent", y = "occurence", hue = "type", data = ents_2plt,ax = ax[1], palette=color2) 229 | for item in one.get_xticklabels(): 230 | item.set_rotation(60) 231 | for item in two.get_xticklabels(): 232 | item.set_rotation(60) 233 | for a in ax: 234 | a.set_xlabel('Entity') 235 | a.set_ylabel('Occurences') 236 | fig.show() 237 | 238 | 239 | 240 | 241 | person = 0 242 | not_person = 0 243 | for ent, value in dup_clusters.items(): 244 | if ent.startswith("&"): 245 | not_person += value 246 | elif ent.startswith("*"): 247 | person += value 248 | print("Overall, non-person entities occured across distinct clusters {} more times than person entities did.".format(not_person - person)) 249 | 250 | person 251 | not_person 252 | 253 | 254 | 255 | 256 | all_people = 0 257 | all_not_people = 0 258 | for ent, value in ent_occurences.items(): 259 | if ent.startswith("&"): 260 | all_not_people += value 261 | if ent.startswith("*"): 262 | all_people += value 263 | print("Non-People entities occured {} more times than people entities.".format(all_not_people - all_people)) 264 | 265 | 266 | all_people 267 | all_not_people 268 | 269 | x = 0 270 | y = 0 271 | for ent in unique_ents: 272 | if ent.startswith("&"): 273 | x += 1 274 | if ent.startswith("*"): 275 | y += 1 276 | 277 | person / y 278 | not_person / x 279 | 280 | all_people / 500 281 | all_not_people / 500 282 | 283 | 1710 / all_people 284 | 1772 / all_not_people 285 | 286 | all_people 287 | all_not_people - all_people 288 | 289 | 290 | --------------------------------------------------------------------------------