├── .DS_Store ├── Articles_Dataframe_Formatting.py ├── Creating_Visualizations.py ├── Hyperparameter_Testing ├── hyperparameter_functions.py └── hyperparameter_testing.py ├── PreProcessing ├── CustomTFIDF.py ├── NERTokenizer.py ├── __pycache__ │ ├── CustomTFIDF.cpython-37.pyc │ └── NERTokenizer.cpython-37.pyc └── julian_matrix.py ├── README.md ├── SuccessMetrics.py ├── Success_Rates.md ├── Visualizations ├── .DS_Store ├── 500_dendogram_hac.png ├── Article_Centers.png ├── NewsfeedArticleClustering.pdf ├── cumulative_score_hyperparameters.png ├── date_distributions.png ├── distance_heatmap.png ├── f_score_hyperparameters.png ├── s_score_hyperparameters.png ├── svd_cluster_centers_example.png └── svd_colored_clusters.png ├── clustering.py ├── data ├── .DS_Store └── article_classifications.csv └── exploring_entities.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/.DS_Store -------------------------------------------------------------------------------- /Articles_Dataframe_Formatting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import glob 4 | import csv 5 | """ 6 | Takes as input all original Kaggle CSVs, and filters the concatenated_df to only where the date and url are present. 7 | """ 8 | 9 | path = "/Users/parkerglenn/Desktop/DataScienceSets" 10 | 11 | 12 | list_ = [] 13 | all_files = glob.glob(path + "/*.csv") 14 | frame = pd.DataFrame() 15 | 16 | 17 | df_from_each_file = (pd.read_csv(f) for f in all_files) 18 | concatenated_df = pd.concat(df_from_each_file, ignore_index = True) 19 | 20 | dates = concatenated_df['date'] 21 | 22 | url = concatenated_df['url'] 23 | 24 | df1 = concatenated_df[~concatenated_df['date'].isna()] 25 | df = df1[~df1['url'].isna()] 26 | df = df.reset_index() 27 | df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True) 28 | 29 | df = df.sort_values(by=['date']) 30 | 31 | df.to_csv("all_good_articles.csv") 32 | -------------------------------------------------------------------------------- /Creating_Visualizations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Mar 19 10:13:38 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | from sklearn.decomposition import PCA 9 | import matplotlib.pylab as plt 10 | import seaborn as sns 11 | from sklearn.decomposition import TruncatedSVD 12 | from matplotlib import pyplot 13 | import mpld3 14 | from mpld3 import display 15 | from sklearn.metrics.pairwise import cosine_similarity 16 | from sklearn.cluster import KMeans 17 | import pylab 18 | 19 | 20 | ############################################################################### 21 | ##################VISUALIZATION################################################ 22 | ############################################################################### 23 | 24 | #Date Distribution 25 | out = pd.cut(df["joined"], bins = [20000000,20020000,20040000,20060000,20080000,20100000,20120000,20140000,20160000,20180000], include_lowest = True) 26 | ax = out.value_counts(sort = False).plot.bar(rot = 0, color = "b", figsize = (20,10)) 27 | ax.set_ylim(bottom = 0, top = 450) 28 | ax.set_xticklabels(x for x in ["2000 to 2002","2002 to 2004","2004 to 2006","2006 to 2008","2008 to 2010","2010 to 2012","2012 to 2014","2014 to 2016","2016 to 
2018"]) 29 | plt.xlabel("Date Range", fontsize = 18) 30 | plt.ylabel("Frequency", fontsize = 18) 31 | plt.title("Date Distribution", fontsize = 25) 32 | for i in ax.patches: 33 | ax.text(i.get_x() + .10, 5, str(i.get_height()), fontsize = 20, color = "black") 34 | 35 | 36 | #Distance Heatmap 37 | dist = 1 - cosine_similarity(matrix) 38 | 39 | cmap = pyplot.cm.cubehelix 40 | dimensions = (20,20) 41 | fig, ax = pyplot.subplots(figsize=dimensions) 42 | sns.heatmap(dist, vmin = 0, vmax = 1, cmap = cmap).set_title("Tfidf Distances Between Articles", fontsize = 15) 43 | 44 | """ 45 | Notice hot spot around the 625:630 line. 46 | Those article titles: 47 | ['Dem Debate Blogging #1', 48 | 'Dem Debate Blogging #2', 49 | 'Dem Debate Blogging #3', 50 | 'Dem Debate Blogging #4', 51 | 'Dem Debate Blogging #5', 52 | 'Dem Debate Blogging #6'] 53 | 54 | 55 | Around 90:160: large circle of relatively similar articles. 56 | Reason: Iowa Caucuses. 57 | """ 58 | 59 | 60 | #Tfidf Matrix, in 2D SVD scatterplot 61 | svd = TruncatedSVD(n_components=2).fit(matrix) 62 | data2D = svd.transform(matrix) 63 | plt.title("Truncated SVD, 2 Components") 64 | colors = rng.rand(1000) 65 | plt.scatter(data2D[:,0], data2D[:,1], marker = "o", c = colors, cmap = "YlGnBu", s = 10) 66 | 67 | 68 | ######With clusters assigned as colors######## 69 | data2D = svd.transform(matrix) 70 | kmeans = KMeans(n_clusters = 520) 71 | kmeans.fit(data2D) 72 | y_kmeans = kmeans.predict(data2D) 73 | y_pred = kmeans.labels_.tolist() 74 | 75 | success(kmeans,y_pred,matrix) 76 | 77 | articles = {"title": titles, "date": new_df["date"], "cluster": y_pred, "content": new_df["content"], "event": events[:1000]} 78 | frame = pd.DataFrame(articles, index = [y_pred] , columns = ['title', 'date', 'cluster', 'content', "event"]) 79 | frame['cluster'].value_counts() 80 | 81 | 82 | """Creates scalable points for cluster centers found within y_true. 83 | The size of the center is dependent on how many events occur withing that specific cluster. 
""" 84 | 85 | centers = kmeans.cluster_centers_ 86 | fig, ax = plt.subplots(figsize = (14,8)) 87 | np.random.seed(0) 88 | threshold = 4 89 | for cluster, center in enumerate(centers): 90 | cluster+=1 91 | # Only maps cluster if it has more than "threshold" events in it 92 | if cluster in y_true and len(frame.loc[cluster]["event"].values.tolist()) > threshold: 93 | #Gets event name that the cluster center represents 94 | event = events[y_true.index(cluster)] 95 | #s scaled based on number of events in cluster 96 | ax.plot(center[0], center[1], markersize = float(len(frame.loc[cluster]["event"].values.tolist())), marker = "o"); 97 | plt.annotate(event, (center[0],center[1])) 98 | ax.set_title('Cluster Centers with Predominant Event', size=14) 99 | plt.show() 100 | 101 | mpld3.show(fig) 102 | 103 | # mpld3.save_html(fig, "Cluster_Centers.html") 104 | 105 | 106 | 107 | """Zoomed out plot with colors""" 108 | plt.title("Truncated SVD with Colored Clusters") 109 | plt.scatter(data2D[:, 0], data2D[:, 1], c=y_kmeans, cmap = "tab20", s=30) 110 | """Example plot, zoomed in to visualize cluster centers""" 111 | 112 | 113 | 114 | 115 | fig, ax = plt.subplots(figsize = (14,8)) 116 | np.random.seed(0) 117 | ax.plot(data2D[:, 0], data2D[:, 1], 118 | 'or', ms=10, alpha=0.2) 119 | ax.set_title('Truncated SVD with Cluster Assignments', size=14) 120 | ax.grid(color='lightgray', alpha=0.7) 121 | for i, txt in enumerate(events): 122 | print(i) 123 | plt.annotate(txt + ", " + str(y_pred[i]), (data2D[:, 0][i], data2D[:, 1][i])) 124 | mpld3.show(fig) 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | centers = kmeans.cluster_centers_ 133 | plt.scatter(centers[:, 0], centers[:, 1], c='black', s=30, alpha=0.5, marker = 'x'); 134 | pyplot.axis(ymax = 0, ymin =-.25 , xmax = .2, xmin = .14) 135 | plt.title("Truncated SVD with Cluster Centers") 136 | pyplot.axis(ymax = 0, ymin =-.25 , xmax = .2, xmin = .14) 137 | plt.scatter(data2D[:, 0], data2D[:, 1], c=y_kmeans, cmap = "tab20", s=30) 138 | 139 | 140 | 141 | 142 | ######Interactive ScatterPlot with SVD###### 143 | svd = TruncatedSVD(n_components=2).fit(matrix) 144 | data2D = svd.transform(matrix) 145 | 146 | fig, ax = plt.subplots(figsize = (14,8)) 147 | np.random.seed(0) 148 | ax.plot(data2D[:, 0], data2D[:, 1], 149 | 'or', ms=10, alpha=0.2) 150 | ax.set_title('Truncated SVD with Cluster Assignments', size=14) 151 | ax.grid(color='lightgray', alpha=0.7) 152 | for i, txt in enumerate(events): 153 | print(i) 154 | plt.annotate(txt + ", " + str(y_pred[i]), (data2D[:, 0][i], data2D[:, 1][i])) 155 | mpld3.show(fig) 156 | #mpld3.save_html(fig, "Truncated_SVD_D3.html") 157 | 158 | 159 | 160 | 161 | #####Interactive ScatterPlot with Dense Matrix and PCA###### 162 | """Probably not the method to use. 
SVD seems better since it takes the sparse matrix "tfidf_matrix" directly.""" 163 | x = tfidf_matrix.todense() 164 | coords = PCA(n_components=2).fit_transform(x) 165 | fig, ax = plt.subplots(figsize = (14,8)) 166 | 167 | np.random.seed(0) 168 | ax.plot(coords[:, 0], coords[:, 1], 169 | 'or', ms=10, alpha=0.2) 170 | ax.set_title('PCA with Cluster Assignments', size=14) 171 | ax.grid(color='lightgray', alpha=0.7) 172 | for i, txt in enumerate(events): 173 | plt.annotate(txt, (coords[:, 0][i], coords[:, 1][i])) 174 | mpld3.show(fig) 175 | 176 | 177 | 178 | """Dendogram Making""" 179 | fig = pylab.figure(figsize=(100,70)) 180 | children = hac.children_ 181 | distance = np.arange(children.shape[0]) 182 | no_of_observations = np.arange(2, children.shape[0]+2) 183 | linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float) 184 | dendrogram(linkage_matrix, labels = (events), truncate_mode = "level", leaf_font_size = 8) 185 | fig.show 186 | 187 | 188 | 189 | 190 | """ScatterPlot""" 191 | coords = PCA(n_components=2).fit_transform(dense_matrix) 192 | fig, ax = plt.subplots(figsize = (14,8)) 193 | np.random.seed(0) 194 | ax.plot(coords[:, 0], coords[:, 1], 195 | 'or', ms=10, alpha=0.2) 196 | ax.set_title('Truncated SVD with Cluster Assignments', size=14) 197 | ax.grid(color='lightgray', alpha=0.7) 198 | for i, txt in enumerate(events): 199 | print(i) 200 | plt.annotate(txt + ", " + str(y_pred[i]), (coords[:, 0][i], coords[:, 1][i])) 201 | mpld3.show(fig) 202 | 203 | 204 | 205 | ############################################################# 206 | ################Hyperparameter Testing####################### 207 | ############################################################# 208 | 209 | 210 | hyper_params = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/Hyperparam_testing/Hyper_Params.csv") 211 | hyper_params = hyper_params.reset_index() 212 | 213 | f_scores = [x for x in hyper_params["f_score"]] 214 | s_scores = [x for x in hyper_params["s_score"]] 215 | person_rate = [x for x in hyper_params["person_rate"]] 216 | ents_rate = [x for x in hyper_params["ents_rate"]] 217 | 218 | 219 | 220 | """F1 Score distribution across hyperparameters 221 | Best F1 Score: ents_rate = 1.63, person_rate = 2.57, f_score = .927 on 412 articles. BUT s_score is a measly .0788. 222 | """ 223 | f_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.f_score}) 224 | f_data_pivoted = f_data.pivot("Person Rate","Entity Rate","Z") 225 | ax1 = sns.heatmap(f_data_pivoted, cmap = plt.cm.hot) 226 | cbar = ax1.collections[0].colorbar 227 | cbar.set_label('F1 Score', labelpad=15) 228 | ax1.invert_yaxis() 229 | plt.show() 230 | 231 | """Silhouette Score distribution across hyperparameters 232 | More direct correlation here than in F1 Score. Most notably, as the rates increase, so does S Score. This is due to 233 | the fact that the rating of words in articles becomes more radical; articles that are different from each other, i.e. share no 234 | entities, are now much more distant than before. Even more specifically, when the Entity Rate and Person Rate are dissimilar, the 235 | S Score is the highest. 
Again, the same radical rating phenomena: By limiting the amount of weighted features, those that are weighted 236 | make the article more of an outlier than before.""" 237 | 238 | s_data = pd.DataFrame({"Person Weighting": [round(x,2) for x in person_rate], "Non-Person Weighting":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score}) 239 | s_data_pivoted = s_data.pivot("Person Weighting","Non-Person Weighting","Z") 240 | ax2 = sns.heatmap(s_data_pivoted, cmap = plt.cm.hot) 241 | cbar = ax2.collections[0].colorbar 242 | cbar.set_label('Silhouette Score', labelpad=15) 243 | ax2.invert_yaxis() 244 | plt.show() 245 | 246 | 247 | 248 | """Cumulative Total 249 | Highest: ents_rate = 6.368, person_rate = 2.263, f_score = 0.922, s_score = 0.093 250 | 251 | """ 252 | sf_data = pd.DataFrame({"Person Weighting": [round(x,2) for x in person_rate], "Entity Weighting":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score + hyper_params.f_score}) 253 | sf_data_pivoted = sf_data.pivot("Person Weighting","Entity Weighting","Z") 254 | ax3 = sns.heatmap(sf_data_pivoted, cmap = plt.cm.hot) 255 | cbar = ax3.collections[0].colorbar 256 | cbar.set_label('Composite Score', labelpad=15) 257 | ax3.invert_yaxis() 258 | plt.show() 259 | 260 | """Little thing to find best cumulative score and rates that achieved it.""" 261 | hightot = 0 262 | for tup in enumerate(hyper_params["f_score"]): 263 | tot = tup[1] + hyper_params.loc[tup[0]]["s_score"] 264 | if hightot < tot: 265 | hightot = tot 266 | best_scores = (hyper_params.loc[tup[0]]["ents_rate"],hyper_params.loc[tup[0]]["person_rate"]) 267 | best_scores 268 | -------------------------------------------------------------------------------- /Hyperparameter_Testing/hyperparameter_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Feb 22 15:31:02 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | 9 | def tokenize_and_stem_NER(corpus): 10 | global tokenized_corpus 11 | tokenized_corpus = [] 12 | good_ents = ["PERSON", "GPE","ORG", "LOC", "EVENT", "FAC"] 13 | continue_tags = ["B-","I-"] 14 | end_tags = ["L-","U-"] 15 | for text in corpus: 16 | toks = [] 17 | iobs = [i.ent_iob_ for i in nlp(text)] 18 | biluos = list(iob_to_biluo(iobs)) 19 | index = -1 20 | #Named entities variable 21 | ne = "" 22 | for tok in nlp(text): 23 | index += 1 24 | if biluos[index] in continue_tags and str(tok.ent_type_) in good_ents: 25 | #Checks if empty token 26 | #For some reason tok.whitespace_ doesn't include double token entities 27 | #like "JENNIFER LAWRENCE" 28 | if str(tok).split() != [] and str(tok.ent_type_) != "PERSON": 29 | ne += " " + str(tok).upper() 30 | elif str(tok).split() != [] and str(tok.ent_type_) == "PERSON": 31 | ne += " " +str(tok).title() 32 | elif biluos[index] in end_tags and str(tok.ent_type_) in good_ents: 33 | if str(tok).split() != [] and str(tok.ent_type_) != "PERSON": 34 | ne += " " + str(tok).upper() 35 | toks.append(ne.lstrip()) 36 | ne = " " 37 | elif str(tok).split() != [] and str(tok.ent_type_) == "PERSON": 38 | ne += " " + str(tok).title() 39 | ne = ne.replace("’S", "") 40 | toks.append(ne.lstrip()) 41 | ne = " " 42 | ne = " " 43 | #If token is just a boring old word 44 | else: 45 | if tok.is_punct == False and str(tok).lower() not in list(stop_words.ENGLISH_STOP_WORDS): 46 | toks.append(stemmer.stem(str(tok))) 47 | tokenized_corpus.append(toks) 48 | 49 | 50 | 51 | def do_tfidf(ents_rate, person_rate): 52 | 53 | def 
TF_dict(article): 54 | article_tf = {} 55 | for word in article: 56 | if word in article_tf: 57 | article_tf[word] += 1 58 | else: 59 | article_tf[word] = 1 60 | for word in article_tf: 61 | """Manipulate word.isupper() to account for entity weighting.""" 62 | if word.isupper(): 63 | occurences = article_tf[word] 64 | article_tf[word] = (occurences / len(article)) * ents_rate 65 | #word.istitle() applies to PERSON tags 66 | elif word.istitle(): 67 | occurences = article_tf[word] 68 | article_tf[word] = (occurences / len(article)) * person_rate 69 | else: 70 | occurences = article_tf[word] 71 | article_tf[word] = (occurences / len(article)) 72 | return article_tf 73 | 74 | TF = [TF_dict(article) for article in tokenized_corpus] 75 | 76 | 77 | 78 | def Count_dict(): 79 | countDict = {} 80 | for article in TF: 81 | found_words = [] 82 | for word in article: 83 | if word in countDict and word not in found_words: 84 | countDict[word] += 1 85 | found_words.append(word) 86 | elif word not in found_words: 87 | countDict[word] = 1 88 | found_words.append(word) 89 | return countDict 90 | 91 | countDict = Count_dict() 92 | 93 | import operator 94 | sortCount = sorted(countDict.items(), key=operator.itemgetter(1), reverse = True) 95 | 96 | 97 | def IDF_dict(): 98 | import math 99 | idfDict = {} 100 | for word in countDict: 101 | #len(corpus) is 1000, the total number of documents 102 | #countDict[word] is the number of articles the word appears in 103 | """ 104 | From Sci-Kit code: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py 105 | 'smooth_idf: If ``smooth_idf=True`` (the default), the constant "1" is added to the 106 | numerator and denominator of the idf as if an extra document was seen 107 | containing every term in the collection exactly once, which prevents 108 | zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1.' 109 | 110 | 'The effect of adding "1" to 111 | the idf in the equation above is that terms with zero idf, i.e., terms 112 | that occur in all documents in a training set, will not be entirely 113 | ignored.' 114 | 115 | min_df: 'When building the vocabulary ignore terms that have a document 116 | frequency strictly lower than the given threshold. This value is also 117 | called cut-off in the literature.' 118 | 119 | max_df: 'When building the vocabulary ignore terms that have a document 120 | frequency strictly higher than the given threshold (corpus-specific 121 | stop words).' 122 | 123 | norm: (default='l2') Each output row will have unit norm, either: 124 | * 'l2': Sum of squares of vector elements is 1. The cosine 125 | similarity between two vectors is their dot product when l2 norm has 126 | been applied. 
127 | """ 128 | #Implements min_df and max_df 129 | min_df = 0 130 | max_df = 1.0 131 | if countDict[word] > min_df and (countDict[word] / len(corpus)) < max_df: 132 | idfDict[word] = math.log((1 + len(corpus)) / (1 + countDict[word])) + 1 133 | else: 134 | idfDict[word] = 0 135 | return idfDict 136 | 137 | idfDict = IDF_dict() 138 | 139 | 140 | 141 | def TFIDF_list(article): 142 | article_tfidf = {} 143 | for word in article: 144 | #article[word] is the TF score for that word in the given article 145 | article_tfidf[word] = article[word] * idfDict[word] 146 | return article_tfidf 147 | 148 | 149 | 150 | tfidf = [TFIDF_list(article) for article in TF] 151 | 152 | 153 | 154 | 155 | from sklearn import preprocessing 156 | terms = sorted(countDict.keys()) 157 | def compute_TFIDF_matrix(article): 158 | article_matrix = [0.0] * len(terms) 159 | for i, word in enumerate(terms): 160 | #Stores tfidf value of unique word in terms 161 | #if the word is in the article 162 | if word in article: 163 | #article[word] is the word's tfidf score 164 | article_matrix[i] = article[word] 165 | return article_matrix 166 | 167 | 168 | 169 | tfidf_matrix = [compute_TFIDF_matrix(article) for article in tfidf] 170 | 171 | #Normalized with the default l2 setting 172 | tfidf_matrix = preprocessing.normalize(tfidf_matrix, norm = 'l2') 173 | 174 | return tfidf_matrix 175 | 176 | ################################################################ 177 | ####################HAC######################################### 178 | ################################################################ 179 | 180 | def HAC(matrix): 181 | from sklearn.cluster import AgglomerativeClustering 182 | from scipy.cluster.hierarchy import dendrogram 183 | hac = AgglomerativeClustering(n_clusters=500, affinity = "euclidean") 184 | hac.fit_predict(tfidf_matrix) 185 | y_pred = list(hac.labels_) 186 | success(hac, y_pred, tfidf_matrix) 187 | 188 | def success(model, clusters, matrix): 189 | 190 | import os 191 | import pandas as pd 192 | import codecs 193 | 194 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 195 | 196 | data = codecs.open('/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv', encoding = 'utf-8') 197 | data_with_labels = codecs.open("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_Classification26.csv") 198 | 199 | df = pd.read_csv(data) 200 | labels_df= pd.read_csv(data_with_labels) 201 | 202 | #Deletes unnecessary columns 203 | df = df.drop(df.columns[:12], axis = 1) 204 | #Sets manageable range for working data set 205 | new_df = df[5000:6000] 206 | #Gets info in list form to be later called in kmeans part 207 | 208 | corpus = [] 209 | for text in new_df['content']: 210 | corpus.append(text) 211 | 212 | titles = [] 213 | for title in new_df["title"]: 214 | titles.append(str(title)) 215 | #labels_df starts at df[5000] so we're good on the matching of labels to content 216 | events = [] 217 | for event in labels_df["Event"][:1000]: 218 | events.append(str(event)) 219 | 220 | 221 | articles = {"title": titles, "date": new_df["date"], "cluster": clusters, "content": new_df["content"], "event": events[:1000]} 222 | frame = pd.DataFrame(articles, index = [clusters] , columns = ['title', 'date', 'cluster', 'content', "event"]) 223 | frame['cluster'].value_counts() 224 | 225 | """ 226 | BELOW THIS CREATES DICT OF CLUSTERS AND PREDOMINANT EVENT 227 | 228 | If multiple events occur the same amount of times in a single cluster, 229 | the ratio function is 
invoked to choose the event holding the most relative 230 | significance. If one ratio is not greater than the others (ex. a cluster 231 | composed of 5 one-off events) then the cluster is disregared (labelled "nan"). 232 | 233 | If the cluster only contains one event, it is assumed at this stage that it is 234 | the main cluster for the event. 235 | 236 | BUGS: 237 | If the cluster contains only "nan" events, it will not show up in y_trueDict 238 | (ex. Cluster 113 is not shown, consisting of {'nan': 2} ) 239 | """ 240 | from collections import Counter 241 | all_events = [] 242 | #This fixes quirk where the same cluster was iterated over multiple times 243 | clusters_we_saw = [] 244 | for cluster in clusters: 245 | if cluster not in clusters_we_saw: 246 | clusters_we_saw.append(cluster) 247 | for event in frame.loc[cluster]["event"].values.tolist(): 248 | if event != "nan" and event != "useless": 249 | all_events.append(event) 250 | event_occurences = dict(Counter(all_events)) 251 | 252 | all_clusters = [] 253 | for cluster in clusters: 254 | if cluster not in all_clusters: 255 | all_clusters.append(cluster) 256 | 257 | y_trueDict = {} 258 | #This range needs to be changed depending on the cluster model 259 | for i in range(0,len(all_clusters)): 260 | ratios = [] 261 | counts = [] 262 | cluster_event = [] 263 | ratio = 0 264 | 265 | #Counts occurence per cluster of event 266 | for event in frame.loc[i]["event"].values.tolist(): 267 | if event != "nan" and event != "useless": 268 | counts.append(event) 269 | counts = Counter(counts) 270 | 271 | 272 | if len(counts) > 1: 273 | score_1 = list(counts.most_common()[0])[1] 274 | score_2 = list(counts.most_common()[1])[1] 275 | #Check to see if there are multiple events with same frequency 276 | if score_1 == score_2: 277 | #Gets all events with same frequency 278 | tied_events = [k for k,v in dict(counts).items() if v == score_1] 279 | for event in tied_events: 280 | #Gets the ratio of an occurence for an event in a cluster 281 | #For example, if an event happens only once, it's ratio will be 1 282 | #But if "iowa_caucuses" is used 100 times and only 20 times in a specific cluster, 283 | #its ratio is .2 284 | new_ratio = score_1 / int(event_occurences[event]) 285 | ratios.append(new_ratio) 286 | if new_ratio > ratio: 287 | cluster_event = event 288 | ratio = new_ratio 289 | #If result is an empty list, all ratios are unique. If not, there 290 | #are repititions and the data point is thrown out. 291 | if list(set([x for x in ratios if ratios.count(x) > 1])) != []: 292 | y_trueDict[i] = "nan" 293 | break 294 | 295 | #Dumb try and except sees if ytrueDict[i] is already set to something ("nan") 296 | try: 297 | y_trueDict[i] 298 | except: 299 | counts = dict(counts) 300 | #Makes sure there's still the occurence in cluster attached to the cluster_event 301 | y_trueDict[i] = [cluster_event, counts[cluster_event]] 302 | 303 | 304 | #If there is one obviously right event, i.e. score_1 != score_2 305 | else: 306 | y_trueDict[i] = list(counts.most_common()[0]) 307 | 308 | #Catches the instance of only one item per cluster, i.e. 
len(counts) !> 1 309 | elif len(counts) == 1: 310 | y_trueDict[i] =list(counts.most_common()[0]) 311 | 312 | 313 | #Re-analyzes y_trueDict, applying ratio again so there's one objectively "right" cluster per event 314 | a = [] 315 | for k in y_trueDict: 316 | a.append(y_trueDict[k][0]) 317 | a = dict(Counter(a)) 318 | 319 | #Sees where the same event label is applied to multiple clusters 320 | duplicates = [] 321 | for g in a: 322 | if a[g] > 1 and g != "n" and g != "unknown": 323 | duplicates.append(g) 324 | 325 | 326 | #Creates dup_eventsPLUSratio, where the duplicate events are stored by cluster number 327 | #with their ratio 328 | dup_eventsPLUSratio = {} 329 | for key in y_trueDict: 330 | if y_trueDict[key][0] in duplicates: 331 | event = y_trueDict[key][0] 332 | ratio = int(y_trueDict[key][1]) / int(event_occurences[event]) 333 | eventPLUSratio = [] 334 | eventPLUSratio.append(event) 335 | eventPLUSratio.append(ratio) 336 | dup_eventsPLUSratio[key] = eventPLUSratio 337 | dup_eventsPLUSratio 338 | 339 | #Dives into dup_eventsPLUSratio to see what cluster is more approrpiate for event 340 | for duplicate in duplicates: 341 | ratios = [] 342 | for key in dup_eventsPLUSratio: 343 | if dup_eventsPLUSratio[key][0] == duplicate: 344 | ratios.append(dup_eventsPLUSratio[key][1]) 345 | sort=sorted(ratios,reverse=True) 346 | highest = sort[0] 347 | theGood_one = [duplicate, highest] 348 | for key in dup_eventsPLUSratio: 349 | if dup_eventsPLUSratio[key][0] == duplicate: 350 | if dup_eventsPLUSratio[key] != theGood_one or highest == sort[1]: 351 | y_trueDict[key] = "nan" 352 | #If after all that there's still a tie between the top two ratios, 353 | #(like in hail_caesar_movie where its split 2 and 2 between clusters) 354 | #its given a "nan" label 355 | #COULD BE CHANGED TO FIT A WHILE LOOP THAT THEN FINDS score_2 AND 356 | #RELABELS CLUSTER TO SECOND MOST POPULAR EVENT IF THAT EVENT IS NOT 357 | #ALREADY ASSIGNED A CLUSTER 358 | 359 | #Gets y_true, the correct cluster assignments for each event 360 | bad_labels = ["useless","nan","unkown"] 361 | y_true = [] 362 | for event in events[:1000]: 363 | find = False 364 | for key in y_trueDict: 365 | #Used to see if there is a distinct cluster for that event 366 | #FIXED BUG: probably still some duplicates in y_trueDict somehow, bc output len is 10005 367 | #maybe the "unknown" or "useless" stuff? 368 | if y_trueDict[key][0] == event and y_trueDict[key][0] not in bad_labels: 369 | y_true.append(key) 370 | find = True 371 | if find == False: 372 | #Arbitrary value that's not going to return a match in t score 373 | y_true.append("nan") 374 | 375 | 376 | #Gets y_pred, the cluster where each individual event was actually clustered 377 | y_pred = [] 378 | for cluster_assignment in frame["cluster"]: 379 | y_pred.append(cluster_assignment) 380 | 381 | #checks how events actually match up with definitively defined cluster 382 | num = 0 383 | for i in y_true: 384 | if i != "nan": 385 | num += 1 386 | print("Working with " + str(num) + " samples based on a corpus of " + str(len(corpus)) + " documents: ") 387 | print() 388 | 389 | #Re-Aligns two lists to only include good values (those not equalling "nan") 390 | filtered_y_true = [] 391 | filtered_y_pred = [] 392 | 393 | for place in range(len(y_true)): 394 | if y_true[place] != "nan": 395 | filtered_y_true.append(y_true[place]) 396 | filtered_y_pred.append(y_pred[place]) 397 | 398 | 399 | 400 | 401 | 402 | """F1 score is the harmonic average of precision and recall. 
""" 403 | 404 | from sklearn.metrics import f1_score 405 | print("The F1 score for the model is " + str(f1_score(y_true = filtered_y_true, y_pred = filtered_y_pred, average = "micro"))) 406 | print() 407 | #500_no_ngrams F1 score micro: 0.8785046728971962 (also works off the most samples) 408 | #350_3_ngrams F1 score micro: 0.8718861209964412 (but goes off 281 samples rather than 303 in no ngrams) 409 | #700_no_ngrams F1 score micro: 0.8638392857142858 410 | #350_no_ngrams F1 score micro: 0.8576158940397351 411 | #300_3_ngrams F1 score micro: 0.8294573643410853 412 | 413 | """ Silhouette values lies in the range of [-1, 1]. A value of +1 indicates that the sample is far away 414 | from its neighboring cluster and very close to the cluster its assigned. Similarly, value of -1 415 | indicates that the point is close to its neighboring cluster than to the cluster its assigned. 416 | And, a value of 0 means its at the boundary of the distance between the two cluster. Value of +1 417 | is ideal and -1 is least preferred. Hence, higher the value better is the cluster configuration. """ 418 | 419 | from sklearn.metrics import silhouette_score 420 | print("The sillhouette score for the model is " + str(silhouette_score(matrix, y_pred))) 421 | print() 422 | print() 423 | print() 424 | #500_no_ngrams: 0.07096239881264323 425 | #350_no_ngrams: 0.06777628195061947 426 | #700_no_ngrams: 0.06251251395097632 427 | #350_3_ngrams: 0.04969413068018369 428 | #300_3_ngrams: 0.04857286650243616 429 | 430 | global s_score 431 | global f_score 432 | global num2 433 | 434 | s_score = silhouette_score(matrix, y_pred) 435 | f_score = f1_score(y_true = filtered_y_true, y_pred = filtered_y_pred, average = "micro") 436 | num2 = 0 437 | for i in y_true: 438 | if i != "nan": 439 | num2 += 1 -------------------------------------------------------------------------------- /Hyperparameter_Testing/hyperparameter_testing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Feb 16 12:00:18 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | import os 9 | import pandas as pd 10 | import numpy as np 11 | import sklearn.manifold 12 | import matplotlib.pyplot as plt 13 | import nltk 14 | import regex as re 15 | import re 16 | import codecs 17 | import csv 18 | import glob 19 | import multiprocessing 20 | from nltk.corpus import stopwords 21 | from nltk.stem.snowball import SnowballStemmer 22 | from nltk.tokenize import word_tokenize, sent_tokenize 23 | stemmer = SnowballStemmer("english") 24 | import math 25 | from sklearn.feature_extraction.text import TfidfVectorizer 26 | from sklearn.cluster import KMeans 27 | import seaborn as sns 28 | import sklearn 29 | 30 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 31 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 32 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_Classification122.csv") 33 | #Deletes unnecessary columns 34 | df = df.drop(df.columns[:12], axis = 1) 35 | #Sets manageable range for working data set 36 | new_df = df[5000:6000] 37 | #Gets info in list form to be later called in kmeans part 38 | 39 | corpus = [] 40 | for text in new_df['content']: 41 | corpus.append(text) 42 | 43 | titles = [] 44 | for title in new_df["title"]: 45 | titles.append(str(title)) 46 | #labels_df starts at df[5000] so we're good on the matching of 
labels to content 47 | events = [] 48 | for event in labels_df["Event"][:1000]: 49 | events.append(str(event)) 50 | 51 | import spacy 52 | from spacy import gold 53 | from spacy.gold import iob_to_biluo 54 | nlp = spacy.load('en_core_web_md', disable=['parser','tagger','textcat']) 55 | nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP) 56 | english_stopwords = stopwords.words('english') 57 | from sklearn.feature_extraction import stop_words 58 | 59 | ############################################################################## 60 | ###################HYPER-PARAMETER TESTING#################################### 61 | ############################################################################## 62 | from hyper_parameter_functions import tokenize_and_stem_NER, do_tfidf, HAC, success 63 | 64 | hyper_params = pd.DataFrame(columns = ["ents_rate", "person_rate","f_score","s_score","samples used"]) 65 | 66 | for person_rate in np.linspace(1,7,20): 67 | for ents_rate in np.linspace(1,7,20): 68 | tfidf_matrix = do_tfidf(ents_rate, person_rate) 69 | HAC(tfidf_matrix) 70 | cols = [pd.Series([ents_rate,person_rate,f_score,s_score,num2],index=hyper_params.columns)] 71 | hyper_params = hyper_params.append(cols) 72 | 73 | hyper_params.plot.scatter(x="f_score", y = "s_score") 74 | 75 | 76 | f_scores = [x for x in hyper_params["f_score"]] 77 | s_scores = [x for x in hyper_params["s_score"]] 78 | person_rate = [x for x in hyper_params["person_rate"]] 79 | ents_rate = [x for x in hyper_params["ents_rate"]] 80 | 81 | cmap = pyplot.cm.cubehelix 82 | dimensions = (20,20) 83 | fig, ax = pyplot.subplots(figsize=dimensions) 84 | sns.heatmap(dist, vmin = 0, vmax = 1, cmap = cmap).set_title("Tfidf Distances Between Articles", fontsize = 15) 85 | 86 | 87 | 88 | import seaborn as sns 89 | 90 | """F1 Score distribution across hyperparameters 91 | Best F1 Score: ents_rate = 1.63, person_rate = 2.57, f_score = .927 on 412 articles. BUT s_score is a measly .0788. 92 | """ 93 | f_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.f_score}) 94 | f_data_pivoted = f_data.pivot("Person Rate","Entity Rate","Z") 95 | ax1 = sns.heatmap(f_data_pivoted, cmap = plt.cm.hot) 96 | cbar = ax1.collections[0].colorbar 97 | cbar.set_label('F1 Score', labelpad=15) 98 | ax1.invert_yaxis() 99 | plt.show() 100 | 101 | """Silhouette Score distribution across hyperparameters 102 | 103 | More direct correlation here than in F1 Score. Most notably, as the rates increase, so does S Score. This is due to 104 | the fact that the rating of words in articles becomes more radical; articles that are different from each other, i.e. share no 105 | entities, are now much more distant than before. Even more specifically, when the Entity Rate and Person Rate are dissimilar, the 106 | S Score is the highest. 
Again, the same radical rating phenomena: By limiting the amount of weighted features, those that are weighted 107 | make the article more of an outlier than before.""" 108 | s_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score}) 109 | s_data_pivoted = s_data.pivot("Person Rate","Entity Rate","Z") 110 | ax2 = sns.heatmap(s_data_pivoted, cmap = plt.cm.hot) 111 | cbar = ax2.collections[0].colorbar 112 | cbar.set_label('Silhouette Score', labelpad=15) 113 | ax2.invert_yaxis() 114 | plt.show() 115 | 116 | 117 | 118 | """Cumulative Total 119 | Highest: ents_rate = 6.368, person_rate = 2.263, f_score = 0.922, s_score = 0.093 120 | 121 | """ 122 | sf_data = pd.DataFrame({"Person Rate": [round(x,2) for x in person_rate], "Entity Rate":[round(x,2) for x in ents_rate], "Z": hyper_params.s_score + hyper_params.f_score}) 123 | sf_data_pivoted = sf_data.pivot("Person Rate","Entity Rate","Z") 124 | ax3 = sns.heatmap(sf_data_pivoted, cmap = plt.cm.hot) 125 | cbar = ax3.collections[0].colorbar 126 | cbar.set_label('Composite Score', labelpad=15) 127 | ax3.invert_yaxis() 128 | plt.show() 129 | 130 | """Little thing to find best cumulative score and rates that achieved it.""" 131 | hightot = 0 132 | for tup in enumerate(hyper_params["f_score"]): 133 | tot = tup[1] + hyper_params.loc[tup[0]]["s_score"] 134 | if hightot < tot: 135 | hightot = tot 136 | best_scores = (hyper_params.loc[tup[0]]["ents_rate"],hyper_params.loc[tup[0]]["person_rate"]) 137 | best_scores 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /PreProcessing/CustomTFIDF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 4 16:12:21 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | import sklearn 9 | class CustomTFIDF(sklearn.base.TransformerMixin): 10 | 11 | def __init__(self, person_rate = 1, 12 | ents_rate = 1, max_df = 1.0, min_df = 0, date_weight = .1, julian = False, df = False): 13 | self._person_rate = person_rate 14 | self._ents_rate = ents_rate 15 | self._min_df = min_df 16 | self._max_df = max_df 17 | self._date_weight = date_weight 18 | self._julian = julian 19 | self._df = df 20 | 21 | 22 | def fit(self, X, *_): 23 | return self 24 | 25 | 26 | def TF_dict(self, article): 27 | article_tf = {} 28 | for word in article: 29 | if word in article_tf: 30 | article_tf[word] += 1 31 | else: 32 | article_tf[word] = 1 33 | for word in article_tf: 34 | """Manipulate word.startswith() to account for entity weighting.""" 35 | #word.startswith("*") applies to PERSON tags 36 | if word.startswith("*"): 37 | occurences = article_tf[word] 38 | article_tf[word] = (occurences / len(article)) * self._person_rate 39 | #word.startswith("&") applies to NON-PERSON tags 40 | elif word.startswith("&"): 41 | occurences = article_tf[word] 42 | article_tf[word] = (occurences / len(article)) * self._ents_rate 43 | else: 44 | occurences = article_tf[word] 45 | article_tf[word] = (occurences / len(article)) 46 | return article_tf 47 | 48 | 49 | def Count_dict(self): 50 | countDict = {} 51 | for article in self._TF: 52 | found_words = [] 53 | for word in article: 54 | if word in countDict and word not in found_words: 55 | countDict[word] += 1 56 | found_words.append(word) 57 | elif word not in found_words: 58 | countDict[word] = 1 59 | found_words.append(word) 60 | return countDict 61 | 62 | 63 | 64 | 65 | 
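    # Note on the idf formula implemented below: IDF_dict mirrors sklearn's
    # smoothed idf, idf(t) = ln((1 + N) / (1 + df(t))) + 1, where N is the number
    # of documents and df(t) is the number of documents containing term t.
    # For example, with N = 1000 documents, a term appearing in 10 of them gets
    # idf = ln(1001 / 11) + 1 ≈ 5.51, while a term appearing in nearly every
    # document gets an idf close to 1 (down-weighted, but not ignored).
    # Terms failing the min_df / max_df checks below are zeroed out instead.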
def IDF_dict(self, X): 66 | import math 67 | idfDict = {} 68 | for word in self._countDict: 69 | #len(corpus) is 1000, the total number of documents for this project 70 | #countDict[word] is the number of articles the word appears in 71 | """ 72 | From Sci-Kit code: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py 73 | 'smooth_idf: If ``smooth_idf=True`` (the default), the constant "1" is added to the 74 | numerator and denominator of the idf as if an extra document was seen 75 | containing every term in the collection exactly once, which prevents 76 | zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1.' 77 | 78 | 'The effect of adding "1" to 79 | the idf in the equation above is that terms with zero idf, i.e., terms 80 | that occur in all documents in a training set, will not be entirely 81 | ignored.' 82 | 83 | min_df: 'When building the vocabulary ignore terms that have a document 84 | frequency strictly lower than the given threshold. This value is also 85 | called cut-off in the literature.' 86 | 87 | max_df: 'When building the vocabulary ignore terms that have a document 88 | frequency strictly higher than the given threshold (corpus-specific 89 | stop words).' 90 | 91 | norm: (default='l2') Each output row will have unit norm, either: 92 | * 'l2': Sum of squares of vector elements is 1. The cosine 93 | similarity between two vectors is their dot product when l2 norm has 94 | been applied. 95 | """ 96 | #Implements min_df and max_df 97 | if self._countDict[word] > self._min_df and (self._countDict[word] / self._amount) < self._max_df: 98 | idfDict[word] = math.log((1 + self._amount) / (1 + self._countDict[word])) + 1 99 | else: 100 | idfDict[word] = 0 101 | return idfDict 102 | 103 | 104 | def TFIDF_list(self, article): 105 | article_tfidf = {} 106 | for word in article: 107 | #article[word] is the TF score for that word in the given article 108 | article_tfidf[word] = article[word] * self._idfDict[word] 109 | return article_tfidf 110 | 111 | 112 | 113 | def compute_TFIDF_matrix(self, article): 114 | terms = sorted(self._countDict.keys()) 115 | article_matrix = [0.0] * len(terms) 116 | for i, word in enumerate(terms): 117 | #Stores tfidf value of unique word in terms 118 | #if the word is in the article 119 | if word in article: 120 | #article[word] is the word's tfidf score 121 | article_matrix[i] = article[word] 122 | return article_matrix 123 | 124 | 125 | def makeJulian(self,X): 126 | """X must be a df with a "date" column in '%Y-%m-%d' format. 127 | 128 | This takes a while to run. 
129 | """ 130 | import datetime 131 | fmt = '%Y-%m-%d' 132 | 133 | 134 | import julian 135 | 136 | julian_lst = [] 137 | for date in X["date"]: 138 | dt = datetime.datetime.strptime(date,fmt) 139 | julian_lst.append(julian.to_jd(dt + datetime.timedelta(hours=12), fmt = "jd")) 140 | 141 | #Find amount of unique dates 142 | #Set arbitrary value (maybe 1, do hyperparameting testing again) to index of that date 143 | unique = list(set(julian_lst)) 144 | 145 | #Just to have easy access to indexes, ultimately to decide which place in the feature 146 | #matrix corresponds to which date 147 | unique_dict = {} 148 | for place, date in enumerate(unique): 149 | unique_dict[date] = place 150 | 151 | jul_matrix = [] 152 | for place, date in enumerate(julian_lst): 153 | #mini_matrix is the matrix for the individual article 154 | mini_matrix = [0.0] * len(julian_lst) 155 | for num in range(-4,4): 156 | if num == 0: 157 | mini_matrix[unique_dict[date]] = self._date_weight 158 | else: 159 | if (unique_dict[date] + num) > -1: 160 | #Deterioation function as dates get further away from target 161 | #Since dates within a proximity of about 4 seem to indicate some significance in similarity 162 | #Can change, right now it's date_weight divided by absolute value of num 163 | mini_matrix[unique_dict[date] + num] = (self._date_weight / (abs(num)+.5)) 164 | jul_matrix.append(mini_matrix) 165 | return jul_matrix 166 | 167 | 168 | 169 | 170 | def transform(self, X, *_): 171 | self._amount = len(X) 172 | from sklearn import preprocessing 173 | self._TF = [self.TF_dict(article) for article in X] 174 | self._countDict = self.Count_dict() 175 | self._idfDict = self.IDF_dict(X) 176 | self._tfidf = [self.TFIDF_list(article) for article in self._TF] 177 | self._tfidf_matrix = [self.compute_TFIDF_matrix(article) for article in self._tfidf] 178 | self._tfidf_matrix = preprocessing.normalize(self._tfidf_matrix, norm = 'l2') 179 | #Decides whether or not to add date component 180 | if self._julian == True: 181 | import scipy 182 | from scipy.sparse import hstack 183 | self._jul_list = self.makeJulian(self._df) 184 | self._jul_matrix = scipy.sparse.csr_matrix(self._jul_list) 185 | self._combo_matrix = hstack([self._jul_matrix, self._tfidf_matrix]).toarray() 186 | return self._combo_matrix 187 | else: 188 | return self._tfidf_matrix 189 | -------------------------------------------------------------------------------- /PreProcessing/NERTokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 4 14:59:22 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | 9 | 10 | """ 11 | Known errors: 12 | Trump is sometimes tagged as an ORG 13 | U.S. 
is sometimes tagged as a PERSON 14 | """ 15 | import sklearn.base 16 | class NERTokenizer(sklearn.base.TransformerMixin): 17 | """If 'tag' is True, Person entities .startswith("*") and other entities deemed "good" .startswith("&")""" 18 | def __init__(self, tag = False): 19 | self._tag = tag 20 | 21 | def fit(self, X, *_): 22 | return self 23 | 24 | def transform(self, X, *_): 25 | from nltk.corpus import stopwords 26 | from nltk.stem.snowball import SnowballStemmer 27 | stemmer = SnowballStemmer("english") 28 | 29 | import spacy 30 | from spacy.gold import iob_to_biluo 31 | nlp = spacy.load('en_core_web_md', disable=['parser','tagger','textcat']) 32 | from spacy.attrs import ORTH 33 | nlp.tokenizer.add_special_case("I'm", [{ORTH: "I'm"}]) 34 | nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP) 35 | 36 | english_stopwords = stopwords.words('english') 37 | english_stopwords.append("i'm") 38 | 39 | tokenized_corpus = [] 40 | good_ents = ["PERSON","GPE","ORG", "LOC", "EVENT", "FAC"] 41 | continue_tags = ["B-","I-"] 42 | end_tags = ["L-","U-"] 43 | 44 | 45 | 46 | for text in X: 47 | toks = [] 48 | iobs = [i.ent_iob_ for i in nlp(text)] 49 | biluos = list(iob_to_biluo(iobs)) 50 | #Named entities variable 51 | ne = "" 52 | for index, tok in enumerate(nlp(text)): 53 | if biluos[index] in continue_tags and str(tok.ent_type_) in good_ents: 54 | #str(tok).split() != [] Checks if empty token 55 | #For some reason tok.whitespace_ doesn't include double token entities 56 | #like "JENNIFER LAWRENCE" 57 | if not self._tag: 58 | ne += " " + str(tok).lower() 59 | elif self._tag and str(tok).split() != []: 60 | #Entity is the beginning of an entity set 61 | if biluos[index] == "B-": 62 | if str(tok.ent_type_) != "PERSON": 63 | ne += " &" + str(tok).lower() 64 | elif str(tok.ent_type_) == "PERSON": 65 | ne += " *" + str(tok).lower() 66 | else: 67 | if str(tok.ent_type_) != "PERSON": 68 | ne += " " + str(tok).lower() 69 | elif str(tok.ent_type_) == "PERSON": 70 | ne += " " + str(tok).lower() 71 | elif biluos[index] in end_tags and str(tok.ent_type_) in good_ents: 72 | if not self._tag: 73 | ne += " " + str(tok).lower() 74 | toks.append(ne.lstrip()) 75 | ne = " " 76 | elif self._tag and str(tok).split() != []: 77 | #Entity is just a single unit 78 | if biluos[index] == "U-": 79 | if str(tok.ent_type_) != "PERSON": 80 | ne += " &" + str(tok).lower() 81 | toks.append(ne.lstrip()) 82 | ne = " " 83 | elif str(tok.ent_type_) == "PERSON": 84 | ne += " *" + str(tok).lower() 85 | ne.replace("*’m", "") 86 | toks.append(ne.lstrip()) 87 | ne = " " 88 | else: 89 | ne += " " + str(tok).lower() 90 | # so that possesive tags are not stored with the '’s' 91 | ne = ne.replace("’s", "") 92 | toks.append(ne.lstrip()) 93 | ne = " " 94 | #If token is just a boring old word 95 | else: 96 | if not tok.is_punct and not tok.is_space and str(tok).lower() not in english_stopwords: 97 | toks.append(stemmer.stem(str(tok))) 98 | tokenized_corpus.append(toks) 99 | return tokenized_corpus 100 | -------------------------------------------------------------------------------- /PreProcessing/__pycache__/CustomTFIDF.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/PreProcessing/__pycache__/CustomTFIDF.cpython-37.pyc -------------------------------------------------------------------------------- 
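The two transformers above are designed to be chained: `NERTokenizer(tag=True)` marks PERSON entities with a leading `*` and other kept entities with a leading `&`, which is exactly what `CustomTFIDF`'s `person_rate` / `ents_rate` weighting keys off. Below is a minimal usage sketch, not part of the repo: the toy corpus, the cluster count, and the import paths are illustrative assumptions, and the 4 / 1.3 weights are the values quoted in the README.

```python
# Hypothetical usage sketch: chain NERTokenizer and CustomTFIDF, then cluster.
# Assumes it is run from inside PreProcessing/ so both modules import directly.
from sklearn.pipeline import Pipeline
from sklearn.cluster import AgglomerativeClustering

from NERTokenizer import NERTokenizer
from CustomTFIDF import CustomTFIDF

# Toy corpus (illustrative only).
corpus = [
    "The United Nations ban pineapple on pizza, but Bill Gates intends to fight back.",
    "Bill Gates defends his pizza topping stance at a United Nations press event.",
    "The Iowa Caucus results surprised pollsters across the country.",
]

pipeline = Pipeline([
    # tag=True prefixes PERSON tokens with "*" and other kept entities with "&".
    ("tokenize", NERTokenizer(tag=True)),
    # ents_rate / person_rate are the entity weights described in the README.
    ("tfidf", CustomTFIDF(ents_rate=4, person_rate=1.3, julian=False)),
])

tfidf_matrix = pipeline.fit_transform(corpus)

# The normalized rows can be fed to any clusterer, e.g. the HAC model used here.
hac = AgglomerativeClustering(n_clusters=2)
print(hac.fit_predict(tfidf_matrix))
```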
/PreProcessing/__pycache__/NERTokenizer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/PreProcessing/__pycache__/NERTokenizer.cpython-37.pyc -------------------------------------------------------------------------------- /PreProcessing/julian_matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[60]: 5 | import os 6 | import pandas as pd 7 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 8 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 9 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_Classification26.csv") 10 | #Deletes unnecessary columns 11 | df = df.drop(df.columns[:12], axis = 1) 12 | #Sets manageable range for working data set 13 | new_df = df[5000:6000] 14 | 15 | 16 | # In[63]: 17 | 18 | 19 | import datetime 20 | fmt = '%Y-%m-%d' 21 | s = "2016-02-01" 22 | s = datetime.datetime.strptime(s,fmt) 23 | 24 | 25 | # In[64]: 26 | 27 | 28 | new_df["dt"]=False 29 | for i in enumerate(new_df["date"]): 30 | new_df["dt"].iloc[i[0]] = datetime.datetime.strptime(i[1],fmt) 31 | 32 | 33 | # In[65]: 34 | 35 | 36 | import julian 37 | new_df["julian"] = False 38 | for i in enumerate(new_df["dt"]): 39 | jd = julian.to_jd(i[1] + datetime.timedelta(hours=12), fmt = "jd") 40 | new_df["julian"].iloc[i[0]] = jd 41 | 42 | 43 | # In[75]: 44 | 45 | 46 | #Find amount of unique dates 47 | #Set arbitrary value (maybe 1, do hyperparameting testing again) to index of that date 48 | unique = [] 49 | for i in new_df["julian"]: 50 | unique.append(i) 51 | unique = set(unique) 52 | print(len(unique)) 53 | 54 | 55 | # In[80]: 56 | 57 | 58 | unique_dict = {} 59 | for place, date in enumerate(unique): 60 | unique_dict[date] = place 61 | 62 | 63 | 64 | # In[96]: 65 | 66 | 67 | import scipy 68 | jul_matrix = [] 69 | for place, date in enumerate(new_df["julian"]): 70 | mini_matrix = [0.0] * len(new_df) 71 | #Change the .1 value to something appropriate 72 | #Use hyperparemeter testing 73 | mini_matrix[unique_dict[date]] = 0.1 74 | jul_matrix.append(mini_matrix) 75 | 76 | 77 | jul_matrix 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # newsfeed-nlp 2 | 3 | [Link to project poster](Visualizations/NewsfeedArticleClustering.pdf) 4 | 5 | ## Abstract 6 | This unsupervised learning project allows the average news consumer to experience a stream-lined information acquisition process, free of repetition. Working in Python and using a Kaggle dataset (https://www.kaggle.com/snapcrack/all-the-news) of 85,000 news articles, we extract significance from the texts by utilizing a modified TFIDF-Vectorizer to pre-process the data. We experiment with various clustering techniques (Kmeans, HAC, and Birch), paired with various success metrics to gauge effectiveness of the news consolidation. Visualizations are created using Seaborn and Matplotlib, along with D3 for ease of exploration. Spacy is used as the primary NLP. 
7 | 
8 | ## Motivation
9 | In a political climate of intensely polarizing takes on the latest scandals, international relations, and national issues, it can be difficult to make sense of all the data. With the rise of social media, the ability to publish has been democratized and repetition in the newsfeed runs rampant. By grouping news stories together by topic, not only is the newsfeed browsing process streamlined, but the resulting clusters provide differing perspectives on the same story.
10 | 
11 | # Methodology - Data Cleaning
12 | Below is a snippet of the original data (https://www.kaggle.com/snapcrack/all-the-news), with the 'content' column omitted.
13 | 
14 | | id | title | publication | author | date | year | month | url |
15 | |----|-------|-------------|--------|------|------|-------|-----|
16 | | 151908 | Alton Sterling’s son: ’Everyone needs to protest the right way, with peace’ | Guardian | Jessica Glenza | 7/13/16 | 2016 | 7 | https://www.theguardian.com/us-news/2016/jul/13/alton-sterling-son-cameron-protesters-baton-rouge |
17 | | 151909 | Shakespeare’s first four folios sell at auction for almost £2.5m | Guardian | | 5/25/16 | 2016 | 5 | https://www.theguardian.com/culture/2016/may/25/shakespeares-first-four-folios-sell-at-auction-for-almost-25m |
18 | | 151910 | My grandmother’s death saved me from a life of debt | Guardian | Robert Pendry | 10/31/16 | 2016 | 10 | https://www.theguardian.com/commentisfree/2016/oct/31/grandmothers-death-saved-me-life-of-debt |
19 | | 151911 | I feared my life lacked meaning. Cancer pushed me to find some | Guardian | Bradford Frost | 11/26/16 | 2016 | 11 | https://www.theguardian.com/commentisfree/2016/nov/26/cancer-diagnosis-existential-life-accomplishments-meaning |
20 | | 151912 | Texas man serving life sentence innocent of double murder, judge says | Guardian | | 8/20/16 | 2016 | 8 | https://www.theguardian.com/us-news/2016/aug/20/texas-life-sentence-innocence-dna-richard-bryan-kussmaul |
21 | 
22 | 
23 | 
24 | The original dataset was cleansed to keep only the rows that contained both a URL and a date. This cleaning resulted in a working dataframe of 82,920 articles.
25 | 
26 | # Methodology - Analysis
27 | 
28 | ## Data Exploration
29 | We began by exploring the distribution of certain notable factors of the data, such as date and publisher. There are 10 unique publishers in our subset of the data, with an interesting distribution in publishing date:
30 | 
31 | ![Date Distribution](/Visualizations/date_distributions.png)
32 | 
33 | 
34 | For the purposes of our project, a document similarity-based task, it seemed most appropriate to run our clustering tests on a chronologically compact section of the data. Since the texts we are working with are all news stories, multiple distinct stories about the same event are far more likely to appear when the articles are close together in time; the chance of an article about the 2016 Iowa Caucus being published 7 months after it took place is slim.
35 | 
36 | We then [labelled each data point](data/article_classifications.csv) with a succinct event label describing the event it portrays. One such example is found below (with abbreviated content), which yielded the label "daallo_airlines_explosion."
37 | 38 | | event | | content | date | id | month | publication | month | url | year | 39 | |---------------------------|--|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
40 | | daallo_airlines_explosion |  | MOGADISHU, Somalia (AP) — An explosion that blew a hole in a jetliner shortly after takeoff and left one man missing was believed to have been caused by a bomb, the pilot said Wednesday, describing how the crew calmed frightened passengers as smoke enveloped the cabin before he brought the plane back to Mogadishu's airport for an emergency landing. Daallo Airlines said all passengers except one got off the plane safely. It previously... | 2/3/16 | 95647 | 2 | Talking Points Memo | Flight Lands Safely After Suspected Bomb Blew Hole In Plane | https://web.archive.org/web/20160204014156/http://talkingpointsmemo.com/world-news/daallo-airlines-explosion-plane-lands-safely | 2016 |
41 | 
42 | 
43 | This labelling was completed for 1,000 of the articles, to be used for our clustering and success metrics.
44 | 
45 | 
46 | ## Pre-Processing
47 | TFIDF is used as the primary pre-processing method, with some adjustments to account for entity weighting. First, a special tokenizer was created which takes into consideration the entity type of each token. If the token is not an entity, the stem of the token is taken using nltk's [snowball stemmer](http://www.nltk.org/howto/stem.html). Stemming refers to the process of reducing an inflected (or sometimes derived) word to its base form, even if that form is not identical to the word's morphological root.
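For example, here is a minimal sketch of just the stemming step in isolation (the project's tokenizer in [NERTokenizer.py](PreProcessing/NERTokenizer.py) combines this with the entity tagging described next):

```python
# Minimal illustration of snowball stemming on its own, assuming nltk is installed.
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

# Non-entity tokens are reduced to their stems before vectorization.
print([stemmer.stem(w) for w in ["intends", "fights", "pineapple"]])
# -> ['intend', 'fight', 'pineappl']
```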
If the token is a location, organization, or event, an ampersand is inserted in front of the token. If the token is a person, an asterisk is inserted at the start of the token. This was achieved using [spaCy's entity recognition](https://spacy.io/usage/linguistic-features).
48 | 
49 | ```python
50 | sent = ["The United Nations ban pineapple on pizza, but Bill Gates intends to fight back."]
51 | tokens = tokenize_and_stem_NER(sent)
52 | print(tokens)
53 | >>>> ['&the united nations', 'ban', 'pineappl', 'pizza', '*bill gates', 'intend', 'fight']
54 | ```
55 | 
56 | Then, to give extra significance to these entities, their TFIDF scores were manipulated; specifically, when creating the TF_dict in [CustomTFIDF.py](PreProcessing/CustomTFIDF.py), the non-person entity scores were multiplied by a factor of 4, and the person entity scores by a factor of 1.3.
57 | 
58 | ## Clustering
59 | As noted before, the three clustering methods we utilized in this project were Kmeans, HAC, and KNN. Ultimately, after many iterations and logging of [success rates](Success_Rates.md), [HAC (Hierarchical Agglomerative Clustering)](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html) proved to be the most successful method.
60 | 
61 | Below is a graphical, 2D representation of the multidimensional TFIDF matrix and the resulting clusters (as clustered by Kmeans). The annotation next to each data point is the "predominant event", as explained in the Defining Success section below, and the size of each point is directly related to the number of articles in that cluster.
62 | 
63 | ![Article Centers](/Visualizations/Article_Centers.png)
64 | 
65 | 
66 | ## Defining Success
67 | ### [F1 Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
68 | The F1 score is defined as the harmonic mean of precision and recall. Since this is an unsupervised learning project, the definition of "y_true" was not inherently obvious. When looking at a cluster of, say, {'zika_std': 1, 'robert_finicum_shooting': 3}, it is difficult to tell from the labels alone which event is a false positive. In response to this ambiguity, a success algorithm was written to assign as many clusters as possible to one unique, predominant event.
69 | 
70 | This predominant event was found using not only the frequency of events, but also each event's "ratio" relative to the total occurrences of that event across the dataset. First, the predominant event was defined simply as the event that occurred most often within a specific cluster. For example, if *Cluster A* contains {'zika_std': 1, 'robert_finicum_shooting': 3}, the predominant event would be 'robert_finicum_shooting'.
71 | 
72 | Second, the ratio of an event is invoked. Defining predominance merely by the quantity of events results in doubling-up: multiple clusters are assigned the same event. If *Cluster B* yields {'santorum_drops_out': 1, 'robert_finicum_shooting': 2}, it would also receive 'robert_finicum_shooting' as the predominant label. But perhaps the label "santorum_drops_out" is used only 4 times across the whole dataset, while the label "robert_finicum_shooting" is used a total of 10 times. Comparing the ratios of santorum_drops_out (1/4 = .25) and robert_finicum_shooting (2/10 = .2), the label santorum_drops_out is more significant in this particular cluster.
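The sketch below illustrates this frequency-then-ratio selection in simplified form. It is not the exact implementation in [SuccessMetrics.py](SuccessMetrics.py): the function and variable names are illustrative only, and the full algorithm additionally resolves cases where the same event is assigned to multiple clusters.

```python
from collections import Counter

def predominant_event(cluster_events, dataset_totals):
    """Illustrative only: pick a cluster's predominant event by raw count,
    breaking ties with the event's ratio (count in this cluster / count across dataset)."""
    counts = Counter(e for e in cluster_events if e not in ("nan", "useless"))
    if not counts:
        return None
    top_count = max(counts.values())
    tied = [e for e, c in counts.items() if c == top_count]
    # Tie-break: prefer the event whose occurrences are most concentrated in this cluster.
    return max(tied, key=lambda e: counts[e] / dataset_totals[e])

# Hypothetical cluster where both events appear twice:
dataset_totals = {"santorum_drops_out": 4, "robert_finicum_shooting": 10}
cluster_c = ["santorum_drops_out"] * 2 + ["robert_finicum_shooting"] * 2
# Ratios are 2/4 = .5 vs 2/10 = .2, so the rarer event wins the tie.
print(predominant_event(cluster_c, dataset_totals))  # santorum_drops_out
```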
73 | 
74 | If there is a tie between two clusters in the ratio of their predominant event, those clusters are dismissed when calculating the F1 Score. For our purposes, the F1 Score is not intended to be a perfect measure, but merely a gauge by which some improvement can be noticed as we move through different clustering models.
75 | 
76 | ### [Silhouette Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html)
77 | While the F1 Score defines success based on the labels being assigned, the Silhouette Score uses the intrinsic properties of the clusters themselves, with no weight given to the meaning of labels.
78 | 
79 | A value of +1 indicates that a sample is far away from its neighboring cluster and very close to the cluster it is assigned to. Conversely, a value of -1 indicates that the sample is closer to its neighboring cluster than to its assigned cluster, and a value of 0 means the sample lies on the boundary between the two clusters. A value of +1 is ideal and -1 is least preferred; hence, the higher the value, the better the cluster configuration.
80 | 
81 | ## Summary
82 | Our 500-cluster HAC model clusters the 1,000 pre-processed news articles with an *F1-Score of 0.922* and a *Silhouette Score of 0.093*.
83 | 
84 | A question that came up quite frequently was how to define "success" within the project; specifically, how broad we intended our definition of an "event" to be. Were we content with an article about Russian politics being clustered with the Russian Olympic ban? Ultimately, I decided to take a more granular approach to events, and labelled those two example events as "russian_politics" and "russian_olympic_ban" accordingly.
85 | 
86 | Another difficulty in the scope of the project was navigating intersections between broad topics. Is the article "Donald Trump speaks about Hurricane Matthew" about Donald Trump or Hurricane Matthew?
87 | 
88 | Overall, I am extremely pleased with how the project turned out and all that I learned about topic clustering and NLP.
89 | 
--------------------------------------------------------------------------------
/SuccessMetrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Feb 6 11:16:12 2019
5 | 
6 | @author: parkerglenn
7 | """
8 | 
9 | ##############################################################################
10 | ##################SUCCESS RATES###############################################
11 | ##############################################################################
12 | """
13 | Takes as input the clustering model, the predicted cluster assignments (in list form), and the
14 | tfidf_matrix that was clustered.
15 | """ 16 | def success(model, clusters, matrix): 17 | import os 18 | import pandas as pd 19 | 20 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 21 | 22 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 23 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_ClassificationFINAL.csv") 24 | 25 | #Deletes unnecessary columns 26 | df = df.drop(df.columns[:12], axis = 1) 27 | #Sets manageable range for working data set 28 | new_df = df[5000:6000] 29 | #Gets info in list form to be later called in kmeans part 30 | 31 | corpus = [] 32 | for text in new_df['content']: 33 | corpus.append(text) 34 | 35 | titles = [] 36 | for title in new_df["title"]: 37 | titles.append(str(title)) 38 | #labels_df starts at df[5000] so we're good on the matching of labels to content 39 | events = [] 40 | for event in labels_df["Event"][:1000]: 41 | events.append(str(event)) 42 | 43 | 44 | articles = {"title": titles, "date": new_df["date"], "cluster": clusters, "content": new_df["content"], "event": events[:len(clusters)]} 45 | frame = pd.DataFrame(articles, index = [clusters] , columns = ['title', 'date', 'cluster', 'content', "event"]) 46 | 47 | """ 48 | BELOW THIS CREATES DICT OF CLUSTERS AND PREDOMINANT EVENT 49 | 50 | If multiple events occur the same amount of times in a single cluster, 51 | the ratio function is invoked to choose the event holding the most relative 52 | significance. If one ratio is not greater than the others (ex. a cluster 53 | composed of 5 one-off events) then the cluster is disregared (labelled "nan"). 54 | 55 | If the cluster only contains one event, it is assumed at this stage that it is 56 | the main cluster for the event. 57 | 58 | BUGS: 59 | If the cluster contains only "nan" events, it will not show up in y_trueDict 60 | (ex. 
Cluster 113 is not shown, consisting of {'nan': 2} ) 61 | """ 62 | 63 | from collections import Counter 64 | all_events = [] 65 | #This fixes quirk where the same cluster was iterated over multiple times 66 | clusters_we_saw = [] 67 | for cluster in clusters: 68 | if cluster not in clusters_we_saw: 69 | clusters_we_saw.append(cluster) 70 | for event in frame.loc[cluster]["event"].values.tolist(): 71 | if event != "nan" and event != "useless": 72 | all_events.append(event) 73 | event_occurences = dict(Counter(all_events)) 74 | 75 | #Gets number of unique clusters 76 | all_clusters = set(clusters) 77 | 78 | 79 | y_trueDict = {} 80 | for i in range(0,len(all_clusters)): 81 | ratios = [] 82 | counts = [] 83 | ratio = 0 84 | 85 | #Counts occurence per cluster of event 86 | for event in frame.loc[i]["event"].values.tolist(): 87 | if event != "nan" and event != "useless": 88 | counts.append(event) 89 | counts = Counter(counts) 90 | 91 | 92 | if len(counts) > 1: 93 | score_1 = list(counts.most_common()[0])[1] 94 | score_2 = list(counts.most_common()[1])[1] 95 | #Check to see if there are multiple events with same frequency 96 | if score_1 == score_2: 97 | #Gets all events with same frequency 98 | tied_events = [k for k,v in dict(counts).items() if v == score_1] 99 | for event in tied_events: 100 | #Gets the ratio of an occurence for an event in a cluster 101 | #For example, if an event happens only once, it's ratio will be 1 102 | #But if "iowa_caucuses" is used 100 times and only 20 times in a specific cluster, 103 | #its ratio is .2 104 | new_ratio = score_1 / int(event_occurences[event]) 105 | ratios.append(new_ratio) 106 | if new_ratio > ratio: 107 | cluster_event = event 108 | ratio = new_ratio 109 | #If result is an empty list, all ratios are unique. If not, there 110 | #are repititions and the data point is thrown out. 111 | if list(set([x for x in ratios if ratios.count(x) > 1])) != []: 112 | y_trueDict[i] = "nan" 113 | #Dumb try and except sees if ytrueDict[i] is already set to something ("nan") 114 | try: 115 | y_trueDict[i] 116 | except: 117 | counts = dict(counts) 118 | #Makes sure there's still the occurence in cluster attached to the cluster_event 119 | y_trueDict[i] = [cluster_event, counts[cluster_event]] 120 | #If there is one obviously right event, i.e. score_1 != score_2 121 | else: 122 | y_trueDict[i] = list(counts.most_common()[0]) 123 | 124 | #Catches the instance of only one item per cluster, i.e. 
len(counts) !> 1 125 | elif len(counts) == 1: 126 | y_trueDict[i] =list(counts.most_common()[0]) 127 | 128 | 129 | #Re-analyzes y_trueDict, applying ratio again so there's one objectively "right" cluster per event 130 | a = [] 131 | for k ,v in y_trueDict.items(): 132 | a.append(v[0]) 133 | a = dict(Counter(a)) 134 | 135 | #Sees where the same event label is applied to multiple clusters 136 | duplicates = [] 137 | for g in a: 138 | if a[g] > 1 and g != "n" and g != "unknown": 139 | duplicates.append(g) 140 | 141 | 142 | #Creates dup_eventsPLUSratio, where the duplicate events are stored by cluster number 143 | #with their ratio 144 | dup_eventsPLUSratio = {} 145 | for key, value in y_trueDict.items(): 146 | if value[0] in duplicates: 147 | event = value[0] 148 | ratio = int(y_trueDict[key][1]) / int(event_occurences[event]) 149 | eventPLUSratio = [] 150 | eventPLUSratio.append(event) 151 | eventPLUSratio.append(ratio) 152 | dup_eventsPLUSratio[key] = eventPLUSratio 153 | 154 | #Dives into dup_eventsPLUSratio to see what cluster is more approrpiate for event 155 | for duplicate in duplicates: 156 | ratios = [] 157 | for key,value in dup_eventsPLUSratio.items(): 158 | event = value[0] 159 | event_ratio = value[1] 160 | if event == duplicate: 161 | ratios.append(event_ratio) 162 | sort=sorted(ratios,reverse=True) 163 | highest = sort[0] 164 | theGood_one = [duplicate, highest] 165 | for key in dup_eventsPLUSratio: 166 | if event == duplicate: 167 | if value != theGood_one or highest == sort[1]: 168 | y_trueDict[key] = "nan" 169 | #If after all that there's still a tie between the top two ratios, 170 | #(like in hail_caesar_movie where its split 2 and 2 between clusters) 171 | #its given a "nan" label 172 | #COULD BE CHANGED TO FIT A WHILE LOOP THAT THEN FINDS score_2 AND 173 | #RELABELS CLUSTER TO SECOND MOST POPULAR EVENT IF THAT EVENT IS NOT 174 | #ALREADY ASSIGNED A CLUSTER 175 | 176 | #Gets y_true, the correct cluster assignments for each event 177 | bad_labels = ["useless","nan","unkown"] 178 | y_true = [] 179 | for event in events[:1000]: 180 | find = False 181 | for key, value in y_trueDict.items(): 182 | #Used to see if there is a distinct cluster for that event 183 | #FIXED BUG: probably still some duplicates in y_trueDict somehow, bc output len is 10005 184 | #maybe the "unknown" or "useless" stuff? 185 | if value[0] == event and value[0] not in bad_labels: 186 | y_true.append(key) 187 | find = True 188 | if find == False: 189 | #Arbitrary value that's not going to return a match in t score 190 | y_true.append("nan") 191 | 192 | 193 | #Gets y_pred, the cluster where each individual event was actually clustered 194 | y_pred = [cluster_assignment for cluster_assignment in frame["cluster"] ] 195 | 196 | #checks how events actually match up with definitively defined cluster 197 | num = 0 198 | for i in y_true: 199 | if i != "nan": 200 | num += 1 201 | print("Working with " + str(num) + " samples based on a spread of " + str(len(all_clusters)) + " clusters: ") 202 | print() 203 | 204 | #Re-Aligns two lists to only include good values (those not equalling "nan") 205 | filtered_y_true = [] 206 | filtered_y_pred = [] 207 | 208 | for place in range(len(y_true)): 209 | if y_true[place] != "nan": 210 | filtered_y_true.append(y_true[place]) 211 | filtered_y_pred.append(y_pred[place]) 212 | 213 | 214 | 215 | """F1 score is the harmonic average of precision and recall. 
""" 216 | 217 | from sklearn.metrics import f1_score 218 | print("The F1 score for the model is " + str(f1_score(y_true = filtered_y_true, y_pred = filtered_y_pred, average = "micro"))) 219 | print() 220 | #500_no_ngrams F1 score micro: 0.8785046728971962 (also works off the most samples) 221 | #350_3_ngrams F1 score micro: 0.8718861209964412 (but goes off 281 samples rather than 303 in no ngrams) 222 | #700_no_ngrams F1 score micro: 0.8638392857142858 223 | #350_no_ngrams F1 score micro: 0.8576158940397351 224 | #300_3_ngrams F1 score micro: 0.8294573643410853 225 | 226 | """ Silhouette values lies in the range of [-1, 1]. A value of +1 indicates that the sample is far away 227 | from its neighboring cluster and very close to the cluster its assigned. Similarly, value of -1 228 | indicates that the point is close to its neighboring cluster than to the cluster its assigned. 229 | And, a value of 0 means its at the boundary of the distance between the two cluster. Value of +1 230 | is ideal and -1 is least preferred. Hence, higher the value better is the cluster configuration. """ 231 | 232 | from sklearn.metrics import silhouette_score 233 | print("The sillhouette score for the model is " + str(silhouette_score(matrix, y_pred))) 234 | print() 235 | print() 236 | print() 237 | #500_no_ngrams: 0.07096239881264323 238 | #350_no_ngrams: 0.06777628195061947 239 | #700_no_ngrams: 0.06251251395097632 240 | #350_3_ngrams: 0.04969413068018369 241 | #300_3_ngrams: 0.04857286650243616 242 | -------------------------------------------------------------------------------- /Success_Rates.md: -------------------------------------------------------------------------------- 1 | ## Kmeans 2 | #### 500_no_ngrams 3 | * F1 score: 0.8785046728971962 4 | * Sillhouette Score: 0.07096239881264323 5 | 6 | #### 350_no_ngrams 7 | * F1 score: 0.8576158940397351 8 | * Sillhouette Score: 0.06777628195061947 9 | 10 | #### 350_no_ngrams_ENTS 11 | * F1 score: 0.850609756097561 12 | * Sillhouette Score: 0.06327439811090264 13 | 14 | #### 350_3_ngrams 15 | * F1 score: 0.8718861209964412 16 | * Sillhouette Score: 0.04969413068018369 17 | 18 | #### 300_3_ngrams 19 | * F1 score: 0.8294573643410853 20 | * Sillhouette Score: 0.04857286650243616 21 | 22 | #### 700_no_ngrams 23 | * F1 score: 0.8638392857142858 24 | * Sillhouette Score: 0.06251251395097632 25 | 26 | 27 | 28 | ## HAC 29 | 30 | #### 350_euclidean_HAC 31 | * F1 score: 0.8359621451104101 32 | * Sillhouette Score: 0.08998077808781355 33 | 34 | #### 500_euclidean_HAC 35 | * F1 score: 0.8997613365155133 36 | * Sillhouette Score: 0.08817834578438288 37 | 38 | 39 | good_ents = ["PERSON","GPE","ORG", "LOC", "EVENT", "FAC"] 40 | General trend: As the entity weighting increases, the sillhouette score also increases, usually at the expense of F1 score. The intense weighting of entities produces more spread out, tight-knit clusters. 
41 | 42 | 43 | #### 500_euclidean_HAC_ENTS_*6_PERSON_*1.3 44 | * F1 score: 0.8974358974358975 45 | * Sillhouette Score: 0.09440994024020795 46 | 47 | #### 500_euclidean_HAC_ENTS_*4_PERSON_*1.3 48 | * F1 score: 0.9067357512953368 49 | * Sillhouette Score: 0.09050069997225527 50 | 51 | #### 500_euclidean_HAC_ENTS_*5 52 | * F1 score: 0.8691588785046729 53 | * Sillhouette Score: 0.09316827956149414 54 | 55 | #### 500_euclidean_HAC_ENTS_*4 56 | * F1 score: 0.8721461187214612 57 | * Sillhouette Score: 0.09069735742843224 58 | 59 | #### 500_euclidean_HAC_ENTS_*3 60 | * F1 score: 0.8899297423887589 61 | * Sillhouette Score: 0.08761547351995444 62 | 63 | #### 500_euclidean_HAC_ENTS_*2 64 | * F1 score: 0.8792710706150342 65 | * Sillhouette Score: 0.08451414378144817 66 | 67 | #### 350_euclidean_HAC_ENTS_*2 68 | * F1 score: 0.8178807947019867 69 | * Sillhouette Score: 0.08456507985134794 70 | -------------------------------------------------------------------------------- /Visualizations/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/.DS_Store -------------------------------------------------------------------------------- /Visualizations/500_dendogram_hac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/500_dendogram_hac.png -------------------------------------------------------------------------------- /Visualizations/Article_Centers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/Article_Centers.png -------------------------------------------------------------------------------- /Visualizations/NewsfeedArticleClustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/NewsfeedArticleClustering.pdf -------------------------------------------------------------------------------- /Visualizations/cumulative_score_hyperparameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/cumulative_score_hyperparameters.png -------------------------------------------------------------------------------- /Visualizations/date_distributions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/date_distributions.png -------------------------------------------------------------------------------- /Visualizations/distance_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/distance_heatmap.png -------------------------------------------------------------------------------- /Visualizations/f_score_hyperparameters.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/f_score_hyperparameters.png -------------------------------------------------------------------------------- /Visualizations/s_score_hyperparameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/s_score_hyperparameters.png -------------------------------------------------------------------------------- /Visualizations/svd_cluster_centers_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/svd_cluster_centers_example.png -------------------------------------------------------------------------------- /Visualizations/svd_colored_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/Visualizations/svd_colored_clusters.png -------------------------------------------------------------------------------- /clustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Apr 13 15:50:28 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | import os 9 | import pandas as pd 10 | from nltk.stem.snowball import SnowballStemmer 11 | from PreProcessing.NERTokenizer import NERTokenizer 12 | from PreProcessing.CustomTFIDF import CustomTFIDF 13 | from SuccessMetrics import success 14 | 15 | """ 16 | Creating relevant classes 17 | """ 18 | NerTok = NERTokenizer(tag=True) 19 | Vectorizer = CustomTFIDF(ents_rate = 6.368, person_rate = 2.263, julian = False) 20 | stemmer = SnowballStemmer("english") 21 | 22 | """ 23 | Cleaning DF 24 | """ 25 | os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering") 26 | df = pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/csv/all_GOOD_articles.csv") 27 | labels_df= pd.read_csv("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Google_Drive/Article_ClassificationFINAL.csv") 28 | #Deletes unnecessary columns 29 | df = df.drop(df.columns[:12], axis = 1) 30 | #Sets manageable range for working data set 31 | new_df = df[5000:6000] 32 | #Gets info in list form to be later called in kmeans part 33 | corpus = new_df['content'].tolist() 34 | titles = new_df["title"].tolist() 35 | #labels_df starts at df[5000] so we're good on the matching of labels to content 36 | events = labels_df["events"].tolist()[:1000] 37 | links = new_df["url"].tolist() 38 | 39 | """ 40 | Creating matrix 41 | """ 42 | toks = NerTok.transform(corpus) 43 | matrix= Vectorizer.transform(toks) 44 | 45 | """ 46 | Clustering and measuring success. 
47 | """ 48 | ######################################################### 49 | ####################BIRCH################################ 50 | ######################################################### 51 | from sklearn.cluster import Birch 52 | brc = Birch(n_clusters = 520) 53 | brc.fit(matrix) 54 | 55 | y_pred = brc.labels_.tolist() 56 | success(brc, y_pred, matrix) 57 | 58 | 59 | ######################################################### 60 | ####################HAC################################## 61 | ######################################################### 62 | from sklearn.cluster import AgglomerativeClustering 63 | hac = AgglomerativeClustering(n_clusters=520, affinity = "euclidean") 64 | hac.fit(matrix) 65 | #dense_matrix = tfidf_matrix.todense() 66 | 67 | #from sklearn.externals import joblib 68 | #Saves the model you just made 69 | #joblib.dump(hac, '350_euc_HAC_ENTS.pkl') 70 | #hac = joblib.load("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/HAC_Cluster_Models/350_euc_HAC.pkl") 71 | 72 | y_pred = hac.labels_.tolist() 73 | success(hac, y_pred, matrix) 74 | 75 | 76 | ######################################################### 77 | ####################KEMANS############################### 78 | ######################################################### 79 | from sklearn.cluster import KMeans 80 | num_clusters = 520 81 | km = KMeans(n_clusters = num_clusters) 82 | km.fit(matrix) 83 | 84 | y_pred = km.labels_.tolist() 85 | success(km, y_pred, matrix) 86 | 87 | 88 | 89 | 90 | 91 | ######################################################### 92 | ###############KMEANS CLUSTER EXPLORING################## 93 | ######################################################### 94 | def tokenize_and_stem(text): 95 | tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 96 | filtered_tokens = [] 97 | for token in tokens: 98 | if re.search('[a-zA-Z]', token): 99 | filtered_tokens.append(token) 100 | stems = [stemmer.stem(t) for t in filtered_tokens] 101 | return stems 102 | 103 | def tokenize_only(text): 104 | tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 105 | filtered_tokens = [] 106 | for token in tokens: 107 | if re.search('[a-zA-Z]', token): 108 | filtered_tokens.append(token) 109 | return filtered_tokens 110 | 111 | 112 | totalvocab_stemmed = [] 113 | totalvocab_tokenized = [] 114 | for i in corpus: 115 | allwords_stemmed = tokenize_and_stem(i) 116 | totalvocab_stemmed.extend(allwords_stemmed) 117 | allwords_tokenized = tokenize_only(i) 118 | totalvocab_tokenized.extend(allwords_tokenized) 119 | 120 | #Let's you search with stemmed word to see original format of word 121 | vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed) 122 | print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame') 123 | 124 | 125 | articles = {"title": titles, "date": new_df["date"], "cluster": clusters, "content": new_df["content"], "event": events[:1000]} 126 | frame = pd.DataFrame(articles, index = [clusters] , columns = ['title', 'date', 'cluster', 'content', "event"]) 127 | frame['cluster'].value_counts() 128 | 129 | order_centroids = km.cluster_centers_.argsort()[:, ::-1] 130 | 131 | from collections import Counter 132 | #Creates a count dict (success) to see how many instances of the same event are clustered together 133 | for i in clusters[:100]: 134 | print("Cluster %d words:" % i, end='') 135 | for ind in order_centroids[i, :6]: #replace 6 with n words per cluster 136 | 
print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',') 137 | print() 138 | print() 139 | print() 140 | counts = [] 141 | for event in frame.loc[i]["event"].values.tolist(): 142 | counts.append(event) 143 | counts = dict(Counter(counts)) 144 | print(counts) 145 | print() 146 | print() 147 | 148 | 149 | #Allows you to zoom in on a specific cluster, see what words make that cluster unique 150 | for i in clusters: 151 | if i == 244: #Change 2 to the cluster 152 | print("Cluster %d words:" % i, end='') 153 | for ind in order_centroids[i, :5]: #replace 20 with n words per cluster 154 | print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',') 155 | counts = [] 156 | for event in frame.ix[i]["event"].values.tolist(): 157 | counts.append(event) 158 | counts = dict(Counter(counts)) 159 | print(counts) 160 | print() 161 | print() 162 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parkervg/news-article-clustering/bca5f12fde915a3bc1a103af7937eec89e3a7a2d/data/.DS_Store -------------------------------------------------------------------------------- /exploring_entities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Apr 8 13:27:43 2019 5 | 6 | @author: parkerglenn 7 | """ 8 | 9 | 10 | 11 | """ 12 | WITH NO ENTITY MANIPULATION: 13 | 97.9% of non-PERSON ents occur in at least one other cluster. 14 | 51.5% of PERSON ents occur in at least one other cluster. 15 | 16 | 17 | 18 | WITH ENTITY MANIPULATION: 19 | 96.2% of non-PERSON ents occur in at least one other cluster. 20 | 52.1% of PERSON ents occur in at least one other cluster. 21 | 22 | 23 | Average of 51.832 PERSONs per cluster 24 | Average of 64.94 non-PERSONs per cluster 25 | 26 | 1795 PERSON tags only appear once. 27 | 6.9% of all PERSON tags are not helpful in determining cluster boundaries. 28 | 1683 NON_PERSON tags only appear once. 29 | 5.0% of all NON_PERSON tags are not helpful in determining cluster boundaries. 30 | 31 | Non-People entities occured 6,554 more times than people entities. 32 | 33 | 34 | 35 | "GPE": Countries, cities, states 36 | "ORG": Organizations 37 | "LOC": Non-GPE locations 38 | "EVENT": Named hurricanes, sports events, etc. 39 | "FAC": Buildings, airports, highways, bridges 40 | "PERSON": All people, including fictional 41 | """ 42 | 43 | from collections import Counter 44 | ent_corpus = [] 45 | unique_ents = [] 46 | for article in toks: 47 | article_ents = ([tok for tok in article if tok.startswith("*") == True or tok.startswith("&") == True and tok != "*’m"]) 48 | for tok in article: 49 | if tok not in unique_ents: 50 | unique_ents.append(tok) 51 | ent_corpus.append(article_ents) 52 | 53 | # Creates cluster_ents, a dict where the occurences of ents per cluster are counted. 
54 | cluster_ents = {} 55 | for place, cluster in enumerate(y_pred): 56 | cluster_ents[cluster] = cluster_ents.get(cluster,ent_corpus[place]) + ent_corpus[place] 57 | 58 | 59 | cluster_ents_count = {} 60 | for cluster in cluster_ents: 61 | cluster_ents_count[cluster] = dict(Counter(cluster_ents[cluster])) 62 | 63 | # Creates dict for the amount of times an entity is used across distinct clusters 64 | dup_clusters1 = {} 65 | for base_ent in unique_ents: 66 | for cluster, ents in cluster_ents.items(): 67 | if base_ent in ents: 68 | dup_clusters1[base_ent] = dup_clusters1.get(base_ent, -1) + 1 69 | dup_clusters = {k:v for k,v in dup_clusters1.items() if v != 0} 70 | 71 | 72 | """ 73 | How many one-off person ents/non person ents are there? 74 | 75 | If more one-off person ents, this explains s score. 76 | """ 77 | 78 | ent_occurences = {} 79 | for cluster in cluster_ents_count: 80 | for ent, value in cluster_ents_count[cluster].items(): 81 | try: 82 | ent_occurences[ent] += value 83 | except: 84 | ent_occurences[ent] = value 85 | 86 | 87 | one_off_persons = 0 88 | one_off_notpersons = 0 89 | delp = [] 90 | delnp = [] 91 | for k, v in ent_occurences.items(): 92 | if k.startswith("*") and v == 1: 93 | one_off_persons += 1 94 | delp.append(k) 95 | elif k.startswith("&") and v == 1: 96 | one_off_notpersons += 1 97 | delnp.append(k) 98 | one_off_persons 99 | one_off_notpersons 100 | 101 | 102 | event_ents = {} 103 | for article, toks in enumerate(ent_corpus): 104 | event_ents[events[article]] = event_ents.get(events[article], toks) + toks 105 | for event,toks in event_ents.items(): 106 | event_ents[event] = dict(Counter(toks)) 107 | 108 | 109 | 110 | distribution = pd.DataFrame(columns = ["event","ent","type","ratio"]) 111 | place = -1 112 | for event, counts in event_ents.items(): 113 | for ent, value in counts.items(): 114 | place += 1 115 | if ent.startswith("*") and ent not in delp: 116 | distribution.loc[place] = [event, ent, "PERSON", (value / ent_occurences[ent])] 117 | if ent.startswith("&") and ent not in delnp: 118 | distribution.loc[place] = [event, ent, "NON_PERSON", (value / ent_occurences[ent])] 119 | 120 | 121 | people_dist = distribution.loc[distribution["type"] == "PERSON"] 122 | not_people_dist = distribution.loc[distribution["type"] == "NON_PERSON"] 123 | 124 | 125 | 126 | pval = 0 127 | for value in people_dist["ratio"]: 128 | if value > .7: 129 | pval +=1 130 | 131 | npval = 0 132 | for value in not_people_dist["ratio"]: 133 | if value > .7: 134 | npval +=1 135 | 136 | 137 | 138 | 139 | 140 | import matplotlib.pyplot as plt 141 | import seaborn as sns 142 | fig, (ax1,ax2) = plt.subplots(ncols = 2) 143 | fig.subplots_adjust(wspace = 0.01) 144 | sns.heatmap(distribution, cmap = "rocket", ax = ax, cbar = False) 145 | 146 | sns.heatmap(distribution.loc[distribution["type"] == "PERSON"]) 147 | 148 | 149 | 150 | 151 | 152 | 153 | """ 154 | Entity Usage Across Distinct Clusters 155 | 156 | """ 157 | ents = pd.DataFrame(columns = ["ent","occurence"]) 158 | people = pd.DataFrame(columns = ["ent","occurence"]) 159 | p = [] 160 | n = [] 161 | p1 = [] 162 | n1 = [] 163 | for k in dup_clusters: 164 | if k.startswith("&"): 165 | n.append(k) 166 | n1.append(dup_clusters[k]) 167 | elif k.startswith("*"): 168 | p.append(k) 169 | p1.append(dup_clusters[k]) 170 | 171 | 172 | ents["ent"] = n 173 | ents["occurence"] = n1 174 | ents["type"] = "ent" 175 | people["ent"] = p 176 | people["occurence"] = p1 177 | people["type"] = "person" 178 | ents = ents.sort_values(by = "occurence", ascending = 
False) 179 | people = people.sort_values(by = "occurence", ascending = False) 180 | 181 | 182 | people_2plt = people[1:6] 183 | ents_2plt = ents[:5] 184 | fig, ax = plt.subplots(1,2, sharey = True, figsize = (15,8)) 185 | fig.suptitle("Entity Usage Across Distinct Clusters", fontsize=14) 186 | sns.set(style = "darkgrid") 187 | sns.barplot(x = "ent", y = "occurence", hue = "type", data = people_2plt, ax = ax[0], palette=["C0"]) 188 | sns.barplot(x = "ent", y = "occurence", hue = "type", data = ents_2plt,ax = ax[1], palette=["C1"]) 189 | for a in ax: 190 | a.set_xlabel('Entity') 191 | a.set_ylabel('Occurences') 192 | fig.show() 193 | 194 | 195 | ents = pd.DataFrame(columns = ["ent","occurence"]) 196 | people = pd.DataFrame(columns = ["ent","occurence"]) 197 | p = [] 198 | n = [] 199 | p1 = [] 200 | n1 = [] 201 | for k, v in ent_occurences.items(): 202 | if k.startswith("&"): 203 | n.append(k) 204 | n1.append(v) 205 | elif k.startswith("*"): 206 | p.append(k) 207 | p1.append(v) 208 | ents["ent"] = n 209 | ents["occurence"] = n1 210 | ents["type"] = "Non-Person" 211 | people["ent"] = p 212 | people["occurence"] = p1 213 | people["type"] = "Person" 214 | ents = ents.sort_values(by = "occurence", ascending = False) 215 | people = people.sort_values(by = "occurence", ascending = False) 216 | 217 | 218 | from matplotlib.colors import ListedColormap 219 | 220 | color1 = ["#13b23b"] 221 | color2 = ["#ffa100"] 222 | people_2plt = people[:5] 223 | ents_2plt = ents[:5] 224 | fig, ax = plt.subplots(1,2, sharey = True, figsize = (15,8)) 225 | #fig.suptitle("Entity Usage Across Articles", fontsize=25) 226 | sns.set(style = "darkgrid", font_scale = 1) 227 | one = sns.barplot(x = "ent", y = "occurence", hue = "type", data = people_2plt, ax = ax[0], palette=color1) 228 | two = sns.barplot(x = "ent", y = "occurence", hue = "type", data = ents_2plt,ax = ax[1], palette=color2) 229 | for item in one.get_xticklabels(): 230 | item.set_rotation(60) 231 | for item in two.get_xticklabels(): 232 | item.set_rotation(60) 233 | for a in ax: 234 | a.set_xlabel('Entity') 235 | a.set_ylabel('Occurences') 236 | fig.show() 237 | 238 | 239 | 240 | 241 | person = 0 242 | not_person = 0 243 | for ent, value in dup_clusters.items(): 244 | if ent.startswith("&"): 245 | not_person += value 246 | elif ent.startswith("*"): 247 | person += value 248 | print("Overall, non-person entities occured across distinct clusters {} more times than person entities did.".format(not_person - person)) 249 | 250 | person 251 | not_person 252 | 253 | 254 | 255 | 256 | all_people = 0 257 | all_not_people = 0 258 | for ent, value in ent_occurences.items(): 259 | if ent.startswith("&"): 260 | all_not_people += value 261 | if ent.startswith("*"): 262 | all_people += value 263 | print("Non-People entities occured {} more times than people entities.".format(all_not_people - all_people)) 264 | 265 | 266 | all_people 267 | all_not_people 268 | 269 | x = 0 270 | y = 0 271 | for ent in unique_ents: 272 | if ent.startswith("&"): 273 | x += 1 274 | if ent.startswith("*"): 275 | y += 1 276 | 277 | person / y 278 | not_person / x 279 | 280 | all_people / 500 281 | all_not_people / 500 282 | 283 | 1710 / all_people 284 | 1772 / all_not_people 285 | 286 | all_people 287 | all_not_people - all_people 288 | 289 | 290 | --------------------------------------------------------------------------------