├── tools ├── __init__.py ├── parameters.txt ├── my_parameters.py ├── my_stopwords.py ├── tools.py ├── burst_detection.py └── cleaning.py ├── .gitattributes ├── clusters.xlsx ├── stacked_vectors.p ├── burstvectors_500.p ├── README.md └── Data cleaning pipeline.ipynb /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.p filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /clusters.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etattershall/burst-detection/HEAD/clusters.xlsx -------------------------------------------------------------------------------- /stacked_vectors.p: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8abcdc2d6fd73ccdf926d6f4473c17037968d1ed7ef52b6d163b2fdcc41013fa 3 | size 412380031 4 | -------------------------------------------------------------------------------- /burstvectors_500.p: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fc6e0e1dc245aee557b66b3759f114e3dee26b23eee280e281df5560fdb854de 3 | size 104058929 4 | -------------------------------------------------------------------------------- /tools/parameters.txt: -------------------------------------------------------------------------------- 1 | def set_parameters(): 2 | parameters = { 3 | "ngram_length": 3, 4 | "min_yearly_df": 5, 5 | "significance_threshold": 0.0015, 6 | "years_above_significance": 3, 7 | "long_ma_length": 8, 8 | "short_ma_length": 4, 9 | "signal_line_ma": 3, 10 | "significance_ma_length": 3 11 | } 12 | return parameters 13 | -------------------------------------------------------------------------------- /tools/my_parameters.py: -------------------------------------------------------------------------------- 1 | def set_parameters(): 2 | parameters = { 3 | "ngram_length": 3, 4 | "min_yearly_df": 5, 5 | "significance_threshold": 0.0015, 6 | "years_above_significance": 3, 7 | "long_ma_length": 8, 8 | "short_ma_length": 4, 9 | "signal_line_ma": 3, 10 | "significance_ma_length": 3 11 | } 12 | return parameters 13 | -------------------------------------------------------------------------------- /tools/my_stopwords.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | 3 | def get_stopwords(): 4 | 5 | stop = set(stopwords.words('english')) 6 | stop = set([s.replace("'", "") for s in stop]) 7 | 8 | # Add years to prevent spikes 9 | for year in range(1900, 2020): 10 | stop.add(str(year)) 11 | 12 | # Add small numbers 13 | for num in range(0, 100): 14 | if len(str(num)) < 2: 15 | stop.add(str(num)) 16 | num = '0' + str(num) 17 | 18 | stop.add(str(num)) 19 | 20 | # Add these extra stopwords to the list 21 | extra = [ 22 | 'use', 'using', 'uses', 'used', 'based', 'including', 'include', 'approach', 23 | 'wa', 'ha', 'doe' 24 | ] 25 | for word in extra: 26 | stop.add(word) 27 | 28 | return(stop) 29 | 30 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | 2 | # Detecting Bursty Terms in Computer Science 3 | *burst-detection* 4 | 5 | Research topics rise and fall in popularity over time, some more swiftly than others. The fastest rising topics are typically called *bursts*; e.g. "deep learning", "internet of things" and "big data". Being able to detect and track bursty terms in the literature could give insight into how scientific thought evolves over time. 6 | 7 | In this repository, we take a trend detection algorithm from technical stock market analysis and apply it to 31 years of computer science research abstracts, treating the prevalence of each term in the dataset like the price of a stock. Unlike previous work in this domain, we use the free text of abstracts and titles, resulting in a finer-grained analysis. We report a list of bursty terms, then use historical data to build a classifier to predict whether they will rise or fall in popularity in the future, obtaining accuracy in the region of 80%. As a consequence, we now have a pipeline that can be applied to any time-ordered collection of text to yield past and present bursty terms and predict their probable fate. 8 | -------------------------------------------------------------------------------- /tools/tools.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | import numpy as np 3 | 4 | def remove_redundant_strings(cluster): 5 | ''' 6 | Takes a list of strings and removes those which are entirely contained within other strings 7 | ''' 8 | not_duplicated = [] 9 | 10 | for i in range(len(cluster)): 11 | duplicate = False 12 | for j in range(len(cluster)): 13 | if i == j: 14 | pass 15 | elif cluster[i]+'s' == cluster[j]: 16 | # Check for failures of lemmatisation 17 | # for instance, dataset, datasets 18 | pass 19 | elif cluster[i] == cluster[j]+'s': 20 | # Check for failures of lemmatisation 21 | duplicate = True 22 | elif cluster[i] in cluster[j]: 23 | duplicate = True 24 | if not duplicate: 25 | not_duplicated.append(cluster[i]) 26 | 27 | return not_duplicated 28 | 29 | 30 | def get_top_n_bursts(burstiness, n): 31 | return list(burstiness.nlargest(n, "max").index) 32 | 33 | def s_curve(x, a, b, c, d): 34 | return a / (1. + np.exp(-c * (x - d))) + b 35 | 36 | def all_subterms(term): 37 | subterms = term.split(' ') 38 | vectorizer = CountVectorizer(strip_accents='ascii', ngram_range=(1,len(subterms)-1)) 39 | vectorizer.fit_transform([term]) 40 | return list(vectorizer.vocabulary_) 41 | 42 | 43 | 44 | 45 | def normalise_time_series(time_series): 46 | # Normalise prevalance such that it is capped at 1 and has a minimum at 0. 
47 | return (time_series-time_series.min())/(time_series.max()-(time_series.min())) -------------------------------------------------------------------------------- /Data cleaning pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Data cleaning pipeline\n", 8 | "\n", 9 | "This notebook takes a dataset separated out by year into the format\n", 10 | "\n", 11 | "- 1975.csv\n", 12 | "- 1976.csv\n", 13 | "- 1977.csv\n", 14 | "\n", 15 | "etc., and transforms it into a set of dataframes of cleaned text, stored as a series of pickle (.p) files containing only the document id and the cleaned text." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import os\n", 25 | "import pandas as pd\n", 26 | "import pickle\n", 27 | "\n", 28 | "import sys\n", 29 | "sys.path.append(\"../tools\")\n", 30 | "import my_stopwords\n", 31 | "import my_parameters\n", 32 | "import cleaning" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 6, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "stop = my_stopwords.get_stopwords()\n", 42 | "parameters = my_parameters.set_parameters()\n", 43 | "\n", 44 | "dataset_name = 'dblp_cs'\n", 45 | "raw_data_path = 'Raw_Data'" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 30, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "1988 6906\n", 58 | "1989 7947\n", 59 | "1990 9328\n", 60 | "1991 10599\n", 61 | "1992 12985\n", 62 | "1993 15213\n", 63 | "1994 18756\n", 64 | "1995 20607\n", 65 | "1996 24408\n", 66 | "1997 27865\n", 67 | "1998 32629\n", 68 | "1999 35897\n", 69 | "2000 42482\n", 70 | "2001 45158\n", 71 | "2002 52898\n", 72 | "2003 64871\n", 73 | "2004 90084\n", 74 | "2005 105201\n", 75 | "2006 120579\n", 76 | "2007 132013\n", 77 | "2008 141659\n", 78 | "2009 152864\n", 79 | "2010 161380\n", 80 | "2011 173486\n", 81 | "2012 183729\n", 82 | "2013 189858\n", 83 | "2014 195136\n", 84 | "2015 195426\n", 85 | "2016 197102\n", 86 | "2017 188640\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "cleaner = cleaning.Clean(parameters[\"ngram_length\"])\n", 92 | "\n", 93 | "for year in range(1988,2018):\n", 94 | " df = pd.read_csv(raw_data_path+str(year)+'.csv')\n", 95 | " print(year, len(df))\n", 96 | " \n", 97 | " cleaned_text = []\n", 98 | " cleaned_df = pd.DataFrame()\n", 99 | " \n", 100 | " if 'language' in df.keys():\n", 101 | " for index, row in df[df['language']=='en'].iterrows():\n", 102 | " cleaned_text.append(cleaner.cleaning_pipeline(row['title'], row['abstract'], pad=False))\n", 103 | " \n", 104 | " cleaned_df['id'] = list(df[df['language']=='en'].ssid)\n", 105 | " else:\n", 106 | " for index, row in df.iterrows():\n", 107 | " cleaned_text.append(cleaner.cleaning_pipeline(row['title'], row['abstract'], pad=False))\n", 108 | " \n", 109 | " cleaned_df['id'] = list(df.ssid)\n", 110 | " \n", 111 | " cleaned_df['cleaned'] = cleaned_text\n", 112 | " \n", 113 | " pickle.dump(cleaned_df, open(dataset_name+\"/\"+str(year)+\".p\", \"wb\"))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "raw", 118 | "metadata": {}, 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | 
"name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.7.11" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /tools/burst_detection.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def calc_significance(stacked_vectors, significance_threshold, n): 5 | # Must have been above the significance threshold for two consecutive timesteps 6 | a = stacked_vectors>significance_threshold 7 | b = a.rolling(window=n).sum() 8 | return stacked_vectors[stacked_vectors.axes[1][np.where(b.max()>=n)[0]]] 9 | 10 | 11 | class BurstDetection: 12 | def __init__(self, short_ma_length, long_ma_length, significance_ma_length, signal_line_ma): 13 | self.short_ma_length = short_ma_length 14 | self.long_ma_length = long_ma_length 15 | self.significance_ma_length = significance_ma_length 16 | self.signal_line_ma = signal_line_ma 17 | 18 | def calc_macd(self, stacked_vectors, ema=True): 19 | if ema: 20 | long_ma = stacked_vectors.ewm(span=self.long_ma_length).mean() 21 | short_ma = stacked_vectors.ewm(span=self.short_ma_length).mean() 22 | significance_ma = stacked_vectors.ewm(span=self.significance_ma_length).mean() 23 | macd = short_ma - long_ma 24 | signal = macd.ewm(span=self.signal_line_ma).mean() 25 | hist = macd - signal 26 | return long_ma, short_ma, significance_ma, macd, signal, hist 27 | else: 28 | long_ma = stacked_vectors.rolling(self.long_ma_length).mean() 29 | short_ma = stacked_vectors.rolling(self.short_ma_length).mean() 30 | significance_ma = stacked_vectors.rolling(self.significance_ma_length).mean() 31 | macd = short_ma - long_ma 32 | signal = macd.rolling(self.signal_line_ma).mean() 33 | hist = macd - signal 34 | return long_ma, short_ma, significance_ma, macd, signal, hist 35 | 36 | 37 | 38 | def calc_burstiness(self, hist, scaling_factor): 39 | return hist.iloc[self.long_ma_length-1:]/scaling_factor 40 | 41 | def calc_scaling(self, significance_ma, method): 42 | if method == "max": 43 | scaling = significance_ma.iloc[self.significance_ma_length-1:].max() 44 | elif method == "mean": 45 | scaling = significance_ma.iloc[self.significance_ma_length-1:].mean() 46 | elif method == "sqrt": 47 | scaling = np.sqrt(significance_ma.iloc[self.significance_ma_length-1:].max() ) 48 | return scaling 49 | 50 | def max_burstiness(self, burstiness, absolute): 51 | if absolute: 52 | b = pd.concat([np.abs(burstiness).max(), np.abs(burstiness).idxmax()], axis=1) 53 | else: 54 | b = pd.concat([burstiness.max(), burstiness.idxmax()], axis=1) 55 | b.columns = ["max", "location"] 56 | return b 57 | 58 | def my_burstiness(self, stacked_vectors, absolute=True, method="sqrt"): 59 | long_ma, short_ma, significance_ma, macd, signal, hist = self.calc_macd(stacked_vectors) 60 | scaling_factor = self.calc_scaling(significance_ma, method) 61 | burstiness_over_time = self.calc_burstiness(hist, scaling_factor) 62 | burstiness = self.max_burstiness(burstiness_over_time, absolute=absolute) 63 | return(burstiness) 64 | 65 | 66 | class Dataset: 67 | def __init__(self, name, years, stacked_vectors): 68 | self.name = name 69 | self.stacked_vectors = stacked_vectors 70 | self.years = years 71 | 72 | def get_sig_stacked_vectors(self, significance_threshold, 
years_above_significance): 73 | normalisation = self.stacked_vectors.sum(axis=1) 74 | self.sig_stacked_vectors = calc_significance(self.stacked_vectors.divide(normalisation, axis="index")*100, significance_threshold, years_above_significance) 75 | 76 | def get_burstiness(self, short_ma_length, long_ma_length, significance_ma_length, signal_line_ma_length, ema=True, scaling_type="sqrt", absolute=True): 77 | bd = BurstDetection(short_ma_length, long_ma_length, significance_ma_length, signal_line_ma_length) 78 | long_ma, short_ma, significance_ma, macd, signal, hist = bd.calc_macd(self.sig_stacked_vectors, ema=ema) 79 | self.scaling_factor = bd.calc_scaling(significance_ma, scaling_type) 80 | burstiness_over_time = bd.calc_burstiness(hist, self.scaling_factor) 81 | self.burstiness = bd.max_burstiness(burstiness_over_time, absolute=absolute) 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /tools/cleaning.py: -------------------------------------------------------------------------------- 1 | # data cleaning 2 | import re 3 | 4 | # lemmatisation 5 | from collections import defaultdict 6 | from nltk.stem import WordNetLemmatizer 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | 9 | class Clean: 10 | def __init__(self, ngram_length): 11 | self.ngram_length = ngram_length 12 | 13 | self.alphabets ="([A-Za-z])" 14 | self.prefixes = "(Mr|St|Mrs|Ms|Dr)[.]" 15 | self.suffixes = "(Inc|Ltd|Jr|Sr|Co)" 16 | self.starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" 17 | self.acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" 18 | self.websites = "[.](com|net|org|io|gov)" 19 | self.htmltags = '<[^>]+>' 20 | self.htmlspecial = '&#?[xX]?[a-zA-Z0-9]{2,8};' 21 | self.start_delimiter = 'documentstart' 22 | self.sent_delimiter = 'sentenceboundary' 23 | self.end_delimiter = 'documentend' 24 | 25 | # Download the lemmatisesr 26 | self.wnl = WordNetLemmatizer() 27 | 28 | # Create a tokeniser 29 | count = CountVectorizer(strip_accents='ascii', min_df=1) 30 | self.tokeniser = count.build_analyzer() 31 | 32 | def normalise_acronymns(self, text): 33 | ''' 34 | Remove the periods in acronyms. 35 | Adapted from the method found at https://stackoverflow.com/a/40197005 36 | ''' 37 | 38 | # deal with single letters before sentence boundaries 39 | text = re.sub(r'\s([A-Z, a-z])\.\s', r' \1.. ', text) 40 | return re.sub(r'(?",text) 54 | text = re.sub(self.websites,"\\1",text) 55 | 56 | # my addition 57 | text = re.sub(self.htmltags, " ", text) 58 | text = re.sub(self.htmlspecial, " ", text) 59 | 60 | if "Ph.D" in text: 61 | text = text.replace("Ph.D.","PhD") 62 | 63 | text = re.sub("\s" + self.alphabets + "[.] "," \\1",text) 64 | text = re.sub(self.acronyms+" "+self.starters,"\\1 \\2",text) 65 | text = re.sub(self.alphabets + "[.]" + self.alphabets + "[.]" + self.alphabets + "[.]","\\1\\2\\3",text) 66 | text = re.sub(self.alphabets + "[.]" + self.alphabets + "[.]","\\1\\2",text) 67 | text = re.sub(" "+self.suffixes+"[.] "+self.starters," \\1 \\2",text) 68 | text = re.sub(" "+self.suffixes+"[.]"," \\1",text) 69 | text = re.sub(" " + self.alphabets + "[.]"," \\1",text) 70 | 71 | if "”" in text: 72 | text = text.replace(".”","”.") 73 | if "\"" in text: 74 | text = text.replace(".\"","\".") 75 | if "!" in text: 76 | text = text.replace("!\"","\"!") 77 | if "?" 
in text: 78 | text = text.replace("?\"","\"?") 79 | # mark sentence boundaries with a sentinel token, then split on it 80 | text = text.replace(".","<stop>") 81 | text = text.replace("?","<stop>") 82 | text = text.replace("!","<stop>") 83 | 84 | sentences = text.split("<stop>") 85 | sentences = [s.strip() for s in sentences] 86 | 87 | non_empty = [] 88 | for s in sentences: 89 | # we require that there be two alphanumeric characters in a row 90 | if len(re.findall("[A-Za-z0-9][A-Za-z0-9]", s)) > 0: 91 | non_empty.append(s) 92 | return non_empty 93 | 94 | def pad_sentences(self, sentences): 95 | ''' 96 | Takes a list of sentences and returns a string in which: 97 | - The beginning of the abstract is indicated by DOCUMENTSTART 98 | - The end is indicated by DOCUMENTEND 99 | - Sentence boundaries are indicated by SENTENCEBOUNDARY 100 | 101 | The number of delimiters used is dependent on the ngram length 102 | ''' 103 | sent_string = (' '+(self.sent_delimiter+' ')*(self.ngram_length-1)).join(sentences) 104 | 105 | return (self.start_delimiter+' ')*(self.ngram_length-1) + sent_string + (' '+self.end_delimiter)*(self.ngram_length-1) 106 | 107 | def cleaning_pipeline(self, title, abstract, pad=True): 108 | ''' 109 | Takes a title and abstract and returns a single string of cleaned text, stripped of punctuation and lemmatised 110 | ''' 111 | 112 | # Check that title and abstract exist (missing values come through as float NaN) 113 | if type(title) is not float: 114 | title = self.normalise_decimals(self.normalise_acronymns(title)) 115 | else: 116 | title = '' 117 | 118 | if type(abstract) is not float: 119 | abstract = self.normalise_decimals(self.normalise_acronymns(abstract)) 120 | else: 121 | abstract = '' 122 | 123 | if pad: 124 | sentences = [title] + self.split_into_sentences(abstract) 125 | 126 | # strip out punctuation and make lowercase 127 | clean_sentences = [] 128 | for s in sentences: 129 | 130 | # Deal with special cases 131 | s = re.sub(r'[-/]', ' ', s) 132 | 133 | # Remove all other punctuation 134 | s = re.sub(r'[^\w\s]','',s) 135 | 136 | clean_sentences.append(s.lower()) 137 | 138 | # pad sentences with delimiters 139 | 140 | text = self.pad_sentences(clean_sentences) 141 | 142 | else: 143 | text = title + '. ' + abstract 144 | text = re.sub(r'[-/]', ' ', text) 145 | text = re.sub(r'[^\w\s]','',text) 146 | text = text.lower() 147 | 148 | # Lemmatise word by word 149 | lemmas = [] 150 | for word in self.tokeniser(text): 151 | lemmas.append(self.wnl.lemmatize(word)) 152 | 153 | return ' '.join(lemmas) 154 | --------------------------------------------------------------------------------
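The data cleaning notebook above calls tools/cleaning.py with pad=False to produce one lowercased, lemmatised string per paper. A minimal standalone sketch of the same call, assuming the script is run from the repository root and that the NLTK stopwords and wordnet corpora are available; the example title and abstract are invented:

    import nltk

    # One-off downloads needed by my_stopwords (stopwords) and the WordNet lemmatiser
    nltk.download("stopwords")
    nltk.download("wordnet")

    from tools import cleaning, my_parameters, my_stopwords

    stop = my_stopwords.get_stopwords()          # includes years 1900-2019 and small numbers
    parameters = my_parameters.set_parameters()  # ngram_length, thresholds, moving-average lengths

    cleaner = cleaning.Clean(parameters["ngram_length"])

    # Invented example record
    title = "Burst Detection in Research Abstracts"
    abstract = "We track the prevalence of terms over 31 years. Terms such as deep learning rise sharply."

    # pad=False, as in the notebook: one cleaned, lemmatised string per document
    print(cleaner.cleaning_pipeline(title, abstract, pad=False))

    # pad=True instead wraps the text in documentstart / sentenceboundary / documentend
    # delimiters so that extracted n-grams never straddle sentence boundaries

In the notebook, the resulting id/cleaned dataframes are pickled per year; those are presumably the input from which the years-by-terms matrix in stacked_vectors.p is later built.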
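tools/burst_detection.py implements the stock-market-style trend measure the README describes: for each term's yearly prevalence it computes a short and a long exponential moving average, takes their difference (MACD), subtracts a signal line, and scales the result to give a burstiness score. A minimal sketch of driving it with the shipped parameters, assuming stacked_vectors.p (stored via Git LFS) unpickles to a pandas DataFrame with one row per year and one column per term; the loading step is an assumption, while the class, method and parameter names are taken from the repository:

    import pickle

    from tools.my_parameters import set_parameters
    from tools.burst_detection import Dataset
    from tools.tools import get_top_n_bursts

    parameters = set_parameters()

    # Assumed: a years-by-terms DataFrame of term document frequencies
    with open("stacked_vectors.p", "rb") as f:
        stacked_vectors = pickle.load(f)

    ds = Dataset("dblp_cs", years=list(stacked_vectors.index), stacked_vectors=stacked_vectors)

    # Keep terms whose normalised prevalence exceeds the threshold for enough consecutive years
    ds.get_sig_stacked_vectors(parameters["significance_threshold"],
                               parameters["years_above_significance"])

    # MACD histogram (short EMA minus long EMA, minus the signal line),
    # scaled by the square root of the peak smoothed prevalence
    ds.get_burstiness(parameters["short_ma_length"],
                      parameters["long_ma_length"],
                      parameters["significance_ma_length"],
                      parameters["signal_line_ma"])

    # ds.burstiness has one row per term: its peak score ("max") and when it peaked ("location")
    print(get_top_n_bursts(ds.burstiness, 20))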
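The README also mentions predicting whether a bursty term will keep rising or fall back, and tools/tools.py exposes s_curve and normalise_time_series, which suggests fitting a logistic curve to each term's normalised prevalence history. The prediction code itself is not part of this snapshot, so the following is only an illustrative sketch using SciPy's curve_fit on an invented time series; it is not the repository's classifier:

    import numpy as np
    import pandas as pd
    from scipy.optimize import curve_fit

    from tools.tools import s_curve, normalise_time_series

    # Invented yearly prevalence for one term, 1988-2017: a noisy logistic rise
    years = np.arange(1988, 2018)
    prevalence = pd.Series(
        1.0 / (1.0 + np.exp(-0.4 * (years - 2005))) + np.random.normal(0.0, 0.02, len(years)),
        index=years,
    )

    # Scale to [0, 1] so fitted parameters are comparable across terms
    norm = normalise_time_series(prevalence)

    # Fit s_curve(x, a, b, c, d) = a / (1 + exp(-c * (x - d))) + b
    x = np.arange(len(norm))
    popt, _ = curve_fit(s_curve, x, norm.values, p0=[1.0, 0.0, 1.0, len(norm) / 2], maxfev=10000)
    a, b, c, d = popt
    print(f"growth rate c = {c:.2f}, inflection at roughly year {norm.index[0] + d:.0f}")

A term whose fitted curve is still far below its plateau, with a positive growth rate, would be a natural "will continue to rise" signal; the features actually used in the repository may differ.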