├── tools ├── __init__.py ├── parameters.txt ├── my_parameters.py ├── my_stopwords.py ├── tools.py ├── burst_detection.py └── cleaning.py ├── .gitattributes ├── clusters.xlsx ├── stacked_vectors.p ├── burstvectors_500.p ├── README.md └── Data cleaning pipeline.ipynb /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.p filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /clusters.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etattershall/burst-detection/HEAD/clusters.xlsx -------------------------------------------------------------------------------- /stacked_vectors.p: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8abcdc2d6fd73ccdf926d6f4473c17037968d1ed7ef52b6d163b2fdcc41013fa 3 | size 412380031 4 | -------------------------------------------------------------------------------- /burstvectors_500.p: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fc6e0e1dc245aee557b66b3759f114e3dee26b23eee280e281df5560fdb854de 3 | size 104058929 4 | -------------------------------------------------------------------------------- /tools/parameters.txt: -------------------------------------------------------------------------------- 1 | def set_parameters(): 2 | parameters = { 3 | "ngram_length": 3, 4 | "min_yearly_df": 5, 5 | "significance_threshold": 0.0015, 6 | "years_above_significance": 3, 7 | "long_ma_length": 8, 8 | "short_ma_length": 4, 9 | "signal_line_ma": 3, 10 | "significance_ma_length": 3 11 | } 12 | return parameters 13 | -------------------------------------------------------------------------------- /tools/my_parameters.py: -------------------------------------------------------------------------------- 1 | def set_parameters(): 2 | parameters = { 3 | "ngram_length": 3, 4 | "min_yearly_df": 5, 5 | "significance_threshold": 0.0015, 6 | "years_above_significance": 3, 7 | "long_ma_length": 8, 8 | "short_ma_length": 4, 9 | "signal_line_ma": 3, 10 | "significance_ma_length": 3 11 | } 12 | return parameters 13 | -------------------------------------------------------------------------------- /tools/my_stopwords.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | 3 | def get_stopwords(): 4 | 5 | stop = set(stopwords.words('english')) 6 | stop = set([s.replace("'", "") for s in stop]) 7 | 8 | # Add years to prevent spikes 9 | for year in range(1900, 2020): 10 | stop.add(str(year)) 11 | 12 | # Add small numbers 13 | for num in range(0, 100): 14 | if len(str(num)) < 2: 15 | stop.add(str(num)) 16 | num = '0' + str(num) 17 | 18 | stop.add(str(num)) 19 | 20 | # Add these extra stopwords to the list 21 | extra = [ 22 | 'use', 'using', 'uses', 'used', 'based', 'including', 'include', 'approach', 23 | 'wa', 'ha', 'doe' 24 | ] 25 | for word in extra: 26 | stop.add(word) 27 | 28 | return(stop) 29 | 30 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | 2 | # Detecting Bursty Terms in Computer Science 3 | *burst-detection* 4 | 5 | Research topics rise and fall in popularity over time, some more swiftly than others. The fastest rising topics are typically called *bursts*; e.g. "deep learning", "internet of things" and "big data". Being able to detect and track bursty terms in the literature could give insight into how scientific thought evolves over time. 6 | 7 | In this repository, we take a trend detection algorithm from technical stock market analysis and apply it to 31 years of computer science research abstracts, treating the prevalence of each term in the dataset like the price of a stock. Unlike previous work in this domain, we use the free text of abstracts and titles, resulting in a finer-grained analysis. We report a list of bursty terms, then use historical data to build a classifier to predict whether they will rise or fall in popularity in the future, obtaining accuracy in the region of 80%. As a consequence, we now have a pipeline that can be applied to any time-ordered collection of text to yield past and present bursty terms and predict their probable fate. 8 | -------------------------------------------------------------------------------- /tools/tools.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | import numpy as np 3 | 4 | def remove_redundant_strings(cluster): 5 | ''' 6 | Takes a list of strings and removes those which are entirely contained within other strings 7 | ''' 8 | not_duplicated = [] 9 | 10 | for i in range(len(cluster)): 11 | duplicate = False 12 | for j in range(len(cluster)): 13 | if i == j: 14 | pass 15 | elif cluster[i]+'s' == cluster[j]: 16 | # Check for failures of lemmatisation 17 | # for instance, dataset, datasets 18 | pass 19 | elif cluster[i] == cluster[j]+'s': 20 | # Check for failures of lemmatisation 21 | duplicate = True 22 | elif cluster[i] in cluster[j]: 23 | duplicate = True 24 | if not duplicate: 25 | not_duplicated.append(cluster[i]) 26 | 27 | return not_duplicated 28 | 29 | 30 | def get_top_n_bursts(burstiness, n): 31 | return list(burstiness.nlargest(n, "max").index) 32 | 33 | def s_curve(x, a, b, c, d): 34 | return a / (1. + np.exp(-c * (x - d))) + b 35 | 36 | def all_subterms(term): 37 | subterms = term.split(' ') 38 | vectorizer = CountVectorizer(strip_accents='ascii', ngram_range=(1,len(subterms)-1)) 39 | vectorizer.fit_transform([term]) 40 | return list(vectorizer.vocabulary_) 41 | 42 | 43 | 44 | 45 | def normalise_time_series(time_series): 46 | # Normalise prevalance such that it is capped at 1 and has a minimum at 0. 
47 | return (time_series-time_series.min())/(time_series.max()-(time_series.min())) -------------------------------------------------------------------------------- /Data cleaning pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Data cleaning pipeline\n", 8 | "\n", 9 | "This notebook takes a dataset separated out by year into the format\n", 10 | "\n", 11 | "- 1975.csv\n", 12 | "- 1976.csv\n", 13 | "- 1977.csv\n", 14 | "\n", 15 | "etc., and transforms it into a set of dataframes of cleaned text, stored as a series of pickle (.p) files containing only the document id and the cleaned text." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import os\n", 25 | "import pandas as pd\n", 26 | "import pickle\n", 27 | "\n", 28 | "import sys\n", 29 | "sys.path.append(\"../tools\")\n", 30 | "import my_stopwords\n", 31 | "import my_parameters\n", 32 | "import cleaning" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 6, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "stop = my_stopwords.get_stopwords()\n", 42 | "parameters = my_parameters.set_parameters()\n", 43 | "\n", 44 | "dataset_name = 'dblp_cs'\n", 45 | "raw_data_path = 'Raw_Data'" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 30, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "1988 6906\n", 58 | "1989 7947\n", 59 | "1990 9328\n", 60 | "1991 10599\n", 61 | "1992 12985\n", 62 | "1993 15213\n", 63 | "1994 18756\n", 64 | "1995 20607\n", 65 | "1996 24408\n", 66 | "1997 27865\n", 67 | "1998 32629\n", 68 | "1999 35897\n", 69 | "2000 42482\n", 70 | "2001 45158\n", 71 | "2002 52898\n", 72 | "2003 64871\n", 73 | "2004 90084\n", 74 | "2005 105201\n", 75 | "2006 120579\n", 76 | "2007 132013\n", 77 | "2008 141659\n", 78 | "2009 152864\n", 79 | "2010 161380\n", 80 | "2011 173486\n", 81 | "2012 183729\n", 82 | "2013 189858\n", 83 | "2014 195136\n", 84 | "2015 195426\n", 85 | "2016 197102\n", 86 | "2017 188640\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "cleaner = cleaning.Clean(parameters[\"ngram_length\"])\n", 92 | "\n", 93 | "for year in range(1988,2018):\n", 94 | " df = pd.read_csv(raw_data_path+str(year)+'.csv')\n", 95 | " print(year, len(df))\n", 96 | " \n", 97 | " cleaned_text = []\n", 98 | " cleaned_df = pd.DataFrame()\n", 99 | " \n", 100 | " if 'language' in df.keys():\n", 101 | " for index, row in df[df['language']=='en'].iterrows():\n", 102 | " cleaned_text.append(cleaner.cleaning_pipeline(row['title'], row['abstract'], pad=False))\n", 103 | " \n", 104 | " cleaned_df['id'] = list(df[df['language']=='en'].ssid)\n", 105 | " else:\n", 106 | " for index, row in df.iterrows():\n", 107 | " cleaned_text.append(cleaner.cleaning_pipeline(row['title'], row['abstract'], pad=False))\n", 108 | " \n", 109 | " cleaned_df['id'] = list(df.ssid)\n", 110 | " \n", 111 | " cleaned_df['cleaned'] = cleaned_text\n", 112 | " \n", 113 | " pickle.dump(cleaned_df, open(dataset_name+\"/\"+str(year)+\".p\", \"wb\"))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "raw", 118 | "metadata": {}, 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | 
"name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.7.11" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /tools/burst_detection.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def calc_significance(stacked_vectors, significance_threshold, n): 5 | # Must have been above the significance threshold for two consecutive timesteps 6 | a = stacked_vectors>significance_threshold 7 | b = a.rolling(window=n).sum() 8 | return stacked_vectors[stacked_vectors.axes[1][np.where(b.max()>=n)[0]]] 9 | 10 | 11 | class BurstDetection: 12 | def __init__(self, short_ma_length, long_ma_length, significance_ma_length, signal_line_ma): 13 | self.short_ma_length = short_ma_length 14 | self.long_ma_length = long_ma_length 15 | self.significance_ma_length = significance_ma_length 16 | self.signal_line_ma = signal_line_ma 17 | 18 | def calc_macd(self, stacked_vectors, ema=True): 19 | if ema: 20 | long_ma = stacked_vectors.ewm(span=self.long_ma_length).mean() 21 | short_ma = stacked_vectors.ewm(span=self.short_ma_length).mean() 22 | significance_ma = stacked_vectors.ewm(span=self.significance_ma_length).mean() 23 | macd = short_ma - long_ma 24 | signal = macd.ewm(span=self.signal_line_ma).mean() 25 | hist = macd - signal 26 | return long_ma, short_ma, significance_ma, macd, signal, hist 27 | else: 28 | long_ma = stacked_vectors.rolling(self.long_ma_length).mean() 29 | short_ma = stacked_vectors.rolling(self.short_ma_length).mean() 30 | significance_ma = stacked_vectors.rolling(self.significance_ma_length).mean() 31 | macd = short_ma - long_ma 32 | signal = macd.rolling(self.signal_line_ma).mean() 33 | hist = macd - signal 34 | return long_ma, short_ma, significance_ma, macd, signal, hist 35 | 36 | 37 | 38 | def calc_burstiness(self, hist, scaling_factor): 39 | return hist.iloc[self.long_ma_length-1:]/scaling_factor 40 | 41 | def calc_scaling(self, significance_ma, method): 42 | if method == "max": 43 | scaling = significance_ma.iloc[self.significance_ma_length-1:].max() 44 | elif method == "mean": 45 | scaling = significance_ma.iloc[self.significance_ma_length-1:].mean() 46 | elif method == "sqrt": 47 | scaling = np.sqrt(significance_ma.iloc[self.significance_ma_length-1:].max() ) 48 | return scaling 49 | 50 | def max_burstiness(self, burstiness, absolute): 51 | if absolute: 52 | b = pd.concat([np.abs(burstiness).max(), np.abs(burstiness).idxmax()], axis=1) 53 | else: 54 | b = pd.concat([burstiness.max(), burstiness.idxmax()], axis=1) 55 | b.columns = ["max", "location"] 56 | return b 57 | 58 | def my_burstiness(self, stacked_vectors, absolute=True, method="sqrt"): 59 | long_ma, short_ma, significance_ma, macd, signal, hist = self.calc_macd(stacked_vectors) 60 | scaling_factor = self.calc_scaling(significance_ma, method) 61 | burstiness_over_time = self.calc_burstiness(hist, scaling_factor) 62 | burstiness = self.max_burstiness(burstiness_over_time, absolute=absolute) 63 | return(burstiness) 64 | 65 | 66 | class Dataset: 67 | def __init__(self, name, years, stacked_vectors): 68 | self.name = name 69 | self.stacked_vectors = stacked_vectors 70 | self.years = years 71 | 72 | def get_sig_stacked_vectors(self, significance_threshold, 
years_above_significance): 73 | normalisation = self.stacked_vectors.sum(axis=1) 74 | self.sig_stacked_vectors = calc_significance(self.stacked_vectors.divide(normalisation, axis="index")*100, significance_threshold, years_above_significance) 75 | 76 | def get_burstiness(self, short_ma_length, long_ma_length, significance_ma_length, signal_line_ma_length, ema=True, scaling_type="sqrt", absolute=True): 77 | bd = BurstDetection(short_ma_length, long_ma_length, significance_ma_length, signal_line_ma_length) 78 | long_ma, short_ma, significance_ma, macd, signal, hist = bd.calc_macd(self.sig_stacked_vectors, ema=ema) 79 | self.scaling_factor = bd.calc_scaling(significance_ma, scaling_type) 80 | burstiness_over_time = bd.calc_burstiness(hist, self.scaling_factor) 81 | self.burstiness = bd.max_burstiness(burstiness_over_time, absolute=absolute) 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /tools/cleaning.py: -------------------------------------------------------------------------------- 1 | # data cleaning 2 | import re 3 | 4 | # lemmatisation 5 | from collections import defaultdict 6 | from nltk.stem import WordNetLemmatizer 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | 9 | class Clean: 10 | def __init__(self, ngram_length): 11 | self.ngram_length = ngram_length 12 | 13 | self.alphabets ="([A-Za-z])" 14 | self.prefixes = "(Mr|St|Mrs|Ms|Dr)[.]" 15 | self.suffixes = "(Inc|Ltd|Jr|Sr|Co)" 16 | self.starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" 17 | self.acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" 18 | self.websites = "[.](com|net|org|io|gov)" 19 | self.htmltags = '<[^>]+>' 20 | self.htmlspecial = '&#?[xX]?[a-zA-Z0-9]{2,8};' 21 | self.start_delimiter = 'documentstart' 22 | self.sent_delimiter = 'sentenceboundary' 23 | self.end_delimiter = 'documentend' 24 | 25 | # Download the lemmatisesr 26 | self.wnl = WordNetLemmatizer() 27 | 28 | # Create a tokeniser 29 | count = CountVectorizer(strip_accents='ascii', min_df=1) 30 | self.tokeniser = count.build_analyzer() 31 | 32 | def normalise_acronymns(self, text): 33 | ''' 34 | Remove the periods in acronyms. 35 | Adapted from the method found at https://stackoverflow.com/a/40197005 36 | ''' 37 | 38 | # deal with single letters before sentence boundaries 39 | text = re.sub(r'\s([A-Z, a-z])\.\s', r' \1.. ', text) 40 | return re.sub(r'(?",text) 54 | text = re.sub(self.websites,"\\1",text) 55 | 56 | # my addition 57 | text = re.sub(self.htmltags, " ", text) 58 | text = re.sub(self.htmlspecial, " ", text) 59 | 60 | if "Ph.D" in text: 61 | text = text.replace("Ph.D.","PhD") 62 | 63 | text = re.sub("\s" + self.alphabets + "[.] "," \\1",text) 64 | text = re.sub(self.acronyms+" "+self.starters,"\\1 \\2",text) 65 | text = re.sub(self.alphabets + "[.]" + self.alphabets + "[.]" + self.alphabets + "[.]","\\1\\2\\3",text) 66 | text = re.sub(self.alphabets + "[.]" + self.alphabets + "[.]","\\1\\2",text) 67 | text = re.sub(" "+self.suffixes+"[.] "+self.starters," \\1 \\2",text) 68 | text = re.sub(" "+self.suffixes+"[.]"," \\1",text) 69 | text = re.sub(" " + self.alphabets + "[.]"," \\1",text) 70 | 71 | if "”" in text: 72 | text = text.replace(".”","”.") 73 | if "\"" in text: 74 | text = text.replace(".\"","\".") 75 | if "!" in text: 76 | text = text.replace("!\"","\"!") 77 | if "?" 
in text: 78 | text = text.replace("?\"","\"?") 79 | # mark sentence boundaries with a sentinel token, then split on it 80 | text = text.replace(".","<stop>") 81 | text = text.replace("?","<stop>") 82 | text = text.replace("!","<stop>") 83 | 84 | sentences = text.split("<stop>") 85 | sentences = [s.strip() for s in sentences] 86 | 87 | non_empty = [] 88 | for s in sentences: 89 | # we require that there be two alphanumeric characters in a row 90 | if len(re.findall("[A-Za-z0-9][A-Za-z0-9]", s)) > 0: 91 | non_empty.append(s) 92 | return non_empty 93 | 94 | def pad_sentences(self, sentences): 95 | ''' 96 | Takes a list of sentences and returns a string in which: 97 | - The beginning of the abstract is indicated by DOCUMENTSTART 98 | - The end is indicated by DOCUMENTEND 99 | - Sentence boundaries are indicated by SENTENCEBOUNDARY 100 | 101 | The number of delimiters used is dependent on the ngram length 102 | ''' 103 | sent_string = (' '+(self.sent_delimiter+' ')*(self.ngram_length-1)).join(sentences) 104 | 105 | return (self.start_delimiter+' ')*(self.ngram_length-1) + sent_string + (' '+self.end_delimiter)*(self.ngram_length-1) 106 | 107 | def cleaning_pipeline(self, title, abstract, pad=True): 108 | ''' 109 | Takes a title and abstract and returns a single string of cleaned text, stripped of punctuation and lemmatised 110 | ''' 111 | 112 | # Check that title and abstract exist (missing values come through as float NaN) 113 | if type(title) is not float: 114 | title = self.normalise_decimals(self.normalise_acronymns(title)) 115 | else: 116 | title = '' 117 | 118 | if type(abstract) is not float: 119 | abstract = self.normalise_decimals(self.normalise_acronymns(abstract)) 120 | else: 121 | abstract = '' 122 | 123 | if pad: 124 | sentences = [title] + self.split_into_sentences(abstract) 125 | 126 | # strip out punctuation and make lowercase 127 | clean_sentences = [] 128 | for s in sentences: 129 | 130 | # Deal with special cases 131 | s = re.sub(r'[-/]', ' ', s) 132 | 133 | # Remove all other punctuation 134 | s = re.sub(r'[^\w\s]','',s) 135 | 136 | clean_sentences.append(s.lower()) 137 | 138 | # pad sentences with delimiters 139 | 140 | text = self.pad_sentences(clean_sentences) 141 | 142 | else: 143 | text = title + '. ' + abstract 144 | text = re.sub(r'[-/]', ' ', text) 145 | text = re.sub(r'[^\w\s]','',text) 146 | text = text.lower() 147 | 148 | # Lemmatise word by word 149 | lemmas = [] 150 | for word in self.tokeniser(text): 151 | lemmas.append(self.wnl.lemmatize(word)) 152 | 153 | return ' '.join(lemmas) 154 | --------------------------------------------------------------------------------
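The data cleaning notebook above calls tools/cleaning.py with pad=False to produce one lowercased, lemmatised string per paper. A minimal standalone sketch of the same call, assuming the script is run from the repository root and that the NLTK stopwords and wordnet corpora are available; the example title and abstract are invented:

    import nltk

    # One-off downloads needed by my_stopwords (stopwords) and the WordNet lemmatiser
    nltk.download("stopwords")
    nltk.download("wordnet")

    from tools import cleaning, my_parameters, my_stopwords

    stop = my_stopwords.get_stopwords()          # includes years 1900-2019 and small numbers
    parameters = my_parameters.set_parameters()  # ngram_length, thresholds, moving-average lengths

    cleaner = cleaning.Clean(parameters["ngram_length"])

    # Invented example record
    title = "Burst Detection in Research Abstracts"
    abstract = "We track the prevalence of terms over 31 years. Terms such as deep learning rise sharply."

    # pad=False, as in the notebook: one cleaned, lemmatised string per document
    print(cleaner.cleaning_pipeline(title, abstract, pad=False))

    # pad=True instead wraps the text in documentstart / sentenceboundary / documentend
    # delimiters so that extracted n-grams never straddle sentence boundaries

In the notebook, the resulting id/cleaned dataframes are pickled per year; those are presumably the input from which the years-by-terms matrix in stacked_vectors.p is later built.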
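tools/burst_detection.py implements the stock-market-style trend measure the README describes: for each term's yearly prevalence it computes a short and a long exponential moving average, takes their difference (MACD), subtracts a signal line, and scales the result to give a burstiness score. A minimal sketch of driving it with the shipped parameters, assuming stacked_vectors.p (stored via Git LFS) unpickles to a pandas DataFrame with one row per year and one column per term; the loading step is an assumption, while the class, method and parameter names are taken from the repository:

    import pickle

    from tools.my_parameters import set_parameters
    from tools.burst_detection import Dataset
    from tools.tools import get_top_n_bursts

    parameters = set_parameters()

    # Assumed: a years-by-terms DataFrame of term document frequencies
    with open("stacked_vectors.p", "rb") as f:
        stacked_vectors = pickle.load(f)

    ds = Dataset("dblp_cs", years=list(stacked_vectors.index), stacked_vectors=stacked_vectors)

    # Keep terms whose normalised prevalence exceeds the threshold for enough consecutive years
    ds.get_sig_stacked_vectors(parameters["significance_threshold"],
                               parameters["years_above_significance"])

    # MACD histogram (short EMA minus long EMA, minus the signal line),
    # scaled by the square root of the peak smoothed prevalence
    ds.get_burstiness(parameters["short_ma_length"],
                      parameters["long_ma_length"],
                      parameters["significance_ma_length"],
                      parameters["signal_line_ma"])

    # ds.burstiness has one row per term: its peak score ("max") and when it peaked ("location")
    print(get_top_n_bursts(ds.burstiness, 20))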
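The README also mentions predicting whether a bursty term will keep rising or fall back, and tools/tools.py exposes s_curve and normalise_time_series, which suggests fitting a logistic curve to each term's normalised prevalence history. The prediction code itself is not part of this snapshot, so the following is only an illustrative sketch using SciPy's curve_fit on an invented time series; it is not the repository's classifier:

    import numpy as np
    import pandas as pd
    from scipy.optimize import curve_fit

    from tools.tools import s_curve, normalise_time_series

    # Invented yearly prevalence for one term, 1988-2017: a noisy logistic rise
    years = np.arange(1988, 2018)
    prevalence = pd.Series(
        1.0 / (1.0 + np.exp(-0.4 * (years - 2005))) + np.random.normal(0.0, 0.02, len(years)),
        index=years,
    )

    # Scale to [0, 1] so fitted parameters are comparable across terms
    norm = normalise_time_series(prevalence)

    # Fit s_curve(x, a, b, c, d) = a / (1 + exp(-c * (x - d))) + b
    x = np.arange(len(norm))
    popt, _ = curve_fit(s_curve, x, norm.values, p0=[1.0, 0.0, 1.0, len(norm) / 2], maxfev=10000)
    a, b, c, d = popt
    print(f"growth rate c = {c:.2f}, inflection at roughly year {norm.index[0] + d:.0f}")

A term whose fitted curve is still far below its plateau, with a positive growth rate, would be a natural "will continue to rise" signal; the features actually used in the repository may differ.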