├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.rst ├── index_template.html ├── out └── ep16 │ ├── europython_logo.png │ └── gael_simple.png └── topics_extraction.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "github-pages-publish"] 2 | path = github-pages-publish 3 | url = git@github.com:rafaelmartins/github-pages-publish.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Gael Varoquaux 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------

all: html

html: topics_extraction.py
	python topics_extraction.py

install: html
	python github-pages-publish/github-pages-publish . out/
	git push origin gh-pages

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------

====================================================
Topic modelling from EuroPython's list of abstracts
====================================================

This is the code to produce a list of topics from abstracts downloaded
from the conference website.

The different steps and corresponding modules are:

* **Web scraping** to retrieve the abstracts, based on `beautifulsoup4`
  and `urllib2`.

  `joblib` is also useful for caching, to avoid crawling the website and
  downloading the pages several times.

  I could have asked the organizers for access to a dump of the database,
  but it was more fun to crawl.

* **Stemming**: trying to convert plural words to singular, using `NLTK`.

  Note that stemming is in general more aggressive, and will reduce words
  to their roots, such as 'organization' -> 'organ'. To have
  understandable word clouds, we want to keep more differentiation. Hence
  we add a custom layer to reduce the power of the stemmer.

* **Topic modelling** with `scikit-learn`.

  It is a two-step process: first we convert the text data to a numerical
  representation ("vectorizing"); second we use a Non-negative Matrix
  Factorization to extract "topics" from these vectors.

* **Word-cloud figures** with the `wordcloud` module.

* **Create a webpage** with `tempita`.

----

This application beautifully combines multiple facets of the Python
ecosystem, from web tools to PyData.

--------------------------------------------------------------------------------
/index_template.html:
--------------------------------------------------------------------------------
[Template markup stripped when the repository was flattened; the recoverable
text and tempita directives are:]

    Topics in EuroPython's talks

    Too many abstracts to browse?

    Here is a list of topics and related talks, automatically extracted
    with data science. Code

    {{for loop, topic in looper(topics)}}
        {{topic.first_word}}   {{topic.second_word}}
    {{endfor}}

    Brought to you by
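The template above relies on tempita's built-in `looper` helper and on the
`first_word`/`second_word` attributes that `topics_extraction.py` attaches to each
topic via `tempita.bunch`. As a rough, self-contained sketch — not code from this
repository, with made-up topic words — this is how such a template gets filled in:

.. code:: python

    # Minimal tempita example: a template with a looper-driven for loop is
    # substituted with a list of bunch records, mirroring index_template.html.
    import tempita

    template = tempita.HTMLTemplate("""
    <ul>
    {{for loop, topic in looper(topics)}}
      <li>{{topic.first_word}} &mdash; {{topic.second_word}}</li>
    {{endfor}}
    </ul>
    """)

    # Toy data standing in for the records built by topics_extraction.py
    topics = [tempita.bunch(first_word='data', second_word='science'),
              tempita.bunch(first_word='web', second_word='django')]

    print(template.substitute(topics=topics))

`HTMLTemplate` HTML-escapes the substituted values, which is the reason the script
below uses it rather than the plain `Template`.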
--------------------------------------------------------------------------------
/out/ep16/europython_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaelVaroquaux/my_topics/c605ca14cb2f83fb6b8a26588910f7bdd9f7e7d9/out/ep16/europython_logo.png
--------------------------------------------------------------------------------
/out/ep16/gael_simple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaelVaroquaux/my_topics/c605ca14cb2f83fb6b8a26588910f7bdd9f7e7d9/out/ep16/gael_simple.png
--------------------------------------------------------------------------------
/topics_extraction.py:
--------------------------------------------------------------------------------
"""
Download the abstracts from the Internet, compute a topic model, plot word
clouds, and create a webpage.
"""
OUTPUT_DIR = "out/ep16"


###############################################################################
# Download the abstracts and URLs of the talks

# BeautifulSoup, for web scraping
import bs4
import urllib2

# joblib, for caching
import joblib

mem = joblib.Memory(cachedir='cache')

def get_list_of_talks():
    all_talks_urls = {}

    main_page = urllib2.urlopen(
        'https://ep2016.europython.eu/p3/schedule/ep2016/list/')
    tree = bs4.BeautifulSoup(main_page.read())

    rows = tree.find_all(name='td', attrs={'class': 'event'})

    for row in rows:
        divs = row.find_all(name='div', attrs={'class': 'name'})
        for div in divs:
            link = div.find_next(name='a')
            url = 'https://ep2016.europython.eu/' + link.attrs['href']
            title = link.get_text()
            if title:
                all_talks_urls[title] = url
                break

    return all_talks_urls


def grab_talk_description(talk_url):
    page = urllib2.urlopen(talk_url)
    tree = bs4.BeautifulSoup(page.read())

    # First extract the content
    content = tree.find_all(name='div', attrs={'class': 'cms'})[0].get_text()

    # Second grab the tags
    tag_div = tree.find_all(name='div', attrs={'class': 'all-tags'})
    if tag_div:
        tags = [t.get_text()
                for t in tag_div[0].find_all(name='span',
                                             attrs={'class': 'tag'})]
    else:
        tags = []

    return content, tags


all_talks_urls = mem.cache(get_list_of_talks)()

all_talks_description = {}
all_talks_document = {}

for title, url in all_talks_urls.items():
    content, tags = mem.cache(grab_talk_description)(url)
    all_talks_description[title] = content
    # Add the tags repeated 3 times to the content, to give them more weight
    all_talks_document[title] = '%s %s' % (content, ' '.join(3 * tags))


# Make a list of documents (sort for reproducibility)
documents = [d for t, d in sorted(all_talks_document.items())]


###############################################################################
# Stemming: converting words to a canonical form. Here we only worry
# about plurals

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

PROTECTED_WORDS = ['pandas', 'itertools']

def no_plural_stemmer(word):
    """ A stemmer that tries to act only on plurals. The goal is to keep
        the words readable.
    """
    word = word.lower()
    if word.endswith('s') and not (word in PROTECTED_WORDS
                                   or word.endswith('sis')):
        stemmed_word = stemmer.stem(word)
        if len(stemmed_word) == len(word) - 1:
            word = stemmed_word
    return word


###############################################################################
# Learn the topic model
n_features = 1000
n_topics = 10

# First we "vectorize": converting every document to a vector of numbers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (no_plural_stemmer(w) for w in analyzer(doc))

# We use a few heuristics to filter out useless terms early on: common
# English words, words occurring in only one document, and words occurring
# in at least 95% of the documents are removed.

# Use tf-idf features for NMF.
tfidf_vectorizer = StemmedTfidfVectorizer(max_df=0.95, min_df=2,
                                          max_features=n_features,
                                          stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)

# Fit the topic model: a Non-negative Matrix Factorization
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)

feature_names = tfidf_vectorizer.get_feature_names()

# The loading of each topic on each document
doc_loadings = nmf.transform(tfidf)


###############################################################################
# Plot word-cloud figures for each topic
import os
import itertools

from wordcloud import WordCloud

def my_color_func(word=None, font_size=None, position=None,
                  orientation=None, font_path=None, random_state=None):
    """ Hue increasing with the font size (110 + 3 * font_size), with
        saturation 90% and lightness 20% """
    return "hsl(%d, 90%%, 20%%)" % (110 + 3 * font_size)


# First create an ellipse mask
import numpy as np
x, y = np.ogrid[-1:1:250j, -1:1:450j]
mask = (255 * ((x ** 2 + y ** 2) > 1)).astype(int)

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Generate a word-cloud image using the frequencies assigned to the terms
for topic_idx, topic in enumerate(nmf.components_):
    freq_cloud = WordCloud(max_font_size=40, relative_scaling=0.5,
                           #background_color=None, mode="RGBA",
                           background_color='white', mode="RGBA",
                           mask=mask, color_func=my_color_func,
                           scale=1.5)
    frequencies = [(w, f)
                   for w, f in itertools.izip(feature_names, topic)
                   if f != 0]
    freq_cloud.generate_from_frequencies(frequencies)
    freq_cloud.to_file(os.path.join(OUTPUT_DIR, 'topic_%02i.png' % topic_idx))


###############################################################################
# Output an HTML file using tempita

titles_and_urls = sorted(all_talks_urls.items())

import tempita

# First create the information that will go in the file
topics = list()
for topic, loading in itertools.izip(nmf.components_, doc_loadings.T):
    frequencies = [(f, w)
                   for f, w in itertools.izip(topic, feature_names)
                   if f != 0]
    frequencies.sort(reverse=True)
    titles = [(l, t)
              for l, t in itertools.izip(loading, titles_and_urls)
              if l != 0]
    titles.sort(reverse=True)
    talks = [tempita.bunch(title=t[0], url=t[1],
                           description=(all_talks_description[t[0]]
                                        if len(all_talks_description[t[0]].strip()) > 1
                                        else ""))
             for l, t in titles]
    topic_desc = tempita.bunch(first_word=frequencies[0][1],
                               second_word=frequencies[1][1],
                               talks=talks[:10])
    topics.append(topic_desc)

template = tempita.HTMLTemplate.from_filename('index_template.html')

html = template.substitute(topics=topics)
open(os.path.join(OUTPUT_DIR, 'index.html'), 'w').write(html)

--------------------------------------------------------------------------------
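topics_extraction.py targets Python 2 (`urllib2`, `itertools.izip`) and the
scikit-learn API of 2016 (`get_feature_names`, NMF's `alpha` parameter). For readers
who want to reproduce just the topic-modelling step on a current stack, here is a
minimal sketch assuming Python 3 and a recent scikit-learn; the toy documents are
invented for illustration and the parameters are only indicative:

.. code:: python

    # Same idea as in topics_extraction.py: TF-IDF vectorize the documents,
    # factorize with NMF, then read off the top-weighted terms of each topic.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import NMF

    documents = [
        "parallel computing with dask and joblib",
        "web scraping talks with beautifulsoup and requests",
        "machine learning pipelines in scikit-learn",
        "building web apps with django and flask",
        "statistics and machine learning with pandas",
    ]

    # max_df mirrors the script above; min_df is relaxed to 1 for this tiny corpus
    vectorizer = TfidfVectorizer(stop_words="english", min_df=1, max_df=0.95)
    tfidf = vectorizer.fit_transform(documents)

    # Two topics are enough here; init and random_state make the run reproducible
    nmf = NMF(n_components=2, init="nndsvd", random_state=0).fit(tfidf)

    feature_names = vectorizer.get_feature_names_out()  # get_feature_names() in older releases
    for topic_idx, topic in enumerate(nmf.components_):
        top = topic.argsort()[::-1][:4]  # indices of the 4 largest loadings
        print("Topic %d: %s" % (topic_idx,
                                ", ".join(feature_names[i] for i in top)))

The loop prints the highest-weighted terms of each NMF component, which is the same
way the script picks `first_word` and `second_word` for the word clouds and the web
page.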