├── .gitignore
├── LICENSE
├── report
│   ├── img
│   │   ├── cnn_loss_acc.png
│   │   ├── cnn_lstm_loss_acc.png
│   │   ├── lstm_dropout_loss_acc.png
│   │   └── lstm_no_dropout_loss_acc.png
│   ├── main.pdf
│   ├── main.tex
│   └── nips15submit_e.sty
├── sentenceclassification
│   ├── cnn_imdb.py
│   ├── cnn_lstm_imdb.py
│   ├── lstm_imdb.py
│   └── utils.py
└── word2vec
    └── embedding_word2vec.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Mario Ynocente Castro
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/report/img/cnn_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/cnn_loss_acc.png
--------------------------------------------------------------------------------
/report/img/cnn_lstm_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/cnn_lstm_loss_acc.png
--------------------------------------------------------------------------------
/report/img/lstm_dropout_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/lstm_dropout_loss_acc.png
--------------------------------------------------------------------------------
/report/img/lstm_no_dropout_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/lstm_no_dropout_loss_acc.png
--------------------------------------------------------------------------------
/report/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/main.pdf
--------------------------------------------------------------------------------
/report/main.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article} % For LaTeX2e
2 | \usepackage{nips15submit_e,times}
3 | %\usepackage{hyperref}
4 | %\usepackage{url}
5 | \usepackage{amsmath}
6 | %\newcommand\addtag{\refstepcounter{equation}\tag{\theequation}}
7 | \usepackage{graphicx}
8 | 
9 | \title{Word and sentence embeddings\\
10 | Sentence Classification with LSTMs/ConvNets}
11 | \author{
12 | Mario Ynocente Castro\\
13 | Master MVA\\
14 | ENS Cachan / Ecole Polytechnique\\
15 | \texttt{mario.ynocente-castro@polytechnique.edu}
16 | }
17 | 
18 | % The \author macro works with any number of authors. There are two commands
19 | % used to separate the names and addresses of multiple authors: \And and \AND.
20 | %
21 | % Using \And between authors leaves it to \LaTeX{} to determine where to break
22 | % the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
23 | % puts 3 of 4 authors names on the first line, and the last on the second
24 | % line, try using \AND instead of \And before the third author name.
25 | 
26 | \newcommand{\fix}{\marginpar{FIX}}
27 | \newcommand{\new}{\marginpar{NEW}}
28 | 
29 | \nipsfinalcopy % Uncomment for camera-ready version
30 | 
31 | \begin{document}
32 | \maketitle
33 | 
34 | \section{Word and sentence embeddings with word2vec}
35 | 
36 | \subsection{Loading models}
37 | 
38 | \begin{enumerate}
39 | \item
40 | What is the total number of raw words found in the corpus?
41 | 
42 | 17,005,207 words.
43 | 
44 | \item
45 | What is the number of words retained in the word2vec vocabulary
46 | (with default min\_count = 5)?
47 | 
48 | 71,290 words.
49 | \end{enumerate}
50 | 
51 | \subsection{Exploring the embedding space}
52 | 
53 | \begin{enumerate}
54 | \item
55 | What is the similarity between ('apple' and 'mac'), between ('apple' and 'peach'),
56 | between ('banana' and 'peach')? In your opinion, why are you asked about
57 | the three previous examples?
58 | 
59 | \begin{itemize}
60 | \item
61 | Similarity between apple and mac: 0.567861632452
62 | 
63 | \item
64 | Similarity between apple and peach: 0.178399832237
65 | 
66 | \item
67 | Similarity between banana and peach: 0.688715470006
68 | \end{itemize}
69 | 
70 | This illustrates that in this embedding space apple and mac are more similar than
71 | apple and peach, even though the latter two are both fruits, whereas banana and peach
72 | do get a high similarity. This probably happens because in the corpus the word apple
73 | refers more often to the brand than to the fruit, so it is more common to find it
74 | in the same contexts as the word mac.
75 | 
76 | \item
77 | What is the closest word to the word 'difficult', for 'model' and 'model\_phrase'.
78 | Comment about the difference between model and model\_phrase. Find the three phrases
79 | that are closest to the word 'clinton'.
80 | 
81 | \begin{itemize}
82 | \item
83 | Closest word to difficult for model: easy
84 | 
85 | \item
86 | Closest word to difficult for model\_phrase: very\_difficult
87 | 
88 | \item
89 | Difference between model and model\_phrase: model\_phrase was trained after
90 | preprocessing the tokens of the sentences in the corpus to group the ones
91 | that commonly occur together into phrases; model, in contrast, was trained
92 | directly on the individual tokens.
93 | 
94 | \item
95 | Three phrases closest to clinton: the top results are bush, reagan and gore,
96 | which consist of a single token; the top results consisting of two tokens are
97 | bill\_clinton, w\_bush and al\_gore.
98 | \end{itemize}
99 | 
100 | \item
101 | Find the closest word to the vector "vect(france) - vect(germany) + vect(berlin)" and
102 | report its similarity measure.
103 | 
104 | The closest word is 'paris', with a similarity of 0.757699728012085.
105 | 
106 | \newpage
107 | \item
108 | Explore the embedding space using these functions and report some interesting
109 | behaviour (of your choice).
110 | 
111 | \begin{itemize}
112 | \item
113 | We can further confirm that, in the corpus, the word apple is used more frequently
114 | in reference to the brand by checking its most similar words,
115 | which are: macintosh, atari, amiga, intel, ibm, pc.
116 | \item
117 | We can make analogies like:
118 | \begin{itemize}
119 | \item science - scientist + mathematician $\approx$ mathematics
120 | \item science - scientist + physicist $\approx$ physics
121 | \item science - scientist + philosopher $\approx$ philosophy
122 | \item science - scientist + astronomer $\approx$ astronomy
123 | \item science - scientist + biologist $\approx$ humanities
124 | \end{itemize}
125 | \end{itemize}
126 | 
127 | \end{enumerate}
128 | 
129 | \subsection{Sentence embeddings}
130 | 
131 | \begin{enumerate}
132 | \item
133 | Report the closest sentence to the sentence with idx "777", and the
134 | associated similarity score.
135 | 
136 | "gymnasts get ready for a competition ." with score 0.902949842134
137 | 
138 | \item
139 | Report the 5 closest sentences to the sentence with idx "777", and the
140 | associated similarity scores.
141 | 
142 | \begin{table}[h]
143 | \label{sample-table}
144 | \begin{center}
145 | \begin{tabular}{ll}
146 | \multicolumn{1}{c}{\bf Sentence} & \multicolumn{1}{c}{\bf Similarity score}
147 | \\ \hline \\
148 | gymnasts get ready for a competition . & 0.902949842134 \\
149 | a woman is getting ready to perform a song for the audience . & 0.890097422822 \\
150 | a runner in a competition want to go to the finish line . & 0.855536495002 \\
151 | men working to build a fence for customers . & 0.851471676783 \\
152 | a man prepares to give a speech for a television audience . & 0.849476121272 \\
153 | \end{tabular}
154 | \end{center}
155 | \end{table}
156 | \end{enumerate}
157 | 
158 | \subsection{IDF weighted sentence embeddings}
159 | 
160 | \begin{enumerate}
161 | \item
162 | Report the IDF score of the word "the", the word "a", and the word "clinton".
163 | 
164 | The word "the" has a score of 0.867762351618.
165 | 
166 | The word "a" has a score of 0.473266274881.
167 | 
168 | The word "clinton" doesn't have an IDF score since it is not present in the data.
169 | 
170 | \item
171 | Report the closest sentence to sentence with idx 777.
172 | 
173 | The closest sentence is "gymnasts get ready for a competition ." with a score of 0.897237646962.
174 | \end{enumerate}
175 | 
176 | \section{Simple LSTM for Sequence Classification}
177 | 
178 | \begin{enumerate}
179 | \item
180 | What is the (minibatch) shape of:
181 | 
182 | \begin{itemize}
183 | \item
184 | the input of the embedding layer: $32 \times 80$
185 | \item
186 | the input of the LSTM layer: $32 \times 80 \times 32$
187 | \item
188 | the output of the LSTM layer: $32 \times 64$
189 | \end{itemize}
190 | 
191 | \item
192 | Report the number of parameters of the model with the standard set of hyper-parameters.
193 | Report also the number of sentences in the training set. In standard statistics,
194 | a rule of thumb is to have fewer parameters than samples in your dataset.
195 | How do you think it is possible to train a model that has so many parameters
196 | compared to this number of samples?
197 | 
198 | \begin{itemize}
199 | \item
200 | Number of parameters: 480,000
201 | \item
202 | Number of sentences in the training set: 35,000
203 | \item
204 | It is possible because we add regularization to the model; in this particular case
205 | we use dropout, which can be seen as training an ensemble of similar models
206 | that are constrained to share weights.
207 | \end{itemize}
208 | 
209 | \item
210 | For a single sentence, the LSTM has states $h_1, \ldots, h_T$ where $T$ is
211 | the number of words in the sentence. The sentence embedding that is fed to
212 | the classifier is thus computed as $f(h_1, \ldots, h_T)$. What is the exact
213 | form of $f(h_1, \ldots, h_T)$ used in the python script?
214 | 
215 | Keras implements the LSTM according to the following recurrence, where the products
216 | involving gate activations and state vectors are element-wise:
217 | \begin{center}
218 | $f_t = \sigma(W_f x_t + U_f h_{t - 1} + b_f), \quad i_t = \sigma(W_i x_t + U_i h_{t - 1} + b_i)$
219 | 
220 | $C_t = f_t \odot C_{t - 1} + i_t \odot \tanh(W_c x_t + U_c h_{t - 1} + b_c)$
221 | 
222 | $o_t = \sigma(W_o x_t + U_o h_{t - 1} + b_o), \quad h_t = o_t \odot \tanh(C_t)$
223 | \end{center}
224 | 
225 | If the flag return\_sequences is set to False (which is the default), then
226 | $f(h_1,\ldots,h_T) = h_T$, i.e. only the last hidden state is fed to the classifier,
227 | which is what the script uses; if it is True, the layer returns the whole sequence $[h_1; \ldots; h_T]$.
228 | \item
229 | Plot the evolution of the train and valid accuracy per epoch, and write the
230 | test errors that you obtain.
231 | 
232 | \begin{figure}[ht]
233 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/lstm_no_dropout_loss_acc.png}
234 | \caption{Evolution of the loss and accuracy with no dropout.}
235 | \end{figure}
236 | 
237 | \begin{figure}[ht]
238 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/lstm_dropout_loss_acc.png}
239 | \caption{Evolution of the loss and accuracy with dropout.}
240 | \end{figure}
241 | 
242 | \begin{itemize}
243 | \item
244 | Results without dropout: loss = 0.656337736861, error = 18.0867\%
245 | 
246 | \item
247 | Results with dropout: loss = 0.377996407843, error = 16.6067\%
248 | \end{itemize}
249 | 
250 | \item
251 | Explain what is the difference between SGD and Adam.
252 | 
253 | SGD updates every parameter with the same global learning rate, applied directly to the
254 | current mini-batch gradient. Adam instead keeps an exponentially decaying average of the
255 | past gradients and of the past squared gradients, and divides the first by the square root
256 | of the second; each parameter therefore gets its own effective step size, which shrinks
257 | for the parameters that have received the largest gradients so far.
258 | \end{enumerate}
259 | 
260 | \section{Simple ConvNet for Sequence Classification}
261 | 
262 | \begin{enumerate}
263 | \item
264 | Report the results (test loss and test error) that you obtain.
265 | 
266 | Loss = 0.361414234543
267 | 
268 | Error = 16.3333\%
269 | 
270 | \begin{figure}[ht]
271 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/cnn_loss_acc.png}
272 | \caption{Evolution of the loss and accuracy with 1D Convolution.}
273 | \end{figure}
274 | 
275 | \item
276 | What is the input and output shape of Convolution1D?
277 | 
278 | Input shape: $32 \times 80 \times 16$
279 | 
280 | Output shape: $32 \times 78 \times 250$
281 | 
282 | \item
283 | Build a model where, on top of the convolution, you have an LSTM. This means
284 | that the input of the LSTM will be the output of your ConvNet. Run the model
285 | with the best parameters you find. Report your best results.
286 | 
287 | Loss = 0.338560722351
288 | 
289 | Error = 14.6733\%
290 | 
291 | \begin{figure}[ht]
292 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/cnn_lstm_loss_acc.png}
293 | \caption{Evolution of the loss and accuracy with LSTM on top of 1D Convolution.}
294 | \end{figure}
295 | \end{enumerate}
296 | 
297 | \end{document}
298 | 
--------------------------------------------------------------------------------
/report/nips15submit_e.sty:
--------------------------------------------------------------------------------
1 | %%%% NIPS Macros (LaTex)
2 | %%%% Style File
3 | %%%% Dec 12, 1990 Rev Aug 14, 1991; Sept, 1995; April, 1997; April, 1999
4 | 
5 | % This file can be used with Latex2e whether running in main mode, or
6 | % 2.09 compatibility mode.
7 | %
8 | % If using main mode, you need to include the commands
9 | % \documentclass{article}
10 | % \usepackage{nips10submit_e,times}
11 | % as the first lines in your document. Or, if you do not have Times
12 | % Roman font available, you can just use
13 | % \documentclass{article}
14 | % \usepackage{nips10submit_e}
15 | % instead.
16 | %
17 | % If using 2.09 compatibility mode, you need to include the command
18 | % \documentstyle[nips10submit_09,times]{article}
19 | % as the first line in your document. Or, if you do not have Times
20 | % Roman font available, you can include the command
21 | % \documentstyle[nips10submit_09]{article}
22 | % instead.
23 | 
24 | % Change the overall width of the page.
If these parameters are 25 | % changed, they will require corresponding changes in the 26 | % maketitle section. 27 | % 28 | \usepackage{eso-pic} % used by \AddToShipoutPicture 29 | 30 | \renewcommand{\topfraction}{0.95} % let figure take up nearly whole page 31 | \renewcommand{\textfraction}{0.05} % let figure take up nearly whole page 32 | 33 | % Define nipsfinal, set to true if nipsfinalcopy is defined 34 | \newif\ifnipsfinal 35 | \nipsfinalfalse 36 | \def\nipsfinalcopy{\nipsfinaltrue} 37 | \font\nipstenhv = phvb at 8pt % *** IF THIS FAILS, SEE nips10submit_e.sty *** 38 | 39 | % Specify the dimensions of each page 40 | 41 | \setlength{\paperheight}{11in} 42 | \setlength{\paperwidth}{8.5in} 43 | 44 | \oddsidemargin .5in % Note \oddsidemargin = \evensidemargin 45 | \evensidemargin .5in 46 | \marginparwidth 0.07 true in 47 | %\marginparwidth 0.75 true in 48 | %\topmargin 0 true pt % Nominal distance from top of page to top of 49 | %\topmargin 0.125in 50 | \topmargin -0.625in 51 | \addtolength{\headsep}{0.25in} 52 | \textheight 9.0 true in % Height of text (including footnotes & figures) 53 | \textwidth 5.5 true in % Width of text line. 54 | \widowpenalty=10000 55 | \clubpenalty=10000 56 | 57 | % \thispagestyle{empty} \pagestyle{empty} 58 | \flushbottom \sloppy 59 | 60 | % We're never going to need a table of contents, so just flush it to 61 | % save space --- suggested by drstrip@sandia-2 62 | \def\addcontentsline#1#2#3{} 63 | 64 | % Title stuff, taken from deproc. 65 | \def\maketitle{\par 66 | \begingroup 67 | \def\thefootnote{\fnsymbol{footnote}} 68 | \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} % for perfect author 69 | % name centering 70 | % The footnote-mark was overlapping the footnote-text, 71 | % added the following to fix this problem (MK) 72 | \long\def\@makefntext##1{\parindent 1em\noindent 73 | \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1} 74 | \@maketitle \@thanks 75 | \endgroup 76 | \setcounter{footnote}{0} 77 | \let\maketitle\relax \let\@maketitle\relax 78 | \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax} 79 | 80 | % The toptitlebar has been raised to top-justify the first page 81 | 82 | % Title (includes both anonimized and non-anonimized versions) 83 | \def\@maketitle{\vbox{\hsize\textwidth 84 | \linewidth\hsize \vskip 0.1in \toptitlebar \centering 85 | {\LARGE\bf \@title\par} \bottomtitlebar % \vskip 0.1in % minus 86 | \ifnipsfinal 87 | \def\And{\end{tabular}\hfil\linebreak[0]\hfil 88 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% 89 | \def\AND{\end{tabular}\hfil\linebreak[4]\hfil 90 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% 91 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\@author\end{tabular}% 92 | \else 93 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt} 94 | Anonymous Author(s) \\ 95 | Affiliation \\ 96 | Address \\ 97 | \texttt{email} \\ 98 | \end{tabular}% 99 | \fi 100 | \vskip 0.3in minus 0.1in}} 101 | 102 | \renewenvironment{abstract}{\vskip.075in\centerline{\large\bf 103 | Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex} 104 | 105 | % sections with less space 106 | \def\section{\@startsection {section}{1}{\z@}{-2.0ex plus 107 | -0.5ex minus -.2ex}{1.5ex plus 0.3ex 108 | minus0.2ex}{\large\bf\raggedright}} 109 | 110 | \def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus 111 | -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright}} 112 | \def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex 113 | plus -0.5ex minus -.2ex}{0.5ex plus 114 | .2ex}{\normalsize\bf\raggedright}} 115 | 
\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus 116 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 117 | \def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus 118 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 119 | \def\subsubsubsection{\vskip 120 | 5pt{\noindent\normalsize\rm\raggedright}} 121 | 122 | 123 | % Footnotes 124 | \footnotesep 6.65pt % 125 | \skip\footins 9pt plus 4pt minus 2pt 126 | \def\footnoterule{\kern-3pt \hrule width 12pc \kern 2.6pt } 127 | \setcounter{footnote}{0} 128 | 129 | % Lists and paragraphs 130 | \parindent 0pt 131 | \topsep 4pt plus 1pt minus 2pt 132 | \partopsep 1pt plus 0.5pt minus 0.5pt 133 | \itemsep 2pt plus 1pt minus 0.5pt 134 | \parsep 2pt plus 1pt minus 0.5pt 135 | \parskip .5pc 136 | 137 | 138 | %\leftmargin2em 139 | \leftmargin3pc 140 | \leftmargini\leftmargin \leftmarginii 2em 141 | \leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em 142 | 143 | %\labelsep \labelsep 5pt 144 | 145 | \def\@listi{\leftmargin\leftmargini} 146 | \def\@listii{\leftmargin\leftmarginii 147 | \labelwidth\leftmarginii\advance\labelwidth-\labelsep 148 | \topsep 2pt plus 1pt minus 0.5pt 149 | \parsep 1pt plus 0.5pt minus 0.5pt 150 | \itemsep \parsep} 151 | \def\@listiii{\leftmargin\leftmarginiii 152 | \labelwidth\leftmarginiii\advance\labelwidth-\labelsep 153 | \topsep 1pt plus 0.5pt minus 0.5pt 154 | \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt 155 | \itemsep \topsep} 156 | \def\@listiv{\leftmargin\leftmarginiv 157 | \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} 158 | \def\@listv{\leftmargin\leftmarginv 159 | \labelwidth\leftmarginv\advance\labelwidth-\labelsep} 160 | \def\@listvi{\leftmargin\leftmarginvi 161 | \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} 162 | 163 | \abovedisplayskip 7pt plus2pt minus5pt% 164 | \belowdisplayskip \abovedisplayskip 165 | \abovedisplayshortskip 0pt plus3pt% 166 | \belowdisplayshortskip 4pt plus3pt minus3pt% 167 | 168 | % Less leading in most fonts (due to the narrow columns) 169 | % The choices were between 1-pt and 1.5-pt leading 170 | %\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} % got rid of @ (MK) 171 | \def\normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} 172 | \def\small{\@setsize\small{10pt}\ixpt\@ixpt} 173 | \def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt} 174 | \def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt} 175 | \def\tiny{\@setsize\tiny{7pt}\vipt\@vipt} 176 | \def\large{\@setsize\large{14pt}\xiipt\@xiipt} 177 | \def\Large{\@setsize\Large{16pt}\xivpt\@xivpt} 178 | \def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt} 179 | \def\huge{\@setsize\huge{23pt}\xxpt\@xxpt} 180 | \def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt} 181 | 182 | \def\toptitlebar{\hrule height4pt\vskip .25in\vskip-\parskip} 183 | 184 | \def\bottomtitlebar{\vskip .29in\vskip-\parskip\hrule height1pt\vskip 185 | .09in} % 186 | %Reduced second vskip to compensate for adding the strut in \@author 187 | 188 | % Vertical Ruler 189 | % This code is, largely, from the CVPR 2010 conference style file 190 | % ----- define vruler 191 | \makeatletter 192 | \newbox\nipsrulerbox 193 | \newcount\nipsrulercount 194 | \newdimen\nipsruleroffset 195 | \newdimen\cv@lineheight 196 | \newdimen\cv@boxheight 197 | \newbox\cv@tmpbox 198 | \newcount\cv@refno 199 | \newcount\cv@tot 200 | % NUMBER with left flushed zeros \fillzeros[] 201 | \newcount\cv@tmpc@ \newcount\cv@tmpc 202 | \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi 203 | \cv@tmpc=1 % 204 | \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ 
by 10 \advance\cv@tmpc by 1 \fi 205 | \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat 206 | \ifnum#2<0\advance\cv@tmpc1\relax-\fi 207 | \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat 208 | \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% 209 | % \makevruler[][][][][] 210 | \def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip 211 | \textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% 212 | \global\setbox\nipsrulerbox=\vbox to \textheight{% 213 | {\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight 214 | \cv@lineheight=#1\global\nipsrulercount=#2% 215 | \cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% 216 | \cv@refno1\vskip-\cv@lineheight\vskip1ex% 217 | \loop\setbox\cv@tmpbox=\hbox to0cm{{\nipstenhv\hfil\fillzeros[#4]\nipsrulercount}}% 218 | \ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break 219 | \advance\cv@refno1\global\advance\nipsrulercount#3\relax 220 | \ifnum\cv@refno<\cv@tot\repeat}}\endgroup}% 221 | \makeatother 222 | % ----- end of vruler 223 | 224 | % \makevruler[][][][][] 225 | \def\nipsruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\nipsrulerbox}} 226 | \AddToShipoutPicture{% 227 | \ifnipsfinal\else 228 | \nipsruleroffset=\textheight 229 | \advance\nipsruleroffset by -3.7pt 230 | \color[rgb]{.7,.7,.7} 231 | \AtTextUpperLeft{% 232 | \put(\LenToUnit{-35pt},\LenToUnit{-\nipsruleroffset}){%left ruler 233 | \nipsruler{\nipsrulercount}} 234 | } 235 | \fi 236 | } 237 | -------------------------------------------------------------------------------- /sentenceclassification/cnn_imdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | '''This example demonstrates the use of Convolution1D for text classification. 3 | 90s/epoch on Intel i5 2.4Ghz CPU. 4 | 10s/epoch on Tesla K40 GPU. 5 | ''' 6 | 7 | ''' 8 | Simple Convolution1D for Sequence Classification 9 | ''' 10 | 11 | ########################## 12 | ## Importing packages 13 | ########################## 14 | # importing packages/function that will be useful later 15 | import numpy as np 16 | np.random.seed(1234) # for reproducibility (manually setting random seed) 17 | 18 | from keras.preprocessing import sequence 19 | from keras.models import Sequential 20 | from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, SimpleRNN, GRU 21 | from keras.layers import Convolution1D, GlobalMaxPooling1D 22 | from utils import load_imdb 23 | 24 | import matplotlib.pyplot as plt 25 | 26 | ########################## 27 | ## Preparing data 28 | ########################## 29 | # some parameters 30 | vocab_size = 5000 # number of words considered in the vocabulary 31 | train_split = 0.7 # ratio of train sentences 32 | 33 | # Preparing data is usually the most time-consuming part of machine learning. 34 | # Luckily for you, the imdb dataset has already been preprocessed and included in Keras. 35 | (X_train, y_train), (X_test, y_test) = load_imdb(nb_words=vocab_size, train_split=train_split) 36 | 37 | print(len(X_train), 'train sequences') 38 | print(len(X_test), 'test sequences') 39 | 40 | ## Padding input data 41 | # Models in Keras (and elsewhere) usually take as input batches of sentences of the same length. 42 | # Since sentences usually have different sizes, we "pad" sentences (we add a dummy "padding" token at the end of the 43 | # sentences. 
The input thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence 44 | # in the batch. 45 | 46 | maxlen = 80 # cut texts after this number of words (among top vocab_size most common words) 47 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 48 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 49 | print('X_train shape:', X_train.shape) 50 | print('X_test shape:', X_test.shape) 51 | 52 | ########################## 53 | ## Building model 54 | ########################## 55 | 56 | embed_dim = 16 57 | nhid = 128 58 | print('\nBuilding model...') 59 | 60 | nb_filter = 250 61 | filter_length = 3 62 | hidden_dims = 250 63 | 64 | model = Sequential() 65 | # we start off with an efficient embedding layer which maps 66 | # our vocab indices into embedding_dims dimensions 67 | model.add(Embedding(vocab_size, 68 | embed_dim, 69 | input_length=maxlen, 70 | dropout=0.2)) 71 | 72 | # we add a Convolution1D, which will learn nb_filter 73 | # word group filters of size filter_length: 74 | model.add(Convolution1D(nb_filter=nb_filter, 75 | filter_length=filter_length, 76 | border_mode='valid', 77 | activation='relu', 78 | subsample_length=1)) 79 | # we use temporal max pooling: 80 | model.add(GlobalMaxPooling1D()) 81 | 82 | # We add a classifier (MLP with one hidden layer) 83 | model.add(Dense(hidden_dims)) 84 | model.add(Dropout(0.2)) 85 | model.add(Activation('relu')) 86 | model.add(Dense(1)) 87 | model.add(Activation('sigmoid')) 88 | 89 | 90 | ########################## 91 | ## Define (i) loss function 92 | # (ii) optimizer 93 | # (iii) metrics 94 | ########################## 95 | 96 | loss_classif = 'binary_crossentropy' 97 | optimizer = 'adam' # or sgd 98 | metrics_classif = ['accuracy'] 99 | 100 | model.compile(loss=loss_classif, 101 | optimizer=optimizer, 102 | metrics=metrics_classif) 103 | 104 | print(model.summary()) 105 | print('Built model') 106 | ########################## 107 | ## Train Model 108 | ########################## 109 | validation_split = 0.2 # Held-out ("validation") data to test on. 
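# Added note: with Keras's validation_split, the held-out examples are taken from the *last*
# fraction of X_train/y_train (before any shuffling); they are only used to compute
# val_loss/val_acc after each epoch and never contribute to the weight updates.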
110 | batch_size = 32 # size of the minibach (each batch will contain 32 sentences) 111 | nb_epoch = 6 112 | 113 | print('\n\nStarting training of the model\n') 114 | history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.2) 115 | 116 | plt.figure(1) 117 | plt.subplot(1,2,1) 118 | plt.plot(range(1,nb_epoch + 1), history.history['loss'], 'b', range(1,nb_epoch + 1), history.history['val_loss'], 'r') 119 | plt.ylabel('loss') 120 | plt.xlabel('epoch') 121 | plt.legend(['train', 'validation'], loc='upper left') 122 | 123 | plt.subplot(1,2,2) 124 | plt.plot(range(1,nb_epoch + 1), history.history['acc'], 'b', range(1,nb_epoch + 1), history.history['val_acc'], 'r') 125 | plt.ylabel('accuracy') 126 | plt.xlabel('epoch') 127 | plt.legend(['train', 'validation'], loc='upper left') 128 | plt.show() 129 | 130 | ########################## 131 | ## Evaluate on test set 132 | ########################## 133 | # evaluate model on test set (never seen during training) 134 | score, acc = model.evaluate(X_test, y_test, 135 | batch_size=batch_size) 136 | print('\n\nTest score:', score) 137 | print('Test accuracy:', acc) 138 | -------------------------------------------------------------------------------- /sentenceclassification/cnn_lstm_imdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | ########################## 4 | ## Importing packages 5 | ########################## 6 | import numpy as np 7 | np.random.seed(1234) # for reproducibility (manually setting random seed) 8 | 9 | from keras.preprocessing import sequence 10 | from keras.models import Sequential 11 | from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, SimpleRNN, GRU 12 | from keras.layers import Convolution1D 13 | from keras.optimizers import SGD, Adam 14 | from utils import load_imdb 15 | 16 | import matplotlib.pyplot as plt 17 | 18 | ########################## 19 | ## Preparing data 20 | ########################## 21 | vocab_size = 5000 # number of words considered in the vocabulary 22 | train_split = 0.7 # ratio of train sentences 23 | 24 | (X_train, y_train), (X_test, y_test) = load_imdb(nb_words=vocab_size, train_split=train_split) 25 | print(len(X_train), 'train sequences') 26 | print(len(X_test), 'test sequences') 27 | 28 | ## Padding input data 29 | maxlen = 80 # cut texts after this number of words (among top vocab_size most common words) 30 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 31 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 32 | print('X_train shape:', X_train.shape) 33 | print('X_test shape:', X_test.shape) 34 | 35 | ########################## 36 | ## Building model 37 | ########################## 38 | embed_dim = 25 39 | nhid = 128 40 | nb_filter = 250 41 | filter_length = 3 42 | hidden_dims = 250 43 | print('\nBuilding model...') 44 | 45 | model = Sequential() 46 | model.add(Embedding(vocab_size, 47 | embed_dim, 48 | input_length=maxlen, 49 | dropout=0.3)) 50 | model.add(Convolution1D(nb_filter=nb_filter, 51 | filter_length=filter_length, 52 | border_mode='valid', 53 | activation='relu', 54 | subsample_length=1)) 55 | model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2)) 56 | 57 | model.add(Dense(hidden_dims)) 58 | model.add(Dropout(0.5)) 59 | model.add(Activation('relu')) 60 | model.add(Dense(1)) 61 | model.add(Activation('sigmoid')) 62 | 63 | ########################## 64 | ## Define (i) loss function 65 | # (ii) optimizer 66 | # (iii) metrics 67 | 
########################## 68 | loss_classif = 'binary_crossentropy' 69 | optimizer = 'adam' # or sgd 70 | metrics_classif = ['accuracy'] 71 | 72 | model.compile(loss=loss_classif, 73 | optimizer=optimizer, 74 | metrics=metrics_classif) 75 | 76 | print(model.summary()) 77 | print('Built model') 78 | 79 | ########################## 80 | ## Train Model 81 | ########################## 82 | validation_split = 0.2 # Held-out ("validation") data to test on. 83 | batch_size = 64 # size of the minibach (each batch will contain 32 sentences) 84 | nb_epoch = 7 85 | 86 | print('\n\nStarting training of the model\n') 87 | history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.2) 88 | 89 | plt.figure(1) 90 | plt.subplot(1,2,1) 91 | plt.plot(range(1,nb_epoch + 1), history.history['loss'], 'b', range(1,nb_epoch + 1), history.history['val_loss'], 'r') 92 | plt.ylabel('loss') 93 | plt.xlabel('epoch') 94 | plt.legend(['train', 'validation'], loc='upper left') 95 | 96 | plt.subplot(1,2,2) 97 | plt.plot(range(1,nb_epoch + 1), history.history['acc'], 'b', range(1,nb_epoch + 1), history.history['val_acc'], 'r') 98 | plt.ylabel('accuracy') 99 | plt.xlabel('epoch') 100 | plt.legend(['train', 'validation'], loc='upper left') 101 | plt.show() 102 | 103 | ########################## 104 | ## Evaluate on test set 105 | ########################## 106 | # evaluate model on test set (never seen during training) 107 | score, acc = model.evaluate(X_test, y_test, 108 | batch_size=batch_size) 109 | print('\n\nTest score:', score) 110 | print('Test accuracy:', acc) 111 | -------------------------------------------------------------------------------- /sentenceclassification/lstm_imdb.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Simple LSTM for Sequence Classification 3 | ''' 4 | 5 | ########################## 6 | ## Importing packages 7 | ########################## 8 | # importing packages/function that will be useful later 9 | from __future__ import print_function 10 | import numpy as np 11 | np.random.seed(1234) # for reproducibility (manually setting random seed) 12 | 13 | from keras.preprocessing import sequence 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, SimpleRNN, GRU 16 | from utils import load_imdb 17 | 18 | import matplotlib.pyplot as plt 19 | 20 | ########################## 21 | ## Preparing data 22 | ########################## 23 | # some parameters 24 | vocab_size = 15000 # number of words considered in the vocabulary 25 | train_split = 0.7 # ratio of train sentences 26 | 27 | # Preparing data is usually the most time-consuming part of machine learning. 28 | # Luckily for you, the imdb dataset has already been preprocessed and included in Keras. 29 | (X_train, y_train), (X_test, y_test) = load_imdb(nb_words=vocab_size, train_split=train_split) 30 | 31 | print(len(X_train), 'train sequences') 32 | print(len(X_test), 'test sequences') 33 | 34 | ## Padding input data 35 | # Models in Keras (and elsewhere) usually take as input batches of sentences of the same length. 36 | # Since sentences usually have different sizes, we "pad" sentences (we add a dummy "padding" token at the end of the 37 | # sentences. The input thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence 38 | # in the batch. 
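# Added note: by default sequence.pad_sequences pads with zeros at the *beginning* of each
# review (padding='pre') and, for reviews longer than maxlen, keeps only the last maxlen
# word indices (truncating='pre'); e.g. pad_sequences([[5, 9, 3]], maxlen=5) -> [[0, 0, 5, 9, 3]].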
39 | 
40 | maxlen = 80  # cut texts after this number of words (among top vocab_size most common words)
41 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
42 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
43 | print('X_train shape:', X_train.shape)
44 | print('X_test shape:', X_test.shape)
45 | 
46 | ##########################
47 | ## Building model
48 | ##########################
49 | 
50 | embed_dim = 32  # word embedding dimension
51 | nhid = 64  # number of hidden units in the LSTM
52 | print('\nBuilding model...')
53 | 
54 | model = Sequential()
55 | if False:  # True: plain embedding + LSTM; False (current setting): use dropout
56 |     model.add(Embedding(vocab_size, embed_dim))
57 |     model.add(LSTM(nhid))
58 | else:
59 |     model.add(Embedding(vocab_size, embed_dim, dropout=0.2))
60 |     model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2))
61 | model.add(Dense(1))
62 | model.add(Activation('sigmoid'))
63 | 
64 | print('Built model')
65 | 
66 | # In Keras, Torch and other deep learning frameworks, we create a "container", which is the Sequential() module.
67 | # Then we add components to this container: the lookup table, the LSTM, the classifier, etc.
68 | # All of these components are contained in the Sequential() and are trained together.
69 | 
70 | ##########################
71 | ## Define (i) loss function
72 | #        (ii) optimizer
73 | #        (iii) metrics
74 | ##########################
75 | 
76 | loss_classif = 'binary_crossentropy'
77 | optimizer = 'adam'  # or sgd
78 | metrics_classif = ['accuracy']
79 | 
80 | # note that this is especially easy in Keras: one code line
81 | print('\nCompiling model')
82 | model.compile(loss=loss_classif,
83 |               optimizer=optimizer,
84 |               metrics=metrics_classif)
85 | print(model.summary())
86 | print('Compiled model')
87 | 
88 | ##########################
89 | ## Train Model
90 | ##########################
91 | validation_split = 0.2  # Held-out ("validation") data to test on.
92 | batch_size = 32  # size of the minibatch (each batch will contain 32 sentences)
93 | nb_epoch = 6
94 | 
95 | # history is just an object that contains information about training.
96 | # Look at the following line and enjoy how simple it is to train a neural network in Keras.
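# Added note: history.history is a dict mapping each tracked metric to a list with one value
# per epoch; with metrics=['accuracy'] and a validation split, its keys here are 'loss',
# 'acc', 'val_loss' and 'val_acc', which are exactly the series plotted below.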
97 | print('\n\nStarting training of the model\n') 98 | history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=validation_split) 99 | 100 | plt.figure(1) 101 | plt.subplot(1,2,1) 102 | plt.plot(range(1,nb_epoch + 1), history.history['loss'], 'b', range(1,nb_epoch + 1), history.history['val_loss'], 'r') 103 | plt.ylabel('loss') 104 | plt.xlabel('epoch') 105 | plt.legend(['train', 'validation'], loc='upper left') 106 | 107 | plt.subplot(1,2,2) 108 | plt.plot(range(1,nb_epoch + 1), history.history['acc'], 'b', range(1,nb_epoch + 1), history.history['val_acc'], 'r') 109 | plt.ylabel('accuracy') 110 | plt.xlabel('epoch') 111 | plt.legend(['train', 'validation'], loc='upper left') 112 | plt.show() 113 | 114 | ########################## 115 | ## Evaluate on test set 116 | ########################## 117 | # evaluate model on test set (never seen during training) 118 | score, acc = model.evaluate(X_test, y_test, 119 | batch_size=batch_size) 120 | print('\n\nTest loss:', score) 121 | print('Test accuracy:', acc) 122 | -------------------------------------------------------------------------------- /sentenceclassification/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.datasets import imdb 3 | 4 | 5 | def load_imdb(nb_words, train_split=0.8): 6 | print 'Preparing IMDB-review sentence classification dataset with {0} % training data ...'.format(train_split*100) 7 | (X_1, y_1), (X_2, y_2) = imdb.load_data(nb_words=nb_words) 8 | X = np.array([x for x in X_1] + [x for x in X_2]) 9 | Y = np.array([y for y in y_1] + [y for y in y_2]) 10 | X_train, y_train = X[:int(train_split * len(X))], Y[:int(train_split * len(Y))] 11 | X_test, y_test = X[int(train_split * len(X)):], Y[int(train_split * len(Y)):] 12 | 13 | return (X_train, y_train), (X_test, y_test) -------------------------------------------------------------------------------- /word2vec/embedding_word2vec.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | from operator import itemgetter 3 | from random import randint 4 | 5 | import numpy as np 6 | import logging 7 | reload(logging) 8 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S') 9 | 10 | from gensim.models import word2vec 11 | 12 | 13 | def avg_word2vec(model, dataset='data/snli.test'): 14 | array_sentences = [] 15 | array_embeddings = [] 16 | with open(dataset) as f: 17 | for line in f: 18 | avgword2vec = None 19 | cont = 0 20 | for word in line.split(): 21 | # get embedding (if it exists) of each word in the sentence 22 | if word in model.wv.vocab: 23 | cont += 1 24 | if avgword2vec is None: 25 | avgword2vec = model[word] 26 | else: 27 | avgword2vec = avgword2vec + model[word] 28 | # if at least one word in the sentence has a word embeddings : 29 | if avgword2vec is not None: 30 | avgword2vec = avgword2vec / cont # normalize sum 31 | array_sentences.append(line) 32 | array_embeddings.append(avgword2vec) 33 | print 'avg_word2vec: Generated embeddings for {0} sentences from {1} dataset.'.format(len(array_sentences), dataset) 34 | return array_sentences, array_embeddings 35 | 36 | 37 | def cosine_similarity(a, b): 38 | assert len(a) == len(b), 'vectors need to have the same size' 39 | cos_sim = a.dot(b) / sqrt(a.dot(a)) / sqrt(b.dot(b)) 40 | return cos_sim 41 | 42 | 43 | def most_similar(idx, array_embeddings, array_sentences): 44 | query_sentence = 
array_sentences[idx] 45 | query_embed = array_embeddings[idx] 46 | list_scores = {} 47 | for i in range(idx) + range(idx + 1, len(array_sentences)): 48 | list_scores[i] = cosine_similarity(query_embed, array_embeddings[i]) 49 | closest_idx = max(list_scores, key=list_scores.get) 50 | 51 | print 'The query :\n' 52 | print query_sentence + '\n' 53 | print 'is most similar to\n' 54 | print array_sentences[closest_idx] 55 | print 'with a score of : {0}\n'.format(list_scores[closest_idx]) 56 | 57 | print '5 most similar sentences:' 58 | closest_5 = sorted(list_scores.iteritems(), key=itemgetter(1), reverse=True)[:5] 59 | for i, score in closest_5: 60 | print array_sentences[i], score 61 | 62 | return closest_idx 63 | 64 | def most_5_similar(idx, array_embeddings, array_sentences): 65 | query_sentence = array_sentences[idx] 66 | query_embed = array_embeddings[idx] 67 | list_scores = {} 68 | for i in range(idx) + range(idx + 1, len(array_sentences)): 69 | list_scores[i] = cosine_similarity(query_embed, array_embeddings[i]) 70 | 71 | closest_5 = sorted(list_scores.iteritems(), key=itemgetter(1), reverse=True)[:5] 72 | closest_5_idx = [i for i, score in closest_5] 73 | 74 | assert len(closest_5_idx) == 5 75 | 76 | return closest_5_idx 77 | 78 | 79 | def IDF(dataset='data/snli.test'): 80 | # Compute IDF (Inverse Document Frequency). Here a "document" is a sentence. 81 | # word2idf['peach'] = IDF(peach) 82 | df = {} 83 | N = 0 84 | with open(dataset) as f: 85 | for line in f: 86 | N += 1 87 | sentence = line.split() 88 | sentence = np.unique(sentence) 89 | for word in sentence: 90 | if word in df: 91 | df[word] += 1 92 | else: 93 | df[word] = 1 94 | 95 | word2idf = {} 96 | for k,v in df.iteritems(): 97 | word2idf[k] = np.log(float(N) / v) 98 | 99 | return word2idf 100 | 101 | def avg_word2vec_idf(model, word2idf, dataset='data/snli.test'): 102 | array_sentences = [] 103 | array_embeddings = [] 104 | with open(dataset) as f: 105 | for line in f: 106 | avgword2vec = None 107 | sumidf = 0 108 | for word in line.split(): 109 | # get embedding (if it exists) of each word in the sentence 110 | if word in model.wv.vocab: 111 | sumidf += word2idf[word] 112 | if avgword2vec is None: 113 | avgword2vec = word2idf[word] * model[word] 114 | else: 115 | avgword2vec = avgword2vec + word2idf[word] * model[word] 116 | # if at least one word in the sentence has a word embeddings : 117 | if avgword2vec is not None: 118 | avgword2vec = avgword2vec / sumidf # normalize sum 119 | array_sentences.append(line) 120 | array_embeddings.append(avgword2vec) 121 | print 'avg_word2vec_idf: Generated embeddings for {0} sentences from {1} dataset.'.format(len(array_sentences), dataset) 122 | return array_sentences, array_embeddings 123 | 124 | if __name__ == "__main__": 125 | 126 | if False: # FIRST PART 127 | sentences = word2vec.Text8Corpus('data/text8') 128 | 129 | # Train a word2vec model 130 | embedding_size = 200 131 | model = word2vec.Word2Vec(sentences, size=embedding_size) 132 | 133 | # Train a word2vec model with phrases 134 | bigram_transformer = gensim.models.Phrases(sentences) 135 | model_phrase = Word2Vec(bigram_transformer[sentences], size=200) 136 | else: 137 | # Loading model trained on words 138 | model = word2vec.Word2Vec.load('models/text8.model') 139 | 140 | # Loading model enhanced with phrases (2-grams) 141 | model_phrase = word2vec.Word2Vec.load('models/text8.phrase.model') 142 | 143 | """ 144 | SECOND PART: Investigating word2vec word embeddings space 145 | """ 146 | 147 | # Words that are similar are close 
in the sense of the cosine similarity. 148 | sim = model.similarity('woman', 'man') 149 | print 'Printing word similarity between "woman" and "man" : {0}'.format(sim) 150 | 151 | sim = model.similarity('apple', 'mac') 152 | print 'Printing word similarity between "apple" and "mac" : {0}'.format(sim) 153 | 154 | sim = model.similarity('apple', 'peach') 155 | print 'Printing word similarity between "apple" and "peach" : {0}'.format(sim) 156 | 157 | sim = model.similarity('banana', 'peach') 158 | print 'Printing word similarity between "banana" and "peach" : {0}'.format(sim) 159 | 160 | # And words that appear in the same context have similar word embeddings. 161 | print model.most_similar(['paris'])[0] 162 | print model_phrase.most_similar(['paris'])[0] 163 | 164 | words = ['apple', 'peach', 'banana', 'car'] 165 | 166 | for word in words: 167 | print word 168 | print model.most_similar([word]) 169 | 170 | print "science - scientist + mathematician" 171 | print model.most_similar(positive=['science', 'mathematician'], negative=['scientist'])[0] 172 | print "science - scientist + physicist" 173 | print model.most_similar(positive=['science', 'physicist'], negative=['scientist'])[0] 174 | print "science - scientist + philosopher" 175 | print model.most_similar(positive=['science', 'philosopher'], negative=['scientist'])[0] 176 | print "science - scientist + astronomer" 177 | print model.most_similar(positive=['science', 'astronomer'], negative=['scientist'])[0] 178 | print "science - scientist + biologist" 179 | print model.most_similar(positive=['science', 'biologist'], negative=['scientist'])[0] 180 | 181 | print model.most_similar(['difficult']) 182 | print model_phrase.most_similar(['difficult']) 183 | 184 | print model_phrase.most_similar(['clinton']) 185 | 186 | # Compositionality and structure in word2vec space 187 | print model.most_similar(positive=['woman', 'king'], negative=['man'])[0] 188 | 189 | print model.most_similar(positive=['france', 'berlin'], negative=['germany'])[0] 190 | 191 | """ 192 | THIRD PART: Sentence embeddings with average(word2vec) 193 | """ 194 | data_path = 'data/snli.test' 195 | array_sentences, array_embeddings = avg_word2vec(model, dataset=data_path) 196 | 197 | query_idx = 777 # random sentence 198 | assert query_idx < len(array_sentences) # little check 199 | 200 | # array_sentences[closest_idx] will be the closest sentence to array_sentences[query_idx]. 201 | closest_idx = most_similar(query_idx, array_embeddings, array_sentences) 202 | 203 | closest_5_idx = most_5_similar(query_idx, array_embeddings, array_sentences) 204 | 205 | print 'Most 5 similar:\n' 206 | for idx in closest_5_idx: 207 | print array_sentences[idx] 208 | 209 | """ 210 | FOURTH PART: Weighted average of word vectors with IDF. 211 | """ 212 | word2idf = IDF(data_path) 213 | 214 | words = ['the', 'a' , 'clinton', 'woman', 'man', 'apple', 'peach', 'banana', 'mac', 'paris', 'france'] 215 | 216 | for word in words: 217 | if word in word2idf: 218 | print word, word2idf[word] 219 | else: 220 | print word, "not found" 221 | 222 | array_sentences_idf, array_embeddings_idf = avg_word2vec_idf(model, word2idf, dataset=data_path) 223 | closest_idx_idf = most_similar(query_idx, array_embeddings_idf, array_sentences_idf) 224 | --------------------------------------------------------------------------------
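A possible follow-up to the FOURTH PART of word2vec/embedding_word2vec.py (not part of the original script, sketched here for illustration) is to also print the five closest sentences under the IDF-weighted embeddings, mirroring what the THIRD PART does for the plain average. It only reuses the most_5_similar helper defined above and assumes the script has already built query_idx, array_sentences_idf and array_embeddings_idf:

    closest_5_idx_idf = most_5_similar(query_idx, array_embeddings_idf, array_sentences_idf)

    print 'Most 5 similar (IDF-weighted):\n'
    for idx in closest_5_idx_idf:
        print array_sentences_idf[idx]

As in the unweighted case, the corresponding scores can be recovered with cosine_similarity(array_embeddings_idf[query_idx], array_embeddings_idf[idx]) if needed.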