├── .gitignore
├── LICENSE
├── report
│   ├── img
│   │   ├── cnn_loss_acc.png
│   │   ├── cnn_lstm_loss_acc.png
│   │   ├── lstm_dropout_loss_acc.png
│   │   └── lstm_no_dropout_loss_acc.png
│   ├── main.pdf
│   ├── main.tex
│   └── nips15submit_e.sty
├── sentenceclassification
│   ├── cnn_imdb.py
│   ├── cnn_lstm_imdb.py
│   ├── lstm_imdb.py
│   └── utils.py
└── word2vec
    └── embedding_word2vec.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Mario Ynocente Castro
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/report/img/cnn_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/cnn_loss_acc.png
--------------------------------------------------------------------------------
/report/img/cnn_lstm_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/cnn_lstm_loss_acc.png
--------------------------------------------------------------------------------
/report/img/lstm_dropout_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/lstm_dropout_loss_acc.png
--------------------------------------------------------------------------------
/report/img/lstm_no_dropout_loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/img/lstm_no_dropout_loss_acc.png
--------------------------------------------------------------------------------
/report/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marioyc/Sentence-Classification/2680d4855a4de9dc4a8f9c4df0de7820077f5780/report/main.pdf
--------------------------------------------------------------------------------
/report/main.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article} % For LaTeX2e
2 | \usepackage{nips15submit_e,times}
3 | %\usepackage{hyperref}
4 | %\usepackage{url}
5 | \usepackage{amsmath}
6 | %\newcommand\addtag{\refstepcounter{equation}\tag{\theequation}}
7 | \usepackage{graphicx}
8 | 
9 | \title{Word and sentence embeddings\\
10 | Sentence Classification with LSTMs/ConvNets}
11 | \author{
12 | Mario Ynocente Castro\\
13 | Master MVA\\
14 | ENS Cachan / Ecole Polytechnique\\
15 | \texttt{mario.ynocente-castro@polytechnique.edu}
16 | }
17 | 
18 | % The \author macro works with any number of authors. There are two commands
19 | % used to separate the names and addresses of multiple authors: \And and \AND.
20 | %
21 | % Using \And between authors leaves it to \LaTeX{} to determine where to break
22 | % the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
23 | % puts 3 of 4 authors names on the first line, and the last on the second
24 | % line, try using \AND instead of \And before the third author name.
25 | 
26 | \newcommand{\fix}{\marginpar{FIX}}
27 | \newcommand{\new}{\marginpar{NEW}}
28 | 
29 | \nipsfinalcopy % Uncomment for camera-ready version
30 | 
31 | \begin{document}
32 | \maketitle
33 | 
34 | \section{Word and sentence embeddings with word2vec}
35 | 
36 | \subsection{Loading models}
37 | 
38 | \begin{enumerate}
39 | \item
40 | What is the total number of raw words found in the corpus?
41 | 
42 | 17,005,207 words.
43 | 
44 | \item
45 | What is the number of words retained in the word2vec vocabulary
46 | (with default min\_count = 5)?
47 | 
48 | 71,290 words.
49 | \end{enumerate}
50 | 
51 | \subsection{Exploring the embedding space}
52 | 
53 | \begin{enumerate}
54 | \item
55 | What is the similarity between ('apple' and 'mac'), between ('apple' and 'peach'),
56 | between ('banana' and 'peach')? In your opinion, why are you asked about
57 | the three previous examples?
58 | 
59 | \begin{itemize}
60 | \item
61 | Similarity between apple and mac: 0.567861632452
62 | 
63 | \item
64 | Similarity between apple and peach: 0.178399832237
65 | 
66 | \item
67 | Similarity between banana and peach: 0.688715470006
68 | \end{itemize}
69 | 
70 | This illustrates that in this embedding space apple and mac are more similar than
71 | apple and peach, even though the latter two are both fruits, whereas banana and peach
72 | do get a high similarity. This probably happens because in the corpus the word apple
73 | refers more often to the brand than to the fruit, so it is more common to find it
74 | in the same contexts as the word mac.
75 | 
76 | \item
77 | What is the closest word to the word 'difficult', for 'model' and 'model\_phrase'.
78 | Comment about the difference between model and model\_phrase. Find the three phrases
79 | that are closest to the word 'clinton'.
80 | 
81 | \begin{itemize}
82 | \item
83 | Closest word to difficult for model: easy
84 | 
85 | \item
86 | Closest word to difficult for model\_phrase: very\_difficult
87 | 
88 | \item
89 | Difference between model and model\_phrase: model\_phrase was trained after
90 | preprocessing the tokens of the sentences in the corpus to group the ones
91 | that commonly occur together into phrases; model, in contrast, was trained
92 | directly on the individual tokens.
93 | 
94 | \item
95 | Three phrases closest to clinton: the top results are bush, reagan and gore,
96 | which consist of a single token; the top results consisting of two tokens are
97 | bill\_clinton, w\_bush and al\_gore.
98 | \end{itemize}
99 | 
100 | \item
101 | Find the closest word to the vector "vect(france) - vect(germany) + vect(berlin)" and
102 | report its similarity measure.
103 | 
104 | The closest word is 'paris', with a similarity of 0.757699728012085.
105 | 
106 | \newpage
107 | \item
108 | Explore the embedding space using these functions and report some interesting
109 | behaviour (of your choice).
110 | 
111 | \begin{itemize}
112 | \item
113 | We can further confirm that, in the corpus, the word apple is used more frequently
114 | in reference to the brand by checking its most similar words,
115 | which are: macintosh, atari, amiga, intel, ibm, pc.
116 | \item
117 | We can make analogies like:
118 | \begin{itemize}
119 | \item science - scientist + mathematician $\approx$ mathematics
120 | \item science - scientist + physicist $\approx$ physics
121 | \item science - scientist + philosopher $\approx$ philosophy
122 | \item science - scientist + astronomer $\approx$ astronomy
123 | \item science - scientist + biologist $\approx$ humanities
124 | \end{itemize}
125 | \end{itemize}
126 | 
127 | \end{enumerate}
128 | 
129 | \subsection{Sentence embeddings}
130 | 
131 | \begin{enumerate}
132 | \item
133 | Report the closest sentence to the sentence with idx "777", and the
134 | associated similarity score.
135 | 
136 | "gymnasts get ready for a competition ." with score 0.902949842134
137 | 
138 | \item
139 | Report the 5 closest sentences to the sentence with idx "777", and the
140 | associated similarity scores.
141 | 
142 | \begin{table}[h]
143 | \label{sample-table}
144 | \begin{center}
145 | \begin{tabular}{ll}
146 | \multicolumn{1}{c}{\bf Sentence} & \multicolumn{1}{c}{\bf Similarity score}
147 | \\ \hline \\
148 | gymnasts get ready for a competition . & 0.902949842134 \\
149 | a woman is getting ready to perform a song for the audience . & 0.890097422822 \\
150 | a runner in a competition want to go to the finish line . & 0.855536495002 \\
151 | men working to build a fence for customers . & 0.851471676783 \\
152 | a man prepares to give a speech for a television audience . & 0.849476121272 \\
153 | \end{tabular}
154 | \end{center}
155 | \end{table}
156 | \end{enumerate}
157 | 
158 | \subsection{IDF weighted sentence embeddings}
159 | 
160 | \begin{enumerate}
161 | \item
162 | Report the IDF score of the word "the", the word "a", and the word "clinton".
163 | 
164 | The word "the" has a score of 0.867762351618.
165 | 
166 | The word "a" has a score of 0.473266274881.
167 | 
168 | The word "clinton" doesn't have an IDF score since it is not present in the data.
169 | 
170 | \item
171 | Report the closest sentence to sentence with idx 777.
172 | 
173 | The closest sentence is "gymnasts get ready for a competition ." with a score of 0.897237646962.
174 | \end{enumerate}
175 | 
176 | \section{Simple LSTM for Sequence Classification}
177 | 
178 | \begin{enumerate}
179 | \item
180 | What is the (minibatch) shape of:
181 | 
182 | \begin{itemize}
183 | \item
184 | the input of the embedding layer: $32 \times 80$
185 | \item
186 | the input of the LSTM layer: $32 \times 80 \times 32$
187 | \item
188 | the output of the LSTM layer: $32 \times 64$
189 | \end{itemize}
190 | 
191 | \item
192 | Report the number of parameters of the model with the standard set of hyper-parameters.
193 | Report also the number of sentences in the training set. In standard statistics,
194 | a rule of thumb is to have fewer parameters than samples in your dataset.
195 | How do you think it is possible to train a model that has so many parameters
196 | compared to this number of samples?
197 | 
198 | \begin{itemize}
199 | \item
200 | Number of parameters: 480,000
201 | \item
202 | Number of sentences in the training set: 35,000
203 | \item
204 | It is possible because we add regularization to the model; in this particular case
205 | we use dropout, which can be seen as training an ensemble of similar models
206 | that are constrained to share weights.
207 | \end{itemize}
208 | 
209 | \item
210 | For a single sentence, the LSTM has states $h_1, \ldots, h_T$ where $T$ is
211 | the number of words in the sentence. The sentence embedding that is fed to
212 | the classifier is thus computed as $f(h_1, \ldots, h_T)$. What is the exact
213 | form of $f(h_1, \ldots, h_T)$ used in the python script?
214 | 
215 | Keras implements the LSTM according to the following recurrence, where the products
216 | involving gate activations and state vectors are element-wise:
217 | \begin{center}
218 | $f_t = \sigma(W_f x_t + U_f h_{t - 1} + b_f), \quad i_t = \sigma(W_i x_t + U_i h_{t - 1} + b_i)$
219 | 
220 | $C_t = f_t \odot C_{t - 1} + i_t \odot \tanh(W_c x_t + U_c h_{t - 1} + b_c)$
221 | 
222 | $o_t = \sigma(W_o x_t + U_o h_{t - 1} + b_o), \quad h_t = o_t \odot \tanh(C_t)$
223 | \end{center}
224 | 
225 | If the flag return\_sequences is set to False (which is the default), then
226 | $f(h_1,\ldots,h_T) = h_T$, i.e. only the last hidden state is fed to the classifier,
227 | which is what the script uses; if it is True, the layer returns the whole sequence $[h_1; \ldots; h_T]$.
228 | \item
229 | Plot the evolution of the train and valid accuracy per epoch, and write the
230 | test errors that you obtain.
231 | 
232 | \begin{figure}[ht]
233 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/lstm_no_dropout_loss_acc.png}
234 | \caption{Evolution of the loss and accuracy with no dropout.}
235 | \end{figure}
236 | 
237 | \begin{figure}[ht]
238 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/lstm_dropout_loss_acc.png}
239 | \caption{Evolution of the loss and accuracy with dropout.}
240 | \end{figure}
241 | 
242 | \begin{itemize}
243 | \item
244 | Results without dropout: loss = 0.656337736861, error = 18.0867\%
245 | 
246 | \item
247 | Results with dropout: loss = 0.377996407843, error = 16.6067\%
248 | \end{itemize}
249 | 
250 | \item
251 | Explain what is the difference between SGD and Adam.
252 | 
253 | SGD updates every parameter with the same global learning rate, applied directly to the
254 | current mini-batch gradient. Adam instead keeps an exponentially decaying average of the
255 | past gradients and of the past squared gradients, and divides the first by the square root
256 | of the second; each parameter therefore gets its own effective step size, which shrinks
257 | for the parameters that have received the largest gradients so far.
258 | \end{enumerate}
259 | 
260 | \section{Simple ConvNet for Sequence Classification}
261 | 
262 | \begin{enumerate}
263 | \item
264 | Report the results (test loss and test error) that you obtain.
265 | 
266 | Loss = 0.361414234543
267 | 
268 | Error = 16.3333\%
269 | 
270 | \begin{figure}[ht]
271 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/cnn_loss_acc.png}
272 | \caption{Evolution of the loss and accuracy with 1D Convolution.}
273 | \end{figure}
274 | 
275 | \item
276 | What is the input and output shape of Convolution1D?
277 | 
278 | Input shape: $32 \times 80 \times 16$
279 | 
280 | Output shape: $32 \times 78 \times 250$
281 | 
282 | \item
283 | Build a model where, on top of the convolution, you have an LSTM. This means
284 | that the input of the LSTM will be the output of your ConvNet. Run the model
285 | with the best parameters you find. Report your best results.
286 | 
287 | Loss = 0.338560722351
288 | 
289 | Error = 14.6733\%
290 | 
291 | \begin{figure}[ht]
292 | \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{img/cnn_lstm_loss_acc.png}
293 | \caption{Evolution of the loss and accuracy with LSTM on top of 1D Convolution.}
294 | \end{figure}
295 | \end{enumerate}
296 | 
297 | \end{document}
298 | 
--------------------------------------------------------------------------------
/report/nips15submit_e.sty:
--------------------------------------------------------------------------------
1 | %%%% NIPS Macros (LaTex)
2 | %%%% Style File
3 | %%%% Dec 12, 1990 Rev Aug 14, 1991; Sept, 1995; April, 1997; April, 1999
4 | 
5 | % This file can be used with Latex2e whether running in main mode, or
6 | % 2.09 compatibility mode.
7 | %
8 | % If using main mode, you need to include the commands
9 | % \documentclass{article}
10 | % \usepackage{nips10submit_e,times}
11 | % as the first lines in your document. Or, if you do not have Times
12 | % Roman font available, you can just use
13 | % \documentclass{article}
14 | % \usepackage{nips10submit_e}
15 | % instead.
16 | %
17 | % If using 2.09 compatibility mode, you need to include the command
18 | % \documentstyle[nips10submit_09,times]{article}
19 | % as the first line in your document. Or, if you do not have Times
20 | % Roman font available, you can include the command
21 | % \documentstyle[nips10submit_09]{article}
22 | % instead.
23 | 
24 | % Change the overall width of the page.
If these parameters are 25 | % changed, they will require corresponding changes in the 26 | % maketitle section. 27 | % 28 | \usepackage{eso-pic} % used by \AddToShipoutPicture 29 | 30 | \renewcommand{\topfraction}{0.95} % let figure take up nearly whole page 31 | \renewcommand{\textfraction}{0.05} % let figure take up nearly whole page 32 | 33 | % Define nipsfinal, set to true if nipsfinalcopy is defined 34 | \newif\ifnipsfinal 35 | \nipsfinalfalse 36 | \def\nipsfinalcopy{\nipsfinaltrue} 37 | \font\nipstenhv = phvb at 8pt % *** IF THIS FAILS, SEE nips10submit_e.sty *** 38 | 39 | % Specify the dimensions of each page 40 | 41 | \setlength{\paperheight}{11in} 42 | \setlength{\paperwidth}{8.5in} 43 | 44 | \oddsidemargin .5in % Note \oddsidemargin = \evensidemargin 45 | \evensidemargin .5in 46 | \marginparwidth 0.07 true in 47 | %\marginparwidth 0.75 true in 48 | %\topmargin 0 true pt % Nominal distance from top of page to top of 49 | %\topmargin 0.125in 50 | \topmargin -0.625in 51 | \addtolength{\headsep}{0.25in} 52 | \textheight 9.0 true in % Height of text (including footnotes & figures) 53 | \textwidth 5.5 true in % Width of text line. 54 | \widowpenalty=10000 55 | \clubpenalty=10000 56 | 57 | % \thispagestyle{empty} \pagestyle{empty} 58 | \flushbottom \sloppy 59 | 60 | % We're never going to need a table of contents, so just flush it to 61 | % save space --- suggested by drstrip@sandia-2 62 | \def\addcontentsline#1#2#3{} 63 | 64 | % Title stuff, taken from deproc. 65 | \def\maketitle{\par 66 | \begingroup 67 | \def\thefootnote{\fnsymbol{footnote}} 68 | \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} % for perfect author 69 | % name centering 70 | % The footnote-mark was overlapping the footnote-text, 71 | % added the following to fix this problem (MK) 72 | \long\def\@makefntext##1{\parindent 1em\noindent 73 | \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1} 74 | \@maketitle \@thanks 75 | \endgroup 76 | \setcounter{footnote}{0} 77 | \let\maketitle\relax \let\@maketitle\relax 78 | \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax} 79 | 80 | % The toptitlebar has been raised to top-justify the first page 81 | 82 | % Title (includes both anonimized and non-anonimized versions) 83 | \def\@maketitle{\vbox{\hsize\textwidth 84 | \linewidth\hsize \vskip 0.1in \toptitlebar \centering 85 | {\LARGE\bf \@title\par} \bottomtitlebar % \vskip 0.1in % minus 86 | \ifnipsfinal 87 | \def\And{\end{tabular}\hfil\linebreak[0]\hfil 88 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% 89 | \def\AND{\end{tabular}\hfil\linebreak[4]\hfil 90 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% 91 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\@author\end{tabular}% 92 | \else 93 | \begin{tabular}[t]{c}\bf\rule{\z@}{24pt} 94 | Anonymous Author(s) \\ 95 | Affiliation \\ 96 | Address \\ 97 | \texttt{email} \\ 98 | \end{tabular}% 99 | \fi 100 | \vskip 0.3in minus 0.1in}} 101 | 102 | \renewenvironment{abstract}{\vskip.075in\centerline{\large\bf 103 | Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex} 104 | 105 | % sections with less space 106 | \def\section{\@startsection {section}{1}{\z@}{-2.0ex plus 107 | -0.5ex minus -.2ex}{1.5ex plus 0.3ex 108 | minus0.2ex}{\large\bf\raggedright}} 109 | 110 | \def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus 111 | -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright}} 112 | \def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex 113 | plus -0.5ex minus -.2ex}{0.5ex plus 114 | .2ex}{\normalsize\bf\raggedright}} 115 | 
\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus 116 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 117 | \def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus 118 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 119 | \def\subsubsubsection{\vskip 120 | 5pt{\noindent\normalsize\rm\raggedright}} 121 | 122 | 123 | % Footnotes 124 | \footnotesep 6.65pt % 125 | \skip\footins 9pt plus 4pt minus 2pt 126 | \def\footnoterule{\kern-3pt \hrule width 12pc \kern 2.6pt } 127 | \setcounter{footnote}{0} 128 | 129 | % Lists and paragraphs 130 | \parindent 0pt 131 | \topsep 4pt plus 1pt minus 2pt 132 | \partopsep 1pt plus 0.5pt minus 0.5pt 133 | \itemsep 2pt plus 1pt minus 0.5pt 134 | \parsep 2pt plus 1pt minus 0.5pt 135 | \parskip .5pc 136 | 137 | 138 | %\leftmargin2em 139 | \leftmargin3pc 140 | \leftmargini\leftmargin \leftmarginii 2em 141 | \leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em 142 | 143 | %\labelsep \labelsep 5pt 144 | 145 | \def\@listi{\leftmargin\leftmargini} 146 | \def\@listii{\leftmargin\leftmarginii 147 | \labelwidth\leftmarginii\advance\labelwidth-\labelsep 148 | \topsep 2pt plus 1pt minus 0.5pt 149 | \parsep 1pt plus 0.5pt minus 0.5pt 150 | \itemsep \parsep} 151 | \def\@listiii{\leftmargin\leftmarginiii 152 | \labelwidth\leftmarginiii\advance\labelwidth-\labelsep 153 | \topsep 1pt plus 0.5pt minus 0.5pt 154 | \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt 155 | \itemsep \topsep} 156 | \def\@listiv{\leftmargin\leftmarginiv 157 | \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} 158 | \def\@listv{\leftmargin\leftmarginv 159 | \labelwidth\leftmarginv\advance\labelwidth-\labelsep} 160 | \def\@listvi{\leftmargin\leftmarginvi 161 | \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} 162 | 163 | \abovedisplayskip 7pt plus2pt minus5pt% 164 | \belowdisplayskip \abovedisplayskip 165 | \abovedisplayshortskip 0pt plus3pt% 166 | \belowdisplayshortskip 4pt plus3pt minus3pt% 167 | 168 | % Less leading in most fonts (due to the narrow columns) 169 | % The choices were between 1-pt and 1.5-pt leading 170 | %\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} % got rid of @ (MK) 171 | \def\normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} 172 | \def\small{\@setsize\small{10pt}\ixpt\@ixpt} 173 | \def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt} 174 | \def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt} 175 | \def\tiny{\@setsize\tiny{7pt}\vipt\@vipt} 176 | \def\large{\@setsize\large{14pt}\xiipt\@xiipt} 177 | \def\Large{\@setsize\Large{16pt}\xivpt\@xivpt} 178 | \def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt} 179 | \def\huge{\@setsize\huge{23pt}\xxpt\@xxpt} 180 | \def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt} 181 | 182 | \def\toptitlebar{\hrule height4pt\vskip .25in\vskip-\parskip} 183 | 184 | \def\bottomtitlebar{\vskip .29in\vskip-\parskip\hrule height1pt\vskip 185 | .09in} % 186 | %Reduced second vskip to compensate for adding the strut in \@author 187 | 188 | % Vertical Ruler 189 | % This code is, largely, from the CVPR 2010 conference style file 190 | % ----- define vruler 191 | \makeatletter 192 | \newbox\nipsrulerbox 193 | \newcount\nipsrulercount 194 | \newdimen\nipsruleroffset 195 | \newdimen\cv@lineheight 196 | \newdimen\cv@boxheight 197 | \newbox\cv@tmpbox 198 | \newcount\cv@refno 199 | \newcount\cv@tot 200 | % NUMBER with left flushed zeros \fillzeros[] 201 | \newcount\cv@tmpc@ \newcount\cv@tmpc 202 | \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi 203 | \cv@tmpc=1 % 204 | \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ 
by 10 \advance\cv@tmpc by 1 \fi 205 | \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat 206 | \ifnum#2<0\advance\cv@tmpc1\relax-\fi 207 | \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat 208 | \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% 209 | % \makevruler[][][][][] 210 | \def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip 211 | \textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% 212 | \global\setbox\nipsrulerbox=\vbox to \textheight{% 213 | {\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight 214 | \cv@lineheight=#1\global\nipsrulercount=#2% 215 | \cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% 216 | \cv@refno1\vskip-\cv@lineheight\vskip1ex% 217 | \loop\setbox\cv@tmpbox=\hbox to0cm{{\nipstenhv\hfil\fillzeros[#4]\nipsrulercount}}% 218 | \ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break 219 | \advance\cv@refno1\global\advance\nipsrulercount#3\relax 220 | \ifnum\cv@refno<\cv@tot\repeat}}\endgroup}% 221 | \makeatother 222 | % ----- end of vruler 223 | 224 | % \makevruler[][][][][] 225 | \def\nipsruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\nipsrulerbox}} 226 | \AddToShipoutPicture{% 227 | \ifnipsfinal\else 228 | \nipsruleroffset=\textheight 229 | \advance\nipsruleroffset by -3.7pt 230 | \color[rgb]{.7,.7,.7} 231 | \AtTextUpperLeft{% 232 | \put(\LenToUnit{-35pt},\LenToUnit{-\nipsruleroffset}){%left ruler 233 | \nipsruler{\nipsrulercount}} 234 | } 235 | \fi 236 | } 237 | -------------------------------------------------------------------------------- /sentenceclassification/cnn_imdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | '''This example demonstrates the use of Convolution1D for text classification. 3 | 90s/epoch on Intel i5 2.4Ghz CPU. 4 | 10s/epoch on Tesla K40 GPU. 5 | ''' 6 | 7 | ''' 8 | Simple Convolution1D for Sequence Classification 9 | ''' 10 | 11 | ########################## 12 | ## Importing packages 13 | ########################## 14 | # importing packages/function that will be useful later 15 | import numpy as np 16 | np.random.seed(1234) # for reproducibility (manually setting random seed) 17 | 18 | from keras.preprocessing import sequence 19 | from keras.models import Sequential 20 | from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, SimpleRNN, GRU 21 | from keras.layers import Convolution1D, GlobalMaxPooling1D 22 | from utils import load_imdb 23 | 24 | import matplotlib.pyplot as plt 25 | 26 | ########################## 27 | ## Preparing data 28 | ########################## 29 | # some parameters 30 | vocab_size = 5000 # number of words considered in the vocabulary 31 | train_split = 0.7 # ratio of train sentences 32 | 33 | # Preparing data is usually the most time-consuming part of machine learning. 34 | # Luckily for you, the imdb dataset has already been preprocessed and included in Keras. 35 | (X_train, y_train), (X_test, y_test) = load_imdb(nb_words=vocab_size, train_split=train_split) 36 | 37 | print(len(X_train), 'train sequences') 38 | print(len(X_test), 'test sequences') 39 | 40 | ## Padding input data 41 | # Models in Keras (and elsewhere) usually take as input batches of sentences of the same length. 42 | # Since sentences usually have different sizes, we "pad" sentences (we add a dummy "padding" token at the end of the 43 | # sentences. 
The input thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence 44 | # in the batch. 45 | 46 | maxlen = 80 # cut texts after this number of words (among top vocab_size most common words) 47 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 48 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 49 | print('X_train shape:', X_train.shape) 50 | print('X_test shape:', X_test.shape) 51 | 52 | ########################## 53 | ## Building model 54 | ########################## 55 | 56 | embed_dim = 16 57 | nhid = 128 58 | print('\nBuilding model...') 59 | 60 | nb_filter = 250 61 | filter_length = 3 62 | hidden_dims = 250 63 | 64 | model = Sequential() 65 | # we start off with an efficient embedding layer which maps 66 | # our vocab indices into embedding_dims dimensions 67 | model.add(Embedding(vocab_size, 68 | embed_dim, 69 | input_length=maxlen, 70 | dropout=0.2)) 71 | 72 | # we add a Convolution1D, which will learn nb_filter 73 | # word group filters of size filter_length: 74 | model.add(Convolution1D(nb_filter=nb_filter, 75 | filter_length=filter_length, 76 | border_mode='valid', 77 | activation='relu', 78 | subsample_length=1)) 79 | # we use temporal max pooling: 80 | model.add(GlobalMaxPooling1D()) 81 | 82 | # We add a classifier (MLP with one hidden layer) 83 | model.add(Dense(hidden_dims)) 84 | model.add(Dropout(0.2)) 85 | model.add(Activation('relu')) 86 | model.add(Dense(1)) 87 | model.add(Activation('sigmoid')) 88 | 89 | 90 | ########################## 91 | ## Define (i) loss function 92 | # (ii) optimizer 93 | # (iii) metrics 94 | ########################## 95 | 96 | loss_classif = 'binary_crossentropy' 97 | optimizer = 'adam' # or sgd 98 | metrics_classif = ['accuracy'] 99 | 100 | model.compile(loss=loss_classif, 101 | optimizer=optimizer, 102 | metrics=metrics_classif) 103 | 104 | print(model.summary()) 105 | print('Built model') 106 | ########################## 107 | ## Train Model 108 | ########################## 109 | validation_split = 0.2 # Held-out ("validation") data to test on. 
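# Added note: with Keras's validation_split, the held-out examples are taken from the *last*
# fraction of X_train/y_train (before any shuffling); they are only used to compute
# val_loss/val_acc after each epoch and never contribute to the weight updates.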
110 | batch_size = 32 # size of the minibach (each batch will contain 32 sentences) 111 | nb_epoch = 6 112 | 113 | print('\n\nStarting training of the model\n') 114 | history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.2) 115 | 116 | plt.figure(1) 117 | plt.subplot(1,2,1) 118 | plt.plot(range(1,nb_epoch + 1), history.history['loss'], 'b', range(1,nb_epoch + 1), history.history['val_loss'], 'r') 119 | plt.ylabel('loss') 120 | plt.xlabel('epoch') 121 | plt.legend(['train', 'validation'], loc='upper left') 122 | 123 | plt.subplot(1,2,2) 124 | plt.plot(range(1,nb_epoch + 1), history.history['acc'], 'b', range(1,nb_epoch + 1), history.history['val_acc'], 'r') 125 | plt.ylabel('accuracy') 126 | plt.xlabel('epoch') 127 | plt.legend(['train', 'validation'], loc='upper left') 128 | plt.show() 129 | 130 | ########################## 131 | ## Evaluate on test set 132 | ########################## 133 | # evaluate model on test set (never seen during training) 134 | score, acc = model.evaluate(X_test, y_test, 135 | batch_size=batch_size) 136 | print('\n\nTest score:', score) 137 | print('Test accuracy:', acc) 138 | -------------------------------------------------------------------------------- /sentenceclassification/cnn_lstm_imdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | ########################## 4 | ## Importing packages 5 | ########################## 6 | import numpy as np 7 | np.random.seed(1234) # for reproducibility (manually setting random seed) 8 | 9 | from keras.preprocessing import sequence 10 | from keras.models import Sequential 11 | from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, SimpleRNN, GRU 12 | from keras.layers import Convolution1D 13 | from keras.optimizers import SGD, Adam 14 | from utils import load_imdb 15 | 16 | import matplotlib.pyplot as plt 17 | 18 | ########################## 19 | ## Preparing data 20 | ########################## 21 | vocab_size = 5000 # number of words considered in the vocabulary 22 | train_split = 0.7 # ratio of train sentences 23 | 24 | (X_train, y_train), (X_test, y_test) = load_imdb(nb_words=vocab_size, train_split=train_split) 25 | print(len(X_train), 'train sequences') 26 | print(len(X_test), 'test sequences') 27 | 28 | ## Padding input data 29 | maxlen = 80 # cut texts after this number of words (among top vocab_size most common words) 30 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 31 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 32 | print('X_train shape:', X_train.shape) 33 | print('X_test shape:', X_test.shape) 34 | 35 | ########################## 36 | ## Building model 37 | ########################## 38 | embed_dim = 25 39 | nhid = 128 40 | nb_filter = 250 41 | filter_length = 3 42 | hidden_dims = 250 43 | print('\nBuilding model...') 44 | 45 | model = Sequential() 46 | model.add(Embedding(vocab_size, 47 | embed_dim, 48 | input_length=maxlen, 49 | dropout=0.3)) 50 | model.add(Convolution1D(nb_filter=nb_filter, 51 | filter_length=filter_length, 52 | border_mode='valid', 53 | activation='relu', 54 | subsample_length=1)) 55 | model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2)) 56 | 57 | model.add(Dense(hidden_dims)) 58 | model.add(Dropout(0.5)) 59 | model.add(Activation('relu')) 60 | model.add(Dense(1)) 61 | model.add(Activation('sigmoid')) 62 | 63 | ########################## 64 | ## Define (i) loss function 65 | # (ii) optimizer 66 | # (iii) metrics 67 | 
########################## 68 | loss_classif = 'binary_crossentropy' 69 | optimizer = 'adam' # or sgd 70 | metrics_classif = ['accuracy'] 71 | 72 | model.compile(loss=loss_classif, 73 | optimizer=optimizer, 74 | metrics=metrics_classif) 75 | 76 | print(model.summary()) 77 | print('Built model') 78 | 79 | ########################## 80 | ## Train Model 81 | ########################## 82 | validation_split = 0.2 # Held-out ("validation") data to test on. 83 | batch_size = 64 # size of the minibach (each batch will contain 32 sentences) 84 | nb_epoch = 7 85 | 86 | print('\n\nStarting training of the model\n') 87 | history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.2) 88 | 89 | plt.figure(1) 90 | plt.subplot(1,2,1) 91 | plt.plot(range(1,nb_epoch + 1), history.history['loss'], 'b', range(1,nb_epoch + 1), history.history['val_loss'], 'r') 92 | plt.ylabel('loss') 93 | plt.xlabel('epoch') 94 | plt.legend(['train', 'validation'], loc='upper left') 95 | 96 | plt.subplot(1,2,2) 97 | plt.plot(range(1,nb_epoch + 1), history.history['acc'], 'b', range(1,nb_epoch + 1), history.history['val_acc'], 'r') 98 | plt.ylabel('accuracy') 99 | plt.xlabel('epoch') 100 | plt.legend(['train', 'validation'], loc='upper left') 101 | plt.show() 102 | 103 | ########################## 104 | ## Evaluate on test set 105 | ########################## 106 | # evaluate model on test set (never seen during training) 107 | score, acc = model.evaluate(X_test, y_test, 108 | batch_size=batch_size) 109 | print('\n\nTest score:', score) 110 | print('Test accuracy:', acc) 111 | -------------------------------------------------------------------------------- /sentenceclassification/lstm_imdb.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Simple LSTM for Sequence Classification 3 | ''' 4 | 5 | ########################## 6 | ## Importing packages 7 | ########################## 8 | # importing packages/function that will be useful later 9 | from __future__ import print_function 10 | import numpy as np 11 | np.random.seed(1234) # for reproducibility (manually setting random seed) 12 | 13 | from keras.preprocessing import sequence 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, SimpleRNN, GRU 16 | from utils import load_imdb 17 | 18 | import matplotlib.pyplot as plt 19 | 20 | ########################## 21 | ## Preparing data 22 | ########################## 23 | # some parameters 24 | vocab_size = 15000 # number of words considered in the vocabulary 25 | train_split = 0.7 # ratio of train sentences 26 | 27 | # Preparing data is usually the most time-consuming part of machine learning. 28 | # Luckily for you, the imdb dataset has already been preprocessed and included in Keras. 29 | (X_train, y_train), (X_test, y_test) = load_imdb(nb_words=vocab_size, train_split=train_split) 30 | 31 | print(len(X_train), 'train sequences') 32 | print(len(X_test), 'test sequences') 33 | 34 | ## Padding input data 35 | # Models in Keras (and elsewhere) usually take as input batches of sentences of the same length. 36 | # Since sentences usually have different sizes, we "pad" sentences (we add a dummy "padding" token at the end of the 37 | # sentences. The input thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence 38 | # in the batch. 
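# Added note: by default sequence.pad_sequences pads with zeros at the *beginning* of each
# review (padding='pre') and, for reviews longer than maxlen, keeps only the last maxlen
# word indices (truncating='pre'); e.g. pad_sequences([[5, 9, 3]], maxlen=5) -> [[0, 0, 5, 9, 3]].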
39 | 
40 | maxlen = 80  # cut texts after this number of words (among top vocab_size most common words)
41 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
42 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
43 | print('X_train shape:', X_train.shape)
44 | print('X_test shape:', X_test.shape)
45 | 
46 | ##########################
47 | ## Building model
48 | ##########################
49 | 
50 | embed_dim = 32  # word embedding dimension
51 | nhid = 64  # number of hidden units in the LSTM
52 | print('\nBuilding model...')
53 | 
54 | model = Sequential()
55 | if False:  # True: plain embedding + LSTM; False (current setting): use dropout
56 |     model.add(Embedding(vocab_size, embed_dim))
57 |     model.add(LSTM(nhid))
58 | else:
59 |     model.add(Embedding(vocab_size, embed_dim, dropout=0.2))
60 |     model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2))
61 | model.add(Dense(1))
62 | model.add(Activation('sigmoid'))
63 | 
64 | print('Built model')
65 | 
66 | # In Keras, Torch and other deep learning frameworks, we create a "container", which is the Sequential() module.
67 | # Then we add components to this container: the lookup table, the LSTM, the classifier, etc.
68 | # All of these components are contained in the Sequential() and are trained together.
69 | 
70 | ##########################
71 | ## Define (i) loss function
72 | #        (ii) optimizer
73 | #        (iii) metrics
74 | ##########################
75 | 
76 | loss_classif = 'binary_crossentropy'
77 | optimizer = 'adam'  # or sgd
78 | metrics_classif = ['accuracy']
79 | 
80 | # note that this is especially easy in Keras: one code line
81 | print('\nCompiling model')
82 | model.compile(loss=loss_classif,
83 |               optimizer=optimizer,
84 |               metrics=metrics_classif)
85 | print(model.summary())
86 | print('Compiled model')
87 | 
88 | ##########################
89 | ## Train Model
90 | ##########################
91 | validation_split = 0.2  # Held-out ("validation") data to test on.
92 | batch_size = 32  # size of the minibatch (each batch will contain 32 sentences)
93 | nb_epoch = 6
94 | 
95 | # history is just an object that contains information about training.
96 | # Look at the following line and enjoy how simple it is to train a neural network in Keras.
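# Added note: history.history is a dict mapping each tracked metric to a list with one value
# per epoch; with metrics=['accuracy'] and a validation split, its keys here are 'loss',
# 'acc', 'val_loss' and 'val_acc', which are exactly the series plotted below.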
97 | print('\n\nStarting training of the model\n') 98 | history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=validation_split) 99 | 100 | plt.figure(1) 101 | plt.subplot(1,2,1) 102 | plt.plot(range(1,nb_epoch + 1), history.history['loss'], 'b', range(1,nb_epoch + 1), history.history['val_loss'], 'r') 103 | plt.ylabel('loss') 104 | plt.xlabel('epoch') 105 | plt.legend(['train', 'validation'], loc='upper left') 106 | 107 | plt.subplot(1,2,2) 108 | plt.plot(range(1,nb_epoch + 1), history.history['acc'], 'b', range(1,nb_epoch + 1), history.history['val_acc'], 'r') 109 | plt.ylabel('accuracy') 110 | plt.xlabel('epoch') 111 | plt.legend(['train', 'validation'], loc='upper left') 112 | plt.show() 113 | 114 | ########################## 115 | ## Evaluate on test set 116 | ########################## 117 | # evaluate model on test set (never seen during training) 118 | score, acc = model.evaluate(X_test, y_test, 119 | batch_size=batch_size) 120 | print('\n\nTest loss:', score) 121 | print('Test accuracy:', acc) 122 | -------------------------------------------------------------------------------- /sentenceclassification/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.datasets import imdb 3 | 4 | 5 | def load_imdb(nb_words, train_split=0.8): 6 | print 'Preparing IMDB-review sentence classification dataset with {0} % training data ...'.format(train_split*100) 7 | (X_1, y_1), (X_2, y_2) = imdb.load_data(nb_words=nb_words) 8 | X = np.array([x for x in X_1] + [x for x in X_2]) 9 | Y = np.array([y for y in y_1] + [y for y in y_2]) 10 | X_train, y_train = X[:int(train_split * len(X))], Y[:int(train_split * len(Y))] 11 | X_test, y_test = X[int(train_split * len(X)):], Y[int(train_split * len(Y)):] 12 | 13 | return (X_train, y_train), (X_test, y_test) -------------------------------------------------------------------------------- /word2vec/embedding_word2vec.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | from operator import itemgetter 3 | from random import randint 4 | 5 | import numpy as np 6 | import logging 7 | reload(logging) 8 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S') 9 | 10 | from gensim.models import word2vec 11 | 12 | 13 | def avg_word2vec(model, dataset='data/snli.test'): 14 | array_sentences = [] 15 | array_embeddings = [] 16 | with open(dataset) as f: 17 | for line in f: 18 | avgword2vec = None 19 | cont = 0 20 | for word in line.split(): 21 | # get embedding (if it exists) of each word in the sentence 22 | if word in model.wv.vocab: 23 | cont += 1 24 | if avgword2vec is None: 25 | avgword2vec = model[word] 26 | else: 27 | avgword2vec = avgword2vec + model[word] 28 | # if at least one word in the sentence has a word embeddings : 29 | if avgword2vec is not None: 30 | avgword2vec = avgword2vec / cont # normalize sum 31 | array_sentences.append(line) 32 | array_embeddings.append(avgword2vec) 33 | print 'avg_word2vec: Generated embeddings for {0} sentences from {1} dataset.'.format(len(array_sentences), dataset) 34 | return array_sentences, array_embeddings 35 | 36 | 37 | def cosine_similarity(a, b): 38 | assert len(a) == len(b), 'vectors need to have the same size' 39 | cos_sim = a.dot(b) / sqrt(a.dot(a)) / sqrt(b.dot(b)) 40 | return cos_sim 41 | 42 | 43 | def most_similar(idx, array_embeddings, array_sentences): 44 | query_sentence = 
array_sentences[idx] 45 | query_embed = array_embeddings[idx] 46 | list_scores = {} 47 | for i in range(idx) + range(idx + 1, len(array_sentences)): 48 | list_scores[i] = cosine_similarity(query_embed, array_embeddings[i]) 49 | closest_idx = max(list_scores, key=list_scores.get) 50 | 51 | print 'The query :\n' 52 | print query_sentence + '\n' 53 | print 'is most similar to\n' 54 | print array_sentences[closest_idx] 55 | print 'with a score of : {0}\n'.format(list_scores[closest_idx]) 56 | 57 | print '5 most similar sentences:' 58 | closest_5 = sorted(list_scores.iteritems(), key=itemgetter(1), reverse=True)[:5] 59 | for i, score in closest_5: 60 | print array_sentences[i], score 61 | 62 | return closest_idx 63 | 64 | def most_5_similar(idx, array_embeddings, array_sentences): 65 | query_sentence = array_sentences[idx] 66 | query_embed = array_embeddings[idx] 67 | list_scores = {} 68 | for i in range(idx) + range(idx + 1, len(array_sentences)): 69 | list_scores[i] = cosine_similarity(query_embed, array_embeddings[i]) 70 | 71 | closest_5 = sorted(list_scores.iteritems(), key=itemgetter(1), reverse=True)[:5] 72 | closest_5_idx = [i for i, score in closest_5] 73 | 74 | assert len(closest_5_idx) == 5 75 | 76 | return closest_5_idx 77 | 78 | 79 | def IDF(dataset='data/snli.test'): 80 | # Compute IDF (Inverse Document Frequency). Here a "document" is a sentence. 81 | # word2idf['peach'] = IDF(peach) 82 | df = {} 83 | N = 0 84 | with open(dataset) as f: 85 | for line in f: 86 | N += 1 87 | sentence = line.split() 88 | sentence = np.unique(sentence) 89 | for word in sentence: 90 | if word in df: 91 | df[word] += 1 92 | else: 93 | df[word] = 1 94 | 95 | word2idf = {} 96 | for k,v in df.iteritems(): 97 | word2idf[k] = np.log(float(N) / v) 98 | 99 | return word2idf 100 | 101 | def avg_word2vec_idf(model, word2idf, dataset='data/snli.test'): 102 | array_sentences = [] 103 | array_embeddings = [] 104 | with open(dataset) as f: 105 | for line in f: 106 | avgword2vec = None 107 | sumidf = 0 108 | for word in line.split(): 109 | # get embedding (if it exists) of each word in the sentence 110 | if word in model.wv.vocab: 111 | sumidf += word2idf[word] 112 | if avgword2vec is None: 113 | avgword2vec = word2idf[word] * model[word] 114 | else: 115 | avgword2vec = avgword2vec + word2idf[word] * model[word] 116 | # if at least one word in the sentence has a word embeddings : 117 | if avgword2vec is not None: 118 | avgword2vec = avgword2vec / sumidf # normalize sum 119 | array_sentences.append(line) 120 | array_embeddings.append(avgword2vec) 121 | print 'avg_word2vec_idf: Generated embeddings for {0} sentences from {1} dataset.'.format(len(array_sentences), dataset) 122 | return array_sentences, array_embeddings 123 | 124 | if __name__ == "__main__": 125 | 126 | if False: # FIRST PART 127 | sentences = word2vec.Text8Corpus('data/text8') 128 | 129 | # Train a word2vec model 130 | embedding_size = 200 131 | model = word2vec.Word2Vec(sentences, size=embedding_size) 132 | 133 | # Train a word2vec model with phrases 134 | bigram_transformer = gensim.models.Phrases(sentences) 135 | model_phrase = Word2Vec(bigram_transformer[sentences], size=200) 136 | else: 137 | # Loading model trained on words 138 | model = word2vec.Word2Vec.load('models/text8.model') 139 | 140 | # Loading model enhanced with phrases (2-grams) 141 | model_phrase = word2vec.Word2Vec.load('models/text8.phrase.model') 142 | 143 | """ 144 | SECOND PART: Investigating word2vec word embeddings space 145 | """ 146 | 147 | # Words that are similar are close 
in the sense of the cosine similarity. 148 | sim = model.similarity('woman', 'man') 149 | print 'Printing word similarity between "woman" and "man" : {0}'.format(sim) 150 | 151 | sim = model.similarity('apple', 'mac') 152 | print 'Printing word similarity between "apple" and "mac" : {0}'.format(sim) 153 | 154 | sim = model.similarity('apple', 'peach') 155 | print 'Printing word similarity between "apple" and "peach" : {0}'.format(sim) 156 | 157 | sim = model.similarity('banana', 'peach') 158 | print 'Printing word similarity between "banana" and "peach" : {0}'.format(sim) 159 | 160 | # And words that appear in the same context have similar word embeddings. 161 | print model.most_similar(['paris'])[0] 162 | print model_phrase.most_similar(['paris'])[0] 163 | 164 | words = ['apple', 'peach', 'banana', 'car'] 165 | 166 | for word in words: 167 | print word 168 | print model.most_similar([word]) 169 | 170 | print "science - scientist + mathematician" 171 | print model.most_similar(positive=['science', 'mathematician'], negative=['scientist'])[0] 172 | print "science - scientist + physicist" 173 | print model.most_similar(positive=['science', 'physicist'], negative=['scientist'])[0] 174 | print "science - scientist + philosopher" 175 | print model.most_similar(positive=['science', 'philosopher'], negative=['scientist'])[0] 176 | print "science - scientist + astronomer" 177 | print model.most_similar(positive=['science', 'astronomer'], negative=['scientist'])[0] 178 | print "science - scientist + biologist" 179 | print model.most_similar(positive=['science', 'biologist'], negative=['scientist'])[0] 180 | 181 | print model.most_similar(['difficult']) 182 | print model_phrase.most_similar(['difficult']) 183 | 184 | print model_phrase.most_similar(['clinton']) 185 | 186 | # Compositionality and structure in word2vec space 187 | print model.most_similar(positive=['woman', 'king'], negative=['man'])[0] 188 | 189 | print model.most_similar(positive=['france', 'berlin'], negative=['germany'])[0] 190 | 191 | """ 192 | THIRD PART: Sentence embeddings with average(word2vec) 193 | """ 194 | data_path = 'data/snli.test' 195 | array_sentences, array_embeddings = avg_word2vec(model, dataset=data_path) 196 | 197 | query_idx = 777 # random sentence 198 | assert query_idx < len(array_sentences) # little check 199 | 200 | # array_sentences[closest_idx] will be the closest sentence to array_sentences[query_idx]. 201 | closest_idx = most_similar(query_idx, array_embeddings, array_sentences) 202 | 203 | closest_5_idx = most_5_similar(query_idx, array_embeddings, array_sentences) 204 | 205 | print 'Most 5 similar:\n' 206 | for idx in closest_5_idx: 207 | print array_sentences[idx] 208 | 209 | """ 210 | FOURTH PART: Weighted average of word vectors with IDF. 211 | """ 212 | word2idf = IDF(data_path) 213 | 214 | words = ['the', 'a' , 'clinton', 'woman', 'man', 'apple', 'peach', 'banana', 'mac', 'paris', 'france'] 215 | 216 | for word in words: 217 | if word in word2idf: 218 | print word, word2idf[word] 219 | else: 220 | print word, "not found" 221 | 222 | array_sentences_idf, array_embeddings_idf = avg_word2vec_idf(model, word2idf, dataset=data_path) 223 | closest_idx_idf = most_similar(query_idx, array_embeddings_idf, array_sentences_idf) 224 | --------------------------------------------------------------------------------
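A possible follow-up to the FOURTH PART of word2vec/embedding_word2vec.py (not part of the original script, sketched here for illustration) is to also print the five closest sentences under the IDF-weighted embeddings, mirroring what the THIRD PART does for the plain average. It only reuses the most_5_similar helper defined above and assumes the script has already built query_idx, array_sentences_idf and array_embeddings_idf:

    closest_5_idx_idf = most_5_similar(query_idx, array_embeddings_idf, array_sentences_idf)

    print 'Most 5 similar (IDF-weighted):\n'
    for idx in closest_5_idx_idf:
        print array_sentences_idf[idx]

As in the unweighted case, the corresponding scores can be recovered with cosine_similarity(array_embeddings_idf[query_idx], array_embeddings_idf[idx]) if needed.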