├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── bench
│   ├── MainBenchmark.hs
│   └── PCABench.hs
├── main
│   ├── analyze.hs
│   ├── arxiv.hs
│   └── word2vec.hs
├── src
│   ├── Concurrent.hs
│   ├── Crawl.hs
│   ├── Display.hs
│   ├── Huffman.hs
│   ├── Log.hs
│   ├── Model.hs
│   ├── Model
│   │   ├── Array.hs
│   │   ├── IO.hs
│   │   ├── Repa.hs
│   │   └── Types.hs
│   ├── PCA.hs
│   ├── Step.hs
│   ├── Window.hs
│   ├── Words.hs
│   └── Words
│       └── Dictionary.hs
├── stack.yaml
├── test
│   ├── PCASpec.hs
│   └── word2vec-test.hs
└── word2vec.cabal

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.o
3 | *.hi
4 | .#*
5 | .stack-work/
6 | \#*
7 | TAGS
8 | Setup.hs
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "subhask"]
2 | 	path = subhask
3 | 	url = git@github.com:abailly/subhask.git
4 | [submodule "hmatrix-nipals"]
5 | 	path = hmatrix-nipals
6 | 	url = git@github.com:abailly/hmatrix-nipals.git
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014 Arnaud Bailly
2 | 
3 | All rights reserved.
4 | 
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions
7 | are met:
8 | 
9 | 1. Redistributions of source code must retain the above copyright
10 | notice, this list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright
13 | notice, this list of conditions and the following disclaimer in the
14 | documentation and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the author nor the names of his contributors
17 | may be used to endorse or promote products derived from this software
18 | without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
21 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
24 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Word2Vec
2 | 
3 | This is a partial, probably wrong and comparatively slow Haskell port of the [word2vec](https://code.google.com/p/word2vec/)
4 | framework for computing the vector space of a text corpus. I started this development following a
5 | [job post](http://www.haskellers.com/jobs/61): I wanted to know whether or not I was able to develop this piece of software, given
6 | I have not programmed anything related to machine learning since my Master's degree and I am really no *data scientist*,
7 | whatever that means...
8 | 
9 | # What does it do?
10 | 
11 | Following the rules of Zalora's challenge, this code is separated into two parts:
12 | 
13 | 1. `crawl` downloads all documents corresponding to a given query on [Arxiv](http://arxiv.org), a repository of scientific
14 |    papers. It also converts all those PDFs into text documents using the [pdftotext](http://www.foolabs.com/xpdf/home.html)
15 |    utility program which must be installed on the host system,
16 | 2. `word2vec` builds a vector model from all the `.txt` documents in a given directory, and outputs 3 files: a `model.vec` file
17 |    containing the complete description of the neural network trained from the corpora, a `model.pca` file mapping all the words in the
18 |    model into a 2-dimensional space using [Principal Component Analysis](http://www.snl.salk.edu/~shlens/pca.pdf), and a `model.svg`
19 |    image containing the 100 most frequent words of the corpus drawn in 2D according to the previous mapping.
20 | 
21 | On my recent MacBook Air, training 390 documents (2M total words) over 56k different words with 100 dimensions takes
22 | approximately 20 minutes. It looks like the words/sec ratio is around 2000, which is much, much lower than the optimized C and
23 | Python versions but better than the unoptimized pure Python!
24 | 
25 | # How does it work?
26 | 
27 | This piece of code is 99% Haskell, the remaining 1% being the pdftotext utility I did not find a replacement for. The Haskell port
28 | is not the most brilliant code I ever wrote nor is it intensively tested, but it does the job (at least, it
29 | [works on my machine](http://www.codinghorror.com/blog/2007/03/the-works-on-my-machine-certification-program.html)). Here is a
30 | short summary of how I implemented it:
31 | 
32 | * The crawling stuff mostly uses the plain [Network.Browser](http://hackage.haskell.org/package/network) package with a bit of
33 |   [http-client-conduit](http://hackage.haskell.org/package/http-client-conduit). The retrieval of the documents' ids is
34 |   sequential but the download of the PDF files is done concurrently using a custom-built thread pool communicating through plain
35 |   old `MVar`s. I should have used a more sophisticated concurrency package like [async](http://hackage.haskell.org/package/async)
36 |   but...
37 | * There is some parsing of the retrieved pages to be done, which is handled by [tagsoup](http://hackage.haskell.org/package/tagsoup),
38 |   to retrieve the documents' ids and navigate between the pages of the query.
39 | * Text extraction relies on pdftotext which is spawned using standard process control.
40 | * I used the [tokenize](http://hackage.haskell.org/package/tokenize) package to tokenize the corpus into proper words, removing
41 |   anything that does not contain only letters.
42 | * The tricky part starts with the construction of the
43 |   [Huffman encoding](https://www.siggraph.org/education/materials/HyperGraph/video/mpeg/mpegfaq/huffman_tutorial.html) of the
44 |   words sorted by frequency (a short sketch of the idea is shown after this list). I used the [heap](http://hackage.haskell.org/package/heap) package to build the binary tree after
45 |   inserting `(word,frequency)` tuples, and the hashmap from
46 |   [unordered-containers](http://hackage.haskell.org/package/unordered-containers) to map words to their encoding.
47 | * The meat of the algorithm is the training of a neural network made of a `words x dimensions` input layer and an identically sized
48 |   hidden layer. This practically means that both layers are represented as matrices of doubles and we need to run operations on
49 |   parts of these matrices for each word in the corpus. I started trying to use
50 |   [hmatrix](http://hackage.haskell.org/package/hmatrix) to compute matrix operations directly on the layers, as is done in the
51 |   [python port](https://github.com/piskvorky/gensim/blob/develop/gensim/models/word2vec.py) of word2vec (at least in the
52 |   inefficient training algorithm...). But 99% of the time was spent updating the global matrix using the computations from a
53 |   submatrix. I tried to [profile my code](http://www.haskell.org/ghc/docs/latest/html/users_guide/profiling.html) and started
54 |   sprinkling strictness annotations around but was not able to significantly improve the situation. So I went for a simpler and
55 |   uglier solution: replicate the C loops using [mutable arrays](http://hackage.haskell.org/package/array) in the IO monad. Given
56 |   the obfuscated nature of the C code, this was not an easy task but I finally managed to get something working with at least
57 |   acceptable performance (for testing purposes, of course...). I toyed with the idea of delving into the
58 |   [repa](http://hackage.haskell.org/package/repa) library but finally gave up due to lack of time.
59 | * A surprisingly difficult task was slicing the input sequence of words into random "windows" uniformly distributed around some
60 |   mean value (10) in order to retrieve the central word of the window (the training word) and the list of words around it. I
61 |   devoted a complete module [Window.hs](Window.hs) to this task and it is the best unit-tested of all the modules in this code.
62 | * Speaking of tests, while in my daily work I am a big fan and champion of
63 |   [TDD](https://en.wikipedia.org/wiki/Test-driven_development), I admit I forwent any attempt whatsoever to do TDD on this
64 |   project and simply relied on types, the REPL and manual coding to get something working. The few existing tests use
65 |   [doctest](http://hackage.haskell.org/package/doctest) to embed small text-based unit tests in the documentation of functions.
66 | * The final part of the challenge required applying PCA on the trained model to extract a 2D mapping of words and generating a graph
67 |   from this mapping. I used [hmatrix-nipals](http://hackage.haskell.org/package/hmatrix-nipals) to do PCA and
68 |   [Chart](http://hackage.haskell.org/package/Chart) with a [diagrams](http://hackage.haskell.org/package/Chart-diagrams)
69 |   backend. The only complex task in this last step was cleaning up the mess with cabal after a `--force-reinstalls` due to some
70 |   mismatch in obscure semigroups- and category-related packages, which was quite painful until I discovered that `cabal-dev` has
71 |   been integrated into `cabal` as
72 |   [cabal sandbox](http://www.haskell.org/cabal/users-guide/installing-packages.html#developing-with-sandboxes).
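To give a flavour of the heap-based Huffman construction mentioned above, here is a minimal,
self-contained sketch of the idea. It only builds the tree from word frequencies; the actual
`Huffman` module in `src` additionally tracks word indices and the inner-node "points" needed
for hierarchical softmax, so treat this as an illustration (with `HTree`, `weight` and
`huffmanTree` being names made up for the example), not the real implementation:

```haskell
import qualified Data.Heap as H
import Data.HashMap.Strict (HashMap, toList)

data HTree = Leaf String Int | Node HTree HTree Int deriving Show

weight :: HTree -> Int
weight (Leaf _ f)   = f
weight (Node _ _ f) = f

-- Repeatedly merge the two lightest trees until a single tree remains.
huffmanTree :: HashMap String Int -> Maybe HTree
huffmanTree freqs = go (H.fromList [ (f, Leaf w f) | (w, f) <- toList freqs ])
  where
    go :: H.MinPrioHeap Int HTree -> Maybe HTree
    go h = case H.view h of
      Nothing           -> Nothing
      Just ((_, t), h') -> case H.view h' of
        Nothing            -> Just t
        Just ((_, u), h'') ->
          let m = Node t u (weight t + weight u)
          in go (H.insert (weight m, m) h'')
```

Codes are then read off the tree by walking from the root, emitting a `Zero` for a left branch
and a `One` for a right branch, which is what the `encode` function in `src/Huffman.hs` does.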
73 | 
74 | ## About Parallelization
75 | 
76 | # Conclusion
77 | 
78 | This was fun. And challenging. The most challenging part was trying to understand the training algorithm. The python code was very
79 | helpful as it is thoroughly commented and well written. I would not say the same thing about the C code. YMMV of course...
80 | 
81 | I learnt a thing or two:
82 | 
83 | * Haskell is really, really a great language. I want and need to do more of it!
84 | * Its eco-system is complex, with lots of half-baked and underdocumented libraries, but one can do anything with it, most of the
85 |   time using beautiful and concise code, sometimes using other forms of beauty...
86 | * Machine learning is really, really fun. While working at [Polyspot](http://www.polyspot.com/en/) I touched a little bit on this
87 |   subject but did not have the opportunity to code anything interesting in this domain. I read
88 |   [Introduction to Information Retrieval](http://nlp.stanford.edu/IR-book/information-retrieval-book.html) to gain a better
89 |   knowledge of the subject but have never applied it in the field.
90 | 
--------------------------------------------------------------------------------
/bench/MainBenchmark.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 | 
3 | import Criterion.Main
4 | import PCABench
5 | import System.Random
6 | 
7 | main :: IO ()
8 | main = defaultMain benchmarks
9 | 
10 | benchmarks :: [Benchmark]
11 | benchmarks =
12 |   [bgroup "PCA Fast for top 1 component"
13 |    [ bench "PCA 1/5" $ nfIO $ computeFastPCA 1 5
14 |    , bench "PCA 1/10" $ nfIO $ computeFastPCA 1 10
15 |    , bench "PCA 1/50" $ nfIO $ computeFastPCA 1 50
16 |    , bench "PCA 1/100" $ nfIO $ computeFastPCA 1 100
17 |    , bench "PCA 1/500" $ nfIO $ computeFastPCA 1 500
18 |    , bench "PCA 1/1000" $ nfIO $ computeFastPCA 1 1000
19 |    ]
20 |   , bgroup "PCA Fast for top 2 components"
21 |    [ bench "PCA 2/5" $ nfIO $ computeFastPCA 2 5
22 |    , bench "PCA 2/10" $ nfIO $ computeFastPCA 2 10
23 |    , bench "PCA 2/50" $ nfIO $ computeFastPCA 2 50
24 |    , bench "PCA 2/100" $ nfIO $ computeFastPCA 2 100
25 |    , bench "PCA 2/500" $ nfIO $ computeFastPCA 2 500
26 |    , bench "PCA 2/1000" $ nfIO $ computeFastPCA 2 1000
27 |    ]
28 |   , bgroup "PCA SVD for top 1 component"
29 |    [ bench "PCA 1/5" $ nfIO $ computePCASVD 1 5
30 |    , bench "PCA 1/10" $ nfIO $ computePCASVD 1 10
31 |    , bench "PCA 1/50" $ nfIO $ computePCASVD 1 50
32 |    , bench "PCA 1/100" $ nfIO $ computePCASVD 1 100
33 |    , bench "PCA 1/500" $ nfIO $ computePCASVD 1 500
34 |    , bench "PCA 1/1000" $ nfIO $ computePCASVD 1 1000
35 |    ]
36 |   , bgroup "PCA SVD for top 2 components"
37 |    [ bench "PCA 2/5" $ nfIO $ computePCASVD 2 5
38 |    , bench "PCA 2/10" $ nfIO $ computePCASVD 2 10
39 |    , bench "PCA 2/50" $ nfIO $ computePCASVD 2 50
40 |    , bench "PCA 2/100" $ nfIO $ computePCASVD 2 100
41 |    , bench "PCA 2/500" $ nfIO $ computePCASVD 2 500
42 |    , bench "PCA 2/1000" $ nfIO $ computePCASVD 2 1000
43 |    ]
44 |   ]
45 | 
46 | 
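-- An aside on the pattern above (editorial note, not part of the repository):
-- criterion only measures what it can force, and `nfIO` evaluates the resulting
-- `Matrix Double` to normal form so that laziness does not hide the cost of the
-- PCA computation. A hypothetical extra group following the same pattern
-- (`benchConvert` is an invented name, it does not exist in the sources) would be:
--
-- benchConvert :: Benchmark
-- benchConvert = bgroup "PCA SVD after model conversion"
--   [ bench "PCA 2/50x100" $ nfIO $ computePCAAndConvertModel 50 100 ]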
--------------------------------------------------------------------------------
/bench/PCABench.hs:
--------------------------------------------------------------------------------
1 | module PCABench where
2 | 
3 | import Model
4 | import Model.Repa
5 | import Model.Types
6 | import Numeric.LinearAlgebra
7 | import PCA
8 | 
9 | type NumberOfComponents = Int
10 | type SizeOfMatrix = Int
11 | type NumberOfRows = Int
12 | type NumberOfColumns = Int
13 | 
14 | computePCA :: SizeOfMatrix -> IO (Matrix Double)
15 | computePCA n = do
16 |   mat <- randn n n
17 |   let pca n m = pcaMat where
18 |         (pcaMat, _, _,_) = pcaSVD n m
19 |   return $ pca'' 2 mat pca
20 | 
21 | 
22 | computePCASVD :: NumberOfComponents -> SizeOfMatrix -> IO (Matrix Double)
23 | computePCASVD c n = do
24 |   mat <- randn n n
25 |   let pca n m = pcaMat where
26 |         (pcaMat, _, _,_) = pcaSVD n m
27 |   return $ pca'' c mat pca
28 | 
29 | computeFastPCA :: NumberOfComponents -> SizeOfMatrix -> IO (Matrix Double)
30 | computeFastPCA c n = do
31 |   mat <- randn n n
32 |   let pca c m = pcaMat where
33 |         pcaMat = fromRows $ fastPCA c m
34 |   return $ pca'' c mat pca
35 | 
36 | computePCAAndConvertModel :: NumberOfRows -> NumberOfColumns -> IO (Matrix Double)
37 | computePCAAndConvertModel n m = do
38 |   mat <- syn0 <$> model n m
39 |   let pca n m = pcaMat where
40 |         (pcaMat, _, _,_) = pcaSVD n m
41 |   return $ pca'' 2 (toMatrix n m mat) pca
42 | 
--------------------------------------------------------------------------------
/main/analyze.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 | 
3 | import Control.Monad (foldM, when)
4 | import Data.List (isSuffixOf)
5 | import Prelude hiding (readFile)
6 | import System.Directory (getDirectoryContents)
7 | import System.Environment (getArgs)
8 | import System.FilePath ((</>))
9 | import System.IO (BufferMode (..), hSetBuffering, stdout)
10 | import System.IO.UTF8 (readFile)
11 | 
12 | 
13 | import Crawl
14 | import Display
15 | import Model
16 | import Words
17 | 
18 | pca :: String -> IO [(String, Double, Double)]
19 | pca file = analyze file >>= pcaAnalysis
20 | 
21 | analyze :: String -> IO Model
22 | analyze file = do
23 |   content <- readFile file >>= return.tokenizeString
24 |   dict <- tokenizeFiles [file]
25 |   trainModel 0 dict [content]
26 | 
27 | trainFiles :: [String] -> IO Model
28 | trainFiles txts = do
29 |   dict <- tokenizeFiles txts
30 |   putStrLn $ "Encoded " ++ (show $ dictionaryLength dict) ++ " words, dim="++ (show $ encodingLength dict)
31 |   contents <- mapM (\ f -> readFile f >>= return. tokenizeString) txts
32 |   let numberOfFiles = length contents
33 | 
34 |   putStrLn $ "Training " ++ (show numberOfFiles) ++ " files"
35 |   trainModel numberOfFiles dict contents
36 | 
37 | analyzeDirectory :: String -> IO Model
38 | analyzeDirectory dir = do
39 |   txts <- getDirectoryContents dir >>= return.filter (isSuffixOf ".txt")
40 |   trainFiles (map (dir </>) txts)
41 | 
42 | main :: IO ()
43 | main = do
44 |   args <- getArgs
45 |   let dir = case args of
46 |              (x:_) -> x
47 |              []    -> "."
48 |   putStrLn $ "analyzing directory "++ dir
49 |   hSetBuffering stdout NoBuffering
50 |   m <- analyzeDirectory dir
51 |   p <- pcaAnalysis m
52 |   when (length p /= (numberOfWords m))
53 |        (fail $ "PCA should have same number of words as the model: "++ (show $ length p) ++ " vs. " ++ (show $ numberOfWords m))
54 | 
55 |   let modelFile = (dir </> "model.vec")
56 |   let pcaFile = (dir </> "model.pca")
57 |   putStrLn $ "Writing model to file "++ modelFile
58 |   writeFile modelFile (show m)
59 |   putStrLn $ "Writing PCA to file " ++ pcaFile
60 |   writeFile pcaFile (show p)
61 | 
62 | 
--------------------------------------------------------------------------------
/main/arxiv.hs:
--------------------------------------------------------------------------------
1 | import Crawl
2 | 
3 | main :: IO ()
4 | main = do
5 |   pdfs <- downloadPDFs
6 |   txts <- (mapM convertToText pdfs >>= return.filter (/= []))
7 |   putStrLn $ "Done crawling " ++ (show $ length txts) ++ " documents"
--------------------------------------------------------------------------------
/main/word2vec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric              #-}
2 | {-# LANGUAGE FlexibleContexts           #-}
3 | {-# LANGUAGE FlexibleInstances          #-}
4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
5 | {-# LANGUAGE MultiParamTypeClasses      #-}
6 | {-# LANGUAGE OverloadedStrings          #-}
7 | {-# LANGUAGE UndecidableInstances       #-}
8 | {-# LANGUAGE ViewPatterns               #-}
9 | 
10 | import Control.Monad (when)
11 | import Control.Monad.Reader
12 | import Control.Monad.Trans (MonadIO, lift,
13 |                             liftIO)
14 | import Data.Aeson
15 | import qualified Data.ByteString.Lazy as BS
16 | import qualified Data.ByteString.Lazy.Char8 as BS8
17 | import Data.Functor (void)
18 | import Display
19 | import Graphics.Rendering.Chart.Backend.Diagrams
20 | import Log
21 | import Model
22 | import Model.IO
23 | import Model.Repa
24 | import Model.Types
25 | import Options.Generic
26 | import Prelude hiding (readFile)
27 | import System.Directory (doesFileExist)
28 | import System.FilePath ((</>))
29 | import System.IO (BufferMode (..),
30 |                   hSetBuffering,
31 |                   readFile, stdout)
32 | 
33 | instance Progress (ReaderT Int IO) where
34 |   progress lvl m = do
35 |     logLevel <- ask
36 |     when (fromEnum lvl <= logLevel) $ liftIO (BS8.putStrLn $ encode m)
37 | 
38 | newtype Step m a = Step { runStep :: m a }
39 |   deriving (Functor, Applicative, Monad, MonadIO)
40 | 
41 | instance MonadTrans Step where
42 |   lift = Step
43 | 
44 | mapStep :: (m a -> m b) -> Step m a -> Step m b
45 | mapStep f (Step m) = Step $ f m
46 | 
47 | instance (MonadReader a m) => MonadReader a (Step m) where
48 |   reader = lift . reader
49 |   local = mapStep . local
50 | 
51 | instance (MonadIO m, MonadReader Int m) => Progress (Step m) where
52 |   progress lvl m = Step $ do
53 |     logLevel <- ask
54 |     when (fromEnum lvl <= logLevel) $ liftIO $ do
55 |       BS8.putStrLn $ encode m
56 |       void getLine
57 | 
58 | data Config = Config { corpusDirectory :: FilePath
59 |                      , verbosity :: Int
60 |                      , stepByStep :: Bool
61 |                      , numberOfFeatures :: Int
62 |                      , selectedWords :: [ String ]
63 |                      } deriving (Generic)
64 | 
65 | instance ParseRecord Config
66 | 
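-- A usage sketch (an editorial addition, not part of the original file):
-- `getRecord` derives a command-line parser from the `Config` record via
-- optparse-generic, so, assuming its default field-to-flag mapping and that the
-- executable is called `word2vec`, an invocation looks roughly like:
--
-- > word2vec --corpusDirectory corpus --verbosity 1 --numberOfFeatures 200 \
-- >          --selectedWords cloud --selectedWords data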
67 | defaultConfig :: [String] -> Config
68 | defaultConfig = Config "." 0 False 100
69 | 
70 | main :: IO ()
71 | main = do
72 |   config <- getRecord "Word2Vec Trainer"
73 |   go config
74 | 
75 | go :: Config -> IO ()
76 | go c@(stepByStep -> False) = runReaderT (runAnalysis c) (verbosity c)
77 | go c@(stepByStep -> True) = runReaderT (runStep $ runAnalysis c) (verbosity c)
78 | 
79 | runAnalysis :: (MonadIO m, MonadReader Int m, Progress m) => Config -> m ()
80 | runAnalysis config = do
81 |   let dir = corpusDirectory config
82 |       modelFile = dir </> "model.vec"
83 |       pcaFile = dir </> "model.pca"
84 |       diagramFile = dir </> "model.svg"
85 | 
86 |   liftIO $ hSetBuffering stdout NoBuffering
87 | 
88 |   hasModel <- liftIO $ doesFileExist modelFile
89 | 
90 |   m <- if hasModel
91 |        then do
92 |          progress Coarse $ LoadingModelFile modelFile
93 |          read `fmap` liftIO (readFile modelFile)
94 |        else do
95 |          progress Coarse $ AnalyzingDirectory dir
96 |          analyzeDirectory (numberOfFeatures config) (corpusDirectory config)
97 | 
98 |   let p = pcaAnalysis m
99 |       top100 = mostFrequentWords 100 m
100 |       chart = drawSelectedWords p (if null (selectedWords config) then top100 else selectedWords config)
101 |   when (length p /= numberOfWords m)
102 |     (fail $ "PCA should have same number of words as the model: "++ show (length p) ++ " vs. " ++ show (numberOfWords m))
103 | 
104 |   progress Coarse $ WritingModelFile modelFile
105 |   liftIO $ writeFile modelFile (show m)
106 | 
107 |   progress Coarse $ WritingPCAFile pcaFile
108 |   liftIO $ writeFile pcaFile (show p)
109 | 
110 |   progress Coarse $ WritingDiagram diagramFile (selectedWords config)
111 | 
112 |   (bs, _) <- liftIO $ renderableToSVGString chart 1000 1000
113 |   liftIO $ BS.writeFile diagramFile bs
114 | 
115 |   progress Coarse Done
116 | 
117 | 
--------------------------------------------------------------------------------
/src/Concurrent.hs:
--------------------------------------------------------------------------------
1 | module Concurrent(runInThreadPool) where
2 | 
3 | import Data.Functor(void)
4 | import Control.Monad(replicateM)
5 | import Control.Concurrent.MVar(newEmptyMVar,
6 |                                takeMVar,
7 |                                putMVar,
8 |                                MVar)
9 | import Control.Concurrent(forkIO,
10 |                           killThread)
11 | 
12 | 
13 | -- | Continuously executes an action, feeding it arguments from a channel.
14 | doDownload :: (a -> IO b) -- ^Action to run
15 |            -> MVar a     -- ^A "channel" to retrieve arguments
16 |            -> MVar b     -- ^A "channel" to feed results
17 |            -> IO ()
18 | doDownload action inChan outChan = do
19 |   docId <- takeMVar inChan
20 |   f <- action docId
21 |   putMVar outChan f
22 |   doDownload action inChan outChan
23 | 
24 | -- |Run a thread pool for executing concurrent computations.
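--
-- A hypothetical usage sketch (an added example, not part of the original docs):
--
-- > files <- runInThreadPool 10 paperIds downloadPDF
--
-- Note that results are collected with `takeMVar` as they are produced, so they
-- arrive in completion order, which need not match the order of the inputs.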
25 | runInThreadPool :: Int         -- ^number of threads to run concurrently
26 |                 -> [ a ]       -- ^list of inputs to process
27 |                 -> (a -> IO b) -- ^Action to run in threads
28 |                 -> IO [ b ]    -- ^list of action results
29 | runInThreadPool numThreads ids compute = do
30 |   inChan <- newEmptyMVar
31 |   outChan <- newEmptyMVar
32 |   tids <- replicateM numThreads (forkIO $ doDownload compute inChan outChan)
33 |   void $ forkIO $ mapM_ (putMVar inChan) ids
34 |   files <- mapM (const $ takeMVar outChan) ids
35 |   mapM_ killThread tids
36 |   return files
37 | 
38 | 
39 | 
--------------------------------------------------------------------------------
/src/Crawl.hs:
--------------------------------------------------------------------------------
1 | module Crawl where
2 | 
3 | import Control.Arrow ((&&&))
4 | import Control.Monad (when)
5 | import qualified Data.ByteString.Lazy as L
6 | import Data.List (isPrefixOf, isSuffixOf)
7 | import Data.Maybe (fromJust)
8 | import Network.Browser (browse, request, setCookies)
9 | import Network.HTTP (RequestMethod (..), getRequest,
10 |                      mkRequest, rspBody, simpleHTTP)
11 | import Network.URI (URI, parseURI)
12 | import System.Exit (ExitCode (..))
13 | import System.FilePath (splitExtension)
14 | import System.Posix.Files (fileExist)
15 | import System.Process (system)
16 | import Text.HTML.TagSoup (Tag (..), fromAttrib, isTagOpenName,
17 |                           parseTags)
18 | import Text.Regex.TDFA ((=~))
19 | 
20 | import Concurrent
21 | 
22 | -- |Filter prev/next links.
23 | prevNext :: [ Tag String ] -> (Maybe String, Maybe String)
24 | prevNext = locatePrevNext (Nothing, Nothing)
25 |   where
26 |     locatePrevNext res [] = res
27 |     locatePrevNext (p,n) (tag@(TagOpen "a" _): TagText t: rest) | isSuffixOf "Prev" t = locatePrevNext (Just $ fromAttrib "href" tag, n) rest
28 |                                                                 | isPrefixOf "Next" t = locatePrevNext (p, Just $ fromAttrib "href" tag) rest
29 |     locatePrevNext res (_:ts) = locatePrevNext res ts
30 | 
31 | -- |Filter all href links in a page.
32 | papers :: [ Tag String ] -> [ String ]
33 | papers = filter (isPrefixOf "paper.jsp").map (fromAttrib "href").filter (isTagOpenName "a")
34 | 
35 | -- | Collect all paper links on a page and the next/prev links, if they exist
36 | collectPage :: [ Tag String ] -> ([ String ], (Maybe String,Maybe String))
37 | collectPage = papers &&& prevNext
38 | 
39 | -- | Generate paper ID from paper link.
40 | --
41 | -- >>> paperId "paper.jsp?r=cs/9605101&qid=13871620873749a_nCnN_-288443966&qs=%22big+data%22+OR+cloud+OR+%22machine+learning%22+OR+%22artificial+intelligence%22+OR+%22distributed+computing%22"
42 | -- "cs/9605101"
43 | paperId :: String -> String
44 | paperId link = case link =~ "paper.jsp.r=([^&]+)&.*" :: (String,String,String,[String]) of
45 |                 (_,_,_,x:_) -> x
46 |                 _           -> ""
47 | 
48 | -- |Get first page of query.
49 | firstPage :: IO String
50 | firstPage = body "http://search.arxiv.org:8081/?query=%22big+data%22+OR+cloud+OR+%22machine+learning%22+OR+%22artificial+intelligence%22+OR+%22distributed+computing%22&qid=13871620873749a_nCnN_-288443966&startat=40"
51 | 
52 | -- |Follow all `Prev` links till beginning of search and collect paper links.
53 | previousPages :: ([ String ], (Maybe String,Maybe String)) -> IO [ String ]
54 | previousPages (uris, (Nothing, _)) = return uris
55 | previousPages (uris, (Just p, _)) = do
56 |   b <- body $ "http://search.arxiv.org:8081/" ++ p
57 |   let (uris', np) = collectPage $ parseTags b
58 |   previousPages (uris ++ uris', np)
59 | 
60 | 
61 | -- |Follow all `Next` links till end of search and collect paper links.
62 | nextPages :: ([ String ], (Maybe String,Maybe String)) -> IO [ String ]
63 | nextPages (uris, (_, Nothing)) = return uris
64 | nextPages (uris, (_, Just p)) = do
65 |   b <- body $ "http://search.arxiv.org:8081/" ++ p
66 |   let (uris', np) = collectPage $ parseTags b
67 |   nextPages (uris ++ uris', np)
68 | 
69 | -- | Get body of request, ignoring all cookies but following redirections.
70 | body :: String -> IO String
71 | body uri = browse $ setCookies [] >> (request (getRequest uri)) >>= (return.rspBody.snd)
72 | 
73 | 
74 | -- |Collect all paper ids
75 | allPaperIds :: IO [ String ]
76 | allPaperIds = do
77 |   f <- firstPage
78 |   let page = collectPage $ parseTags f
79 |   prev <- previousPages page
80 |   next <- nextPages ([], snd page)
81 |   return $ map paperId $ prev ++ next
82 | 
83 | -- |Construct an URI to a paper's PDF from an id
84 | pdfURI :: String -> URI
85 | pdfURI docId = fromJust $ parseURI $ "http://arxiv.org/pdf/" ++ docId ++ "v1.pdf"
86 | 
87 | -- |Download single PDF
88 | --
89 | -- Throws an error if the download fails, returns the filename otherwise.
90 | downloadPDF :: String    -- ^Id of document to download
91 |             -> IO String -- ^File where document has been downloaded to
92 | downloadPDF docId = do
93 |   resp <- simpleHTTP (mkRequest GET $ pdfURI docId)
94 |   let b = rspBody $ (\ (Right r) -> r) resp
95 |   let f = filename docId
96 |   e <- fileExist f
97 |   when (not e) $ L.writeFile f b
98 |   return f
99 |     where
100 |       filename did = map replaceChars did ++ ".pdf"
101 |       replaceChars '/' = '_'
102 |       replaceChars c = c
103 | 
104 | -- |Download all PDFs of papers
105 | downloadPDFs :: IO [ String ]
106 | downloadPDFs = do
107 |   ids <- allPaperIds
108 |   runInThreadPool 10 ids downloadPDF
109 | 
110 | -- | Convert a PDF file to text.
111 | --
112 | -- This assumes `pdftotext` is available in the PATH.
113 | convertToText :: String    -- file path
114 |               -> IO String -- Converted file
115 | convertToText pdf = do
116 |   let txt = fst (splitExtension pdf) ++ ".txt"
117 |   exit <- system $ "pdftotext " ++ pdf
118 |   case exit of
119 |     ExitSuccess -> return txt
120 |     ExitFailure _ -> return ""
121 | 
--------------------------------------------------------------------------------
/src/Display.hs:
--------------------------------------------------------------------------------
1 | -- | Display a model in 2D after PCA analysis.
2 | --
3 | -- Principal component analysis extracts from a matrix the "most significant dimensions",
4 | -- that is the dimensions which have the highest correlations with the other dimensions. Or at
5 | -- least this is what I understood...
6 | -- We extract the two main principal components from the feature matrix of a model and generate
7 | -- a 2D picture of the most frequent words from the dictionary.
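--
-- As a rough illustration of what the PCA step computes (an added summary of the
-- code in the PCA module, not an authoritative definition): given the mean m and
-- the covariance of the data matrix, `pcaSVD` keeps the eigenvectors V with the
-- largest eigenvalues and maps each row x to the 2-element vector V (x - m),
-- which yields the (x,y) coordinates drawn by this module.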
8 | module Display(drawSelectedWords, pcaAnalysis) where
9 | 
10 | -- Module containing code for PCA computation
11 | import Control.Lens
12 | import Data.Colour
13 | import Data.Colour.Names
14 | import Data.Default.Class
15 | import qualified Data.HashMap.Strict as H
16 | import Graphics.Rendering.Chart
17 | import Model.Repa
18 | import Model.Types
19 | import PCA
20 | import Words.Dictionary
21 | 
22 | 
23 | type WordPoints = H.HashMap String [Double]
24 | 
25 | -- | Compute 2D mapping of words from a model.
26 | --
27 | -- We first transform the syn0 values of the model into a Matrix of doubles,
28 | -- then compute the first 2 principal components from this matrix. These are zipped with each corresponding
29 | -- word from the vocabulary to produce a vector of tuples with coordinates.
30 | pcaAnalysis :: Model -> WordPoints
31 | pcaAnalysis m =
32 |   let matrix = toMatrix (numberOfWords m) (modelSize m) (syn0 m)
33 |       pcf = pca''' 2 matrix fastPCA
34 |       indexedWords = orderedWords (vocabulary m)
35 |       pcs = toLists pcf
36 |       wordsAndVecs = H.fromList $ zip indexedWords pcs
37 |   in wordsAndVecs
38 | 
39 | -- |Draw a chart of the X most frequent words in a model using PCA dimensions.
40 | drawSelectedWords :: WordPoints    -- ^Result of PCA analysis from model
41 |                   -> [String]      -- ^Selected words to plot
42 |                   -> Renderable () -- ^The output from Chart
43 | drawSelectedWords vectors selectedWords = let
44 |   coord [x,y] = (x * 1000,y * 1000)
45 |   coord e = error $ "invalid coordinates for point " ++ show e
46 | 
47 |   points = plot_points_style .~ filledCircles 2 (opaque red)
48 |          $ plot_points_values .~ (map (coord . (vectors H.!)) selectedWords)
49 |          $ def
50 | 
51 |   labels = plot_annotation_values .~ [(x * 1000 + 0.01,y * 1000 + 0.01,l) | (l,x:y:_) <- H.toList (H.filterWithKey (\ k _ -> k `elem` selectedWords) vectors) ]
52 |          $ def
53 | 
54 |   layout = layout_title .~ "Words Vector Space"
55 |          $ layout_plots .~ [toPlot points, toPlot labels]
56 |          $ def
57 |   in
58 |     toRenderable layout
59 | 
--------------------------------------------------------------------------------
/src/Huffman.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric        #-}
2 | {-# LANGUAGE FlexibleContexts     #-}
3 | {-# LANGUAGE FlexibleInstances    #-}
4 | {-# LANGUAGE TypeSynonymInstances #-}
5 | -- | Assign codes and inner layer to each word according to Huffman coding
6 | module Huffman(Code(..), unCode,
7 |                Bin(..),
8 |                Coding(..),
9 |                huffmanEncode,
10 |                asNum) where
11 | 
12 | import Data.Aeson hiding (encode)
13 | import Data.Char (isDigit, isSpace)
14 | import Data.HashMap.Strict (HashMap, empty, insert, toList)
15 | import qualified Data.Heap as H
16 | import Data.List (unfoldr)
17 | import Data.Maybe (fromJust)
18 | import GHC.Generics
19 | import Text.ParserCombinators.ReadP
20 | 
21 | data Bin = Zero | One deriving (Eq, Ord, Show, Read)
22 | 
23 | newtype Code = Code { unCode :: [Bin] } deriving (Eq)
24 | 
25 | instance ToJSON Code where
26 |   toJSON code = Number $ fromIntegral $ fromEnum code
27 | 
28 | instance FromJSON Code where
29 |   parseJSON (Number n) = return $ toEnum $ truncate n
30 |   parseJSON e = error $ "failed to parse JSON " ++ show e ++ " as Code instance"
31 | 
32 | asNum :: (Num a) => Bin -> a
33 | asNum Zero = 0
34 | asNum One = 1
35 | 
36 | instance Enum Code where
37 |   -- Codes are LSB first (see `Coding` below), so no reversal here: this keeps
38 |   -- `fromEnum . toEnum` an identity, which the JSON round-trip relies on.
39 |   toEnum = Code . unfoldr encodeBin
40 |     where
41 |       encodeBin 0 = Nothing
42 |       encodeBin n = case n `mod` 2 of
43 |                      0 -> Just (Zero, n `div` 2)
44 |                      _ -> Just (One , n `div` 2)
45 | 
46 |   fromEnum (Code c) = fromEnum' c
47 |     where
48 |       fromEnum' [] = 0
49 |       fromEnum' (Zero:b) = 2 * fromEnum' b
50 |       fromEnum' (One:b) = 2 * fromEnum' b + 1
51 | 
52 | instance Show Code where
53 |   show (Code digits) = show' digits
54 |     where
55 |       show' (Zero:cs) = '0':show' cs
56 |       show' (One:cs) = '1':show' cs
57 |       show' [] = []
58 | 
59 | instance Read Code where
60 |   readsPrec _ s = [ readCode s [] ]
61 |     where
62 |       readCode ('1':cs) c = readCode cs (One : c)
63 |       readCode ('0':cs) c = readCode cs (Zero : c)
64 |       readCode r c = (Code $ reverse c, r)
65 | 
66 | data Coding = Coding {
67 |   -- Index of word in corpus
68 |   index :: Int,
69 | 
70 |   -- Frequency of word in corpus
71 |   frequency :: Int,
72 | 
73 |   -- Huffman encoding of a word, LSB first
74 |   huffman :: Code,
75 | 
76 |   -- List of indices of path from root to word in encoding
77 |   -- The indices are built in such a way that most frequent words have smaller indices
78 |   wordPoints :: [Int]
79 |   } deriving (Eq, Generic, Show)
80 | 
81 | instance Read Coding where
82 |   readsPrec _ = readP_to_S codingParser
83 | 
84 | codingParser = do
85 |   string "Coding" >> spaces >> char '{' >> spaces
86 |   idx <- string "index" >> spaces >> char '=' >> spaces >> number
87 |   frq <- spaces >> char ',' >> spaces >> string "frequency" >> spaces >> char '=' >> spaces >> number
88 |   cod <- spaces >> char ',' >> spaces >> string "huffman" >> spaces >> char '=' >> spaces >> coder
89 |   pts <- spaces >> char ',' >> spaces >> string "wordPoints" >> spaces >> char '=' >> spaces >> points
90 |   spaces >> char '}'
91 |   return $ Coding idx frq cod pts
92 |     where
93 |       spaces = munch isSpace
94 |       number = read <$> munch1 isDigit
95 |       coder = readS_to_P (readsPrec 9)
96 |       points = readS_to_P (readsPrec 8) :: ReadP [Int]
97 | 
98 | 
99 | instance ToJSON Coding
100 | instance FromJSON Coding
101 | 
102 | huffmanEncode :: HashMap String Int -> HashMap String Coding
103 | huffmanEncode = encode . arborify . heapify
104 | 
105 | instance Ord Coding where
106 |   compare (Coding _ f _ _ ) (Coding _ f' _ _ ) = compare f f'
107 | 
108 | buildWord :: ([Huffman],Int) -> (String, Int) -> ([Huffman],Int)
109 | buildWord (ws, n) (w,f) = ((Leaf w (Coding n f (Code []) [])):ws, n+1)
110 | 
111 | -- |Returns the list of words stored in given heap in ascending order.
112 | ascWords :: H.MinHeap Huffman -> [ String ]
113 | ascWords = map unWord . H.toAscList
114 |   where
115 |     unWord (Leaf w _) = w
116 | 
117 | -- |Build a heap from hashmap of words frequencies.
118 | --
119 | -- The heap is built with the frequency as ordering factor. Each word is built into a `Word`
120 | -- object that contains the frequency, the index of the word relative to size of vocabulary.
121 | --
122 | -- >>> ascWords $ heapify (fromList [("foo",3), ("bar",2)])
123 | -- ["bar","foo"]
124 | heapify :: HashMap String Int -> H.MinHeap Huffman
125 | heapify = foldl (flip H.insert) H.empty . fst . foldl buildWord ([],0) . toList
126 | 
127 | data Huffman = Node Huffman Huffman Coding
128 |              | Leaf String Coding
129 |              deriving (Eq,Show)
130 | 
131 | freq :: Huffman -> Int
132 | freq (Node _ _ (Coding _ f _ _)) = f
133 | freq (Leaf _ (Coding _ f _ _)) = f
134 | 
135 | instance Ord Huffman where
136 |   compare (Node _ _ c) (Node _ _ c') = compare c c'
137 |   compare (Node _ _ c) (Leaf _ c')   = compare c c'
138 |   compare (Leaf _ c) (Node _ _ c')   = compare c c'
139 |   compare (Leaf _ c) (Leaf _ c')     = compare c c'
140 | 
141 | 
142 | -- | Build a tree from heap with only words
143 | --
144 | arborify :: H.MinHeap Huffman -> H.MinHeap Huffman
145 | arborify h = foldl buildTree h [0.. sizeOfVocabulary -1]
146 |   where
147 |     sizeOfVocabulary = H.size h
148 |     buildTree h n = let (min1,h1) = fromJust $ H.view h
149 |                     in case H.view h1 of
150 |                         Just (min2,h2) -> H.insert (Node min1 min2
151 |                                                     (Coding n
152 |                                                      (freq min1 + freq min2)
153 |                                                      (Code [])
154 |                                                      [])) h2
155 |                         Nothing -> h
156 | 
157 | 
158 | encode :: H.MinHeap Huffman -> HashMap String Coding
159 | encode h = encode' (fst $ fromJust $ H.view h) [] [] empty
160 |   where
161 |     encode' (Leaf w c) code points map = insert w c { huffman = Code code, wordPoints = points } map
162 |     encode' (Node left right c) code points map = let pts = index c : points
163 |                                                       m1 = encode' left (Zero:code) pts map
164 |                                                   in
165 |                                                     encode' right (One:code) pts m1
166 | 
167 | -- # Tests
168 | 
169 | -- | test Enum implementation
170 | --
171 | -- >>> toEnum 1 :: Code
172 | -- 1
173 | -- >>> toEnum 2 :: Code
174 | -- 01
175 | -- >>> toEnum 5 :: Code
176 | -- 101
177 | testToEnum :: Int -> Code
178 | testToEnum = toEnum
179 | 
180 | -- | test Enum implementation
181 | --
182 | -- >>> fromEnum (Code [Zero])
183 | -- 0
184 | -- >>> fromEnum (Code [One])
185 | -- 1
186 | -- >>> fromEnum (Code [Zero,One])
187 | -- 2
188 | -- >>> fromEnum (Code [Zero,One,Zero,One])
189 | -- 10
190 | testFromEnum :: Code -> Int
191 | testFromEnum = fromEnum
192 | 
--------------------------------------------------------------------------------
/src/Log.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric             #-}
2 | {-# LANGUAGE ExistentialQuantification #-}
3 | {-# LANGUAGE FlexibleInstances         #-}
4 | {-# LANGUAGE GADTs                     #-}
5 | {-# LANGUAGE RankNTypes                #-}
6 | {-# LANGUAGE TypeSynonymInstances      #-}
7 | -- | Provides structured logging and report on training process.
8 | --
9 | -- This module mainly exposes a type for `Message`s that are output by `word2vec` when
10 | -- working on input and training the underlying model and an interface mechanism for
11 | -- publishing those messages using any underlying `MonadIO` instance.
12 | module Log where
13 | 
14 | import Control.Monad.Trans (MonadIO)
15 | import Data.Aeson
16 | import Data.Time.Clock
17 | import GHC.Generics
18 | import Model.Types
19 | import Words.Dictionary
20 | 
21 | -- |All types of messages emitted by the application while working.
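--
-- Since `Message` derives `Generic` and gets its `ToJSON` instance generically,
-- each progress event is emitted as one JSON object per line; assuming Aeson's
-- default generic encoding for sum types, a constructor such as
-- `AnalyzingDirectory "corpus"` would appear roughly as
-- {"tag":"AnalyzingDirectory","contents":"corpus"}.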
22 | data Message = AnalyzingDirectory FilePath 23 | | EncodedDictionary Dictionary 24 | | TokenizingFiles Int 25 | | TokenizingFile FilePath 26 | | TokenizedFile FilePath [ String ] 27 | | TokenizedFiles [[String]] 28 | | WritingModelFile FilePath 29 | | LoadingModelFile FilePath 30 | | WritingPCAFile FilePath 31 | | WritingDiagram FilePath [ String ] 32 | -- Training 33 | | StartTraining Int 34 | | TrainingSentence Int Int 35 | | TrainWord String String 36 | | TrainingWindow Double String [String] 37 | | InitialWordVector Int WordVector 38 | | BeforeUpdate Int WordVector 39 | | DotProduct Double 40 | | ErrorGradient Double 41 | | InputLayerAfterGradient WordVector 42 | | HiddenLayerAfterGradient WordVector 43 | | UpdatedWordVector Int WordVector 44 | | TrainedSentence NominalDiffTime 45 | | Done 46 | deriving (Show, Generic) 47 | 48 | instance ToJSON Message 49 | 50 | data Level = Coarse 51 | | Middle 52 | | Fine 53 | deriving (Eq, Show, Read, Enum) 54 | 55 | class (MonadIO io) => Progress io where 56 | progress :: Level -> Message -> io () 57 | -------------------------------------------------------------------------------- /src/Model.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE TypeSynonymInstances #-} 4 | -- |Neural network based model of words similarity 5 | module Model where 6 | 7 | import Control.Monad (foldM) 8 | import Control.Monad.Trans (MonadIO, liftIO) 9 | import Data.Array.Repa ((:.) (..), All (..), Any (..), Array, 10 | DIM1, DIM2, U, Z (..), computeP, foldP, 11 | fromListUnboxed, ix1, ix2, slice, sumP, 12 | (!), (*^), (+^)) 13 | import qualified Data.Array.Repa as R 14 | import qualified Data.HashMap.Strict as M 15 | import qualified Data.IntMap as I 16 | import Data.Time.Clock (diffUTCTime, getCurrentTime) 17 | import Huffman 18 | import Log 19 | import Model.Repa 20 | import Model.Types 21 | import System.Random (RandomGen, getStdGen, random) 22 | import Window 23 | import Words.Dictionary 24 | 25 | -- | Train a model using a dictionary and a list of sentences 26 | trainModel :: (Progress m) => Int -> Dictionary -> [[String]] -> m Model 27 | trainModel numberOfFeatures dict sentences = do 28 | theModel <- fromDictionary numberOfFeatures dict 29 | let alpha = 0.001 30 | progress Middle $ StartTraining (dictionaryLength dict) 31 | foldM (trainSentence alpha) (0, theModel) sentences >>= return.snd 32 | 33 | 34 | -- |Train given model with a single sentence. 35 | -- 36 | -- This function updates the model using skip-gram hierarchical softmax model on a single sentence 37 | -- For each word in the sentence: 38 | -- 39 | -- * Define a random training window around the word whose default value is 10 40 | -- * Iterate over all words in the window, updating the underlying neural network 41 | -- 42 | -- The updated model is returned along with the total number of words it has been trained on. 
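-- 
-- A small illustration (an added example, not from the original docs): with a
-- window of size 2 and the sentence ["the","quick","brown","fox"],
-- `slidingWindows` could yield pairs such as ("quick", ["the","brown"]), and
-- each pair is then passed on to `trainWindow`.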
43 | trainSentence :: (Progress m)
44 |               => Double
45 |               -> (Int, Model)
46 |               -> [String]
47 |               -> m (Int, Model)
48 | trainSentence alpha (count,!m) sentence = do
49 |   let len = length sentence
50 |   start <- liftIO getCurrentTime
51 |   progress Middle $ TrainingSentence count len
52 |   g <- liftIO getStdGen
53 |   m'<- foldM (trainWindow alpha) m (slidingWindows (window m) g sentence)
54 |   end <- liftIO getCurrentTime
55 |   progress Middle $ TrainedSentence (diffUTCTime end start)
56 |   return (count + len,m')
57 | 
58 | 
59 | trainWindow :: (Progress m)
60 |             => Double            -- alpha threshold
61 |             -> Model             -- model to train
62 |             -> (String,[String]) -- the central word and the other words of its window
63 |             -> m Model           -- updated model
64 | trainWindow alpha !m (w, ws) = progress Fine (TrainingWindow alpha w ws) >>
65 |                                foldM (trainWord alpha w) m (filter (/= w) ws)
66 | 
67 | -- | Train model on a single word, given a reference word
68 | --
69 | -- Uses skipgram training method to train model given two "close" words.
70 | -- Code is a direct transposition of C code into Haskell using Mutable arrays.
71 | trainWord :: (Progress m)
72 |           => Double -- alpha threshold
73 |           -> String -- reference word
74 |           -> Model  -- model to train
75 |           -> String -- word to learn
76 |           -> m Model
77 | trainWord alpha ref m word = do
78 |   progress Fine $ TrainWord word ref
79 | 
80 |   let h = dictionary $ vocabulary m
81 |       Just (Coding _ _ huff points) = M.lookup ref h
82 |       Just (Coding index' _ _ _ ) = M.lookup word h
83 |       encodedPoints = zip points $ unCode huff
84 | 
85 |       s0 = syn0 m
86 |       l0 = s0 I.! index'
87 | 
88 |   -- progress Fine $ InitialWordVector index'
89 | 
90 |   -- update a single point
91 |   let updatePoint :: (Progress m)
92 |                   => Vec
93 |                   -> Double
94 |                   -> (Vec, Mat)
95 |                   -> (Int, Bin)
96 |                   -> m (Vec, Mat)
97 |       updatePoint l0 alpha (neu1e,s1) (p,b) = do
98 |         -- dot product of two vectors
99 |         let l1 = s1 I.! p
100 |         -- progress Fine $ BeforeUpdate p (vectorize l1)
101 |         f <- sumP (l0 *^ l1)
102 |         progress Fine $ DotProduct (R.linearIndex f 0)
103 |         let exp_f = exp (f ! Z)
104 |         -- compute gradient
105 |         let g = (1 - asNum b - exp_f) * alpha
106 |         progress Fine $ ErrorGradient g
107 |         -- apply gradient on input layer
108 |         neu1e' <- computeP $ (R.map (*g) l1) +^ neu1e
109 |         -- progress Fine $ InputLayerAfterGradient (vectorize neu1e')
110 |         -- apply gradient on hidden layer
111 |         l1' <- computeP $ (R.map (*g) l0) +^ l1
112 |         -- progress Fine $ HiddenLayerAfterGradient (vectorize l1')
113 |         return (neu1e',updateLayer l1' p s1)
114 | 
115 | 
116 |   (neu1e, s1') <- foldM (updatePoint l0 alpha) (initialVector m, syn1 m) encodedPoints
117 | 
118 |   -- progress Fine $ UpdatedWordVector index' neu1e
119 | 
120 |   -- report computed gradient to input layer
121 |   return $ m { syn0 = updateLayer neu1e index' s0, syn1 = s1' }
122 | 
123 | -- |Construct a model from a Dictionary
124 | fromDictionary :: (MonadIO m) => Int -> Dictionary -> m Model
125 | fromDictionary numberOfFeatures d@(Dict _ size _) = model size numberOfFeatures >>= return . \ m -> m { vocabulary = d }
126 | 
127 | mostFrequentWords :: Int -> Model -> [ String ]
128 | mostFrequentWords len = take len . orderedWords . vocabulary
129 | 
--------------------------------------------------------------------------------
/src/Model/Array.hs:
--------------------------------------------------------------------------------
1 | -- | An implementation of `Model.NN` based on Repa arrays
2 | module Model.Array where
3 | 
4 | import Control.Monad.Trans (MonadIO, liftIO)
5 | import qualified Data.Aeson as J
6 | import Data.Array.Repa ((:.) (..), Array, DIM1, U, Z (..),
7 |                         computeP, foldP, fromListUnboxed, ix1,
8 |                         ix2, slice, sumP, toList, toUnboxed, (!),
9 |                         (*^), (+^))
10 | import qualified Data.Array.Repa as R
11 | import qualified Data.HashMap.Strict as M
12 | import qualified Data.IntMap as I
13 | import qualified Data.Vector as V
14 | import GHC.Generics
15 | import Huffman
16 | import Model.Types
17 | import System.Random
18 | import Words.Dictionary
19 | import Log
20 | 
21 | 
22 | -- | Used for vector computations
23 | type Vec = Array U DIM1 Double
24 | 
25 | -- | More efficient to update part of a map than a complete matrix
26 | type Mat = I.IntMap Vec
27 | 
28 | instance J.ToJSON Vec where
29 |   toJSON v = J.Array $ V.fromList $ map J.toJSON $ toList v
30 | 
31 | data Model = Model {
32 |   -- Number of words in the model
33 |   numberOfWords :: Int,
34 | 
35 |   -- Size of the model or number of dimensions each word is mapped to
36 |   -- also called number of features
37 |   modelSize :: Int,
38 | 
39 |   -- The input -> hidden connection matrix
40 |   -- input layer has size equal to number of words in vocabulary, with each
41 |   -- cell connected to a number of hidden cells equal to the 'dimension' of the model
42 |   -- eg, the number of features we want to track defaulting to 100
43 |   --
44 |   -- syn0 is the original name in C word2vec implementation
45 |   syn0 :: !Mat,
46 | 
47 |   -- The hidden -> output connection matrix
48 |   -- It has the same geometry as the input layer.
49 |   -- syn1 is the original name in C word2vec implementation
50 |   syn1 :: !Mat,
51 | 
52 |   -- The dictionary
53 |   -- Each word is mapped to a Coding structure containing, among other things,
54 |   -- the Huffman encoding of the word and references to inner nodes this word is connected to
55 |   -- contains also number of words and maximal length of coding vectors
56 |   vocabulary :: !Dictionary,
57 | 
58 |   -- Size of training window
59 |   window :: Int
60 |   } deriving (Show, Read, Generic)
61 | 
62 | instance J.ToJSON Model
63 | 
64 | instance NN Model where
65 |   type Vector Model = Vec
66 |   type Layer Model = Mat
67 | 
68 |   initialize = model
69 | 
70 |   updateSinglePoint l0 alpha (neu1e,s1) (p,b) = do
71 |     -- dot product of two vectors
72 |     let l1 = s1 I.! p
73 |     -- progress Fine $ BeforeUpdate p l1
74 |     f <- sumP (l0 *^ l1)
75 |     -- progress Fine $ DotProduct (R.linearIndex f 0)
76 |     let exp_f = exp (f ! Z)
77 |     -- compute gradient
78 |     let g = (1 - asNum b - exp_f) * alpha
79 |     -- progress Fine $ ErrorGradient g
80 |     -- apply gradient on input layer
81 |     neu1e' <- computeP $ (R.map (*g) l1) +^ neu1e
82 |     -- progress Fine $ InputLayerAfterGradient neu1e'
83 |     -- apply gradient on hidden layer
84 |     l1' <- computeP $ (R.map (*g) l0) +^ l1
85 |     -- progress Fine $ HiddenLayerAfterGradient l1'
86 |     return (neu1e',updateLayer l1' p s1)
87 | 
88 |   similarity = distance
89 | 
90 | 
91 | -- |Initializes a model of given size
92 | --
93 | -- The output connections are initialized to 0 while the hidden connections are
94 | -- initialized to random values in the [-0.5,+0.5] interval, then divided by the number of
95 | -- columns.
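--
-- A worked illustration of the claim above (added numbers, not from the
-- original comment): with dim = 4 columns, each cell of a hidden row lies in
-- [-0.5/4, +0.5/4) = [-0.125, 0.125), so the sum over the 4 cells of a row
-- stays within [-0.5, +0.5).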
96 | model :: (MonadIO m) 97 | => Int -- number of words 98 | -> Int -- number of features (dimensions) 99 | -> m Model 100 | model numWords dim = do 101 | let wordsIndex = [0..numWords-1] 102 | vecs <- mapM (const $ liftIO $ randomVector dim) wordsIndex 103 | let s0 = I.fromList (zip wordsIndex vecs) 104 | let nulls = map (const $ R.fromListUnboxed (Z :. dim) (replicate dim 0)) wordsIndex 105 | let s1 = I.fromList (zip wordsIndex nulls) 106 | return $ Model numWords dim s0 s1 emptyDictionary defaultWindow 107 | where 108 | -- |Initialize a vector with random values. 109 | -- 110 | -- Values are distributed in such a way that each cell is between -0.5 and 0.5 and 111 | -- is further divided by the total number of cells row so that the sum of values in 112 | -- a row is always between -0.5 and +0.5 113 | -- 114 | randomVector :: Int -- number of cells 115 | -> IO Vec -- initialized vector 116 | randomVector cols = do 117 | g <- getStdGen 118 | return $ fromListUnboxed (Z :. cols) (take cols (map (/fromIntegral cols) (randoms g))) 119 | 120 | randoms :: RandomGen g => g -> [ Double ] 121 | randoms g = let (i,g') = random g 122 | in (i - 0.5) : randoms g' 123 | 124 | -- | Compute similarity between two words 125 | -- 126 | -- Uses the cosine similarity, eg. dot product between the two vectors. The vectors should be of 127 | -- norm 1 and equal length. 128 | distance :: Model -> String -> String -> IO Double 129 | distance m u v = do 130 | u'<- coefficient m u 131 | v'<- coefficient m v 132 | vecU <- unitVector u' 133 | vecV <- unitVector v' 134 | sumP (vecU *^ vecV) >>= return.(!Z) 135 | where 136 | -- | Raw coefficients of given word 137 | -- 138 | -- Returns an array of model size length containing the raw coefficients for the given word 139 | -- in the given model. 140 | coefficient :: Model -> String -> IO Vec 141 | coefficient m w = do 142 | let h = dictionary $ vocabulary m 143 | let Just (Coding wordIndex _ _ _) = M.lookup w h 144 | let s0 = syn0 m 145 | return $ s0 I.! wordIndex 146 | 147 | -- | Normalize given array to a vector of length 1. 148 | unitVector :: Vec -> IO Vec 149 | unitVector v = do 150 | s <- foldP (\ s x -> s + (x * x)) 0 v 151 | let norm = sqrt (s ! Z) 152 | computeP $ R.map (/ norm) v 153 | 154 | 155 | -- |Update a single row of a matrix with a vector at given index. 156 | updateLayer :: Vec -> Int -> Mat -> Mat 157 | updateLayer v = I.adjust (const v) 158 | 159 | -- | An initialised vector to be used for computing single update of a word 160 | initialVector :: Model -> Vec 161 | initialVector m = fromListUnboxed (ix1 layerSize) (replicate layerSize 0) 162 | where 163 | layerSize = modelSize m 164 | 165 | defaultWindow :: Int 166 | defaultWindow = 10 167 | 168 | defaultFeatures :: Int 169 | defaultFeatures = 100 170 | 171 | -- | Output a layer (matrix) as a list of doubles concatenating all rows 172 | layerToList :: Mat -> [Double] 173 | layerToList = concatMap toList . 
I.elems
174 | 
--------------------------------------------------------------------------------
/src/Model/IO.hs:
--------------------------------------------------------------------------------
1 | module Model.IO where
2 | 
3 | import Control.Monad.Trans (liftIO)
4 | import Data.List
5 | import Log
6 | import Model
7 | import Model.Repa
8 | import Model.Types
9 | import System.Directory
10 | import System.FilePath
11 | import Words
12 | 
13 | trainFiles :: (Progress m) => Int -> [String] -> m Model
14 | trainFiles numFeatures txts = do
15 |   (dict, contents) <- tokenizeFiles txts
16 |   progress Fine (EncodedDictionary dict)
17 |   trainModel numFeatures dict contents
18 | 
19 | analyzeDirectory :: (Progress m) => Int -> String -> m Model
20 | analyzeDirectory numFeatures dir = do
21 |   txts <- filter (isSuffixOf ".txt") <$> liftIO (getDirectoryContents dir)
22 |   trainFiles numFeatures $ map (dir </>) txts
--------------------------------------------------------------------------------
/src/Model/Repa.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric        #-}
2 | {-# LANGUAGE FlexibleInstances    #-}
3 | {-# LANGUAGE InstanceSigs         #-}
4 | {-# LANGUAGE TypeFamilies         #-}
5 | {-# LANGUAGE TypeSynonymInstances #-}
6 | -- | An implementation of `Model.NN` based on Repa arrays
7 | module Model.Repa where
8 | 
9 | import Control.Monad.Trans (MonadIO, liftIO)
10 | import qualified Data.Aeson as J
11 | import Data.Array.Repa ((:.) (..), Array, DIM1, U, Z (..),
12 |                         computeP, foldP, fromListUnboxed, ix1,
13 |                         sumP, toList, (!), (*^), (+^))
14 | import qualified Data.Array.Repa as R
15 | import qualified Data.HashMap.Strict as M
16 | import qualified Data.IntMap as I
17 | import qualified Data.Vector as V
18 | import GHC.Generics
19 | import Huffman
20 | import Model.Types
21 | import System.Random
22 | import Words.Dictionary
23 | 
24 | 
25 | -- | Used for vector computations
26 | type Vec = Array U DIM1 Double
27 | 
28 | -- | More efficient to update part of a map than a complete matrix
29 | type Mat = I.IntMap Vec
30 | 
31 | instance J.ToJSON Vec where
32 |   toJSON v = J.Array $ V.fromList $ map J.toJSON $ toList v
33 | 
34 | data Model = Model {
35 |   -- Number of words in the model
36 |   numberOfWords :: Int,
37 | 
38 |   -- Size of the model or number of dimensions each word is mapped to
39 |   -- also called number of features
40 |   modelSize :: Int,
41 | 
42 |   -- The input -> hidden connection matrix
43 |   -- input layer has size equal to number of words in vocabulary, with each
44 |   -- cell connected to a number of hidden cells equal to the 'dimension' of the model
45 |   -- eg, the number of features we want to track defaulting to 100
46 |   --
47 |   -- syn0 is the original name in C word2vec implementation
48 |   syn0 :: !Mat,
49 | 
50 |   -- The hidden -> output connection matrix
51 |   -- It has the same geometry as the input layer.
52 |   -- syn1 is the original name in C word2vec implementation
53 |   syn1 :: !Mat,
54 | 
55 |   -- The dictionary
56 |   -- Each word is mapped to a Coding structure containing, among other things,
57 |   -- the Huffman encoding of the word and references to inner nodes this word is connected to
58 |   -- contains also number of words and maximal length of coding vectors
59 |   vocabulary :: !Dictionary,
60 | 
61 |   -- Size of training window
62 |   window :: Int
63 |   } deriving (Show, Read, Generic)
64 | 
65 | instance J.ToJSON Model
66 | 
67 | instance NN Model where
68 |   type Vector Model = Vec
69 |   type Layer Model = Mat
70 | 
71 |   vectorize = W . 
V.fromList . toList 72 | 73 | initialize = model 74 | 75 | updateSinglePoint l0 alpha (neu1e,s1) (p,b) = do 76 | -- dot product of two vectors 77 | let l1 = s1 I.! p 78 | -- v = vectorize l1 79 | -- progress Fine $ BeforeUpdate p v 80 | f <- sumP (l0 *^ l1) 81 | -- progress Fine $ DotProduct (R.linearIndex f 0) 82 | let exp_f = exp (f ! Z) 83 | -- compute gradient 84 | let g = (1 - asNum b - exp_f) * alpha 85 | -- progress Fine $ ErrorGradient g 86 | -- apply gradient on input layer 87 | neu1e' <- computeP $ (R.map (*g) l1) +^ neu1e 88 | -- progress Fine $ InputLayerAfterGradient (vectorize neu1e') 89 | -- apply gradient on hidden layer 90 | l1' <- computeP $ (R.map (*g) l0) +^ l1 91 | -- progress Fine $ HiddenLayerAfterGradient (vectorize l1') 92 | return (neu1e',updateLayer l1' p s1) 93 | 94 | similarity = distance 95 | 96 | 97 | -- |Initializes a model of given size 98 | -- 99 | -- The output connections are initialized to 0 while the hidden connections are 100 | -- initialized to random values in the [-0.5,+0.5] interval, then divided by the number of 101 | -- columns. 102 | model :: (MonadIO m) 103 | => Int -- number of words 104 | -> Int -- number of features (dimensions) 105 | -> m Model 106 | model numWords dim = do 107 | let wordsIndex = [0..numWords-1] 108 | vecs <- mapM (const $ liftIO $ randomVector dim) wordsIndex 109 | let s0 = I.fromList (zip wordsIndex vecs) 110 | let nulls = map (const $ R.fromListUnboxed (Z :. dim) (replicate dim 0)) wordsIndex 111 | let s1 = I.fromList (zip wordsIndex nulls) 112 | return $ Model numWords dim s0 s1 emptyDictionary defaultWindow 113 | where 114 | -- |Initialize a vector with random values. 115 | -- 116 | -- Values are distributed in such a way that each cell is between -0.5 and 0.5 and 117 | -- is further divided by the total number of cells row so that the sum of values in 118 | -- a row is always between -0.5 and +0.5 119 | -- 120 | randomVector :: Int -- number of cells 121 | -> IO Vec -- initialized vector 122 | randomVector cols = do 123 | g <- getStdGen 124 | return $ fromListUnboxed (Z :. cols) (take cols (map (/fromIntegral cols) (randoms g))) 125 | 126 | randoms :: RandomGen g => g -> [ Double ] 127 | randoms g = let (i,g') = random g 128 | in (i - 0.5) : randoms g' 129 | 130 | -- | Compute similarity between two words 131 | -- 132 | -- Uses the cosine similarity, eg. dot product between the two vectors. The vectors should be of 133 | -- norm 1 and equal length. 134 | distance :: Model -> String -> String -> IO Double 135 | distance m u v = do 136 | u'<- coefficient m u 137 | v'<- coefficient m v 138 | vecU <- unitVector u' 139 | vecV <- unitVector v' 140 | sumP (vecU *^ vecV) >>= return.(!Z) 141 | where 142 | -- | Raw coefficients of given word 143 | -- 144 | -- Returns an array of model size length containing the raw coefficients for the given word 145 | -- in the given model. 146 | coefficient :: Model -> String -> IO Vec 147 | coefficient m w = do 148 | let h = dictionary $ vocabulary m 149 | let Just (Coding wordIndex _ _ _) = M.lookup w h 150 | let s0 = syn0 m 151 | return $ s0 I.! wordIndex 152 | 153 | -- | Normalize given array to a vector of length 1. 154 | unitVector :: Vec -> IO Vec 155 | unitVector v = do 156 | s <- foldP (\ s x -> s + (x * x)) 0 v 157 | let norm = sqrt (s ! Z) 158 | computeP $ R.map (/ norm) v 159 | 160 | 161 | -- |Update a single row of a matrix with a vector at given index. 
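--
-- For example (an added note, not from the original docs): `updateLayer v 3 m`
-- replaces row 3 of m with v; because it is implemented with `I.adjust`, the
-- map is returned unchanged when the index is not present.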
162 | updateLayer :: Vec -> Int -> Mat -> Mat 163 | updateLayer v = I.adjust (const v) 164 | 165 | -- | An initialised vector to be used for computing single update of a word 166 | initialVector :: Model -> Vec 167 | initialVector m = fromListUnboxed (ix1 layerSize) (replicate layerSize 0) 168 | where 169 | layerSize = modelSize m 170 | 171 | defaultWindow :: Int 172 | defaultWindow = 10 173 | 174 | defaultFeatures :: Int 175 | defaultFeatures = 100 176 | 177 | -- | Output a layer (matrix) as a list of doubles concatenating all rows 178 | layerToList :: Mat -> [Double] 179 | layerToList = concatMap toList . I.elems 180 | 181 | -------------------------------------------------------------------------------- /src/Model/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE TypeFamilies #-} 4 | -- | Interface and core types describing a training model 5 | module Model.Types where 6 | 7 | import Control.Monad.Trans 8 | import Data.Aeson 9 | import qualified Data.Vector as V 10 | import GHC.Generics 11 | import Huffman 12 | 13 | -- | A "generic" type of vectors 14 | newtype WordVector = W { vector :: V.Vector Double } 15 | deriving (Show,Generic) 16 | 17 | instance ToJSON WordVector 18 | 19 | class NN a where 20 | type Vector a :: * 21 | type Layer a :: * 22 | 23 | vectorize :: Vector a -> WordVector 24 | 25 | initialize :: (MonadIO m) 26 | => Int -- number of words 27 | -> Int -- number of features (dimensions) 28 | -> m a 29 | 30 | updateSinglePoint :: (MonadIO m) 31 | => Vector a 32 | -> Double 33 | -> (Vector a, Layer a) 34 | -> (Int, Bin) 35 | -> m (Vector a, Layer a) 36 | 37 | similarity :: a -> String -> String -> IO Double 38 | -------------------------------------------------------------------------------- /src/PCA.hs: -------------------------------------------------------------------------------- 1 | -- | Compute PCA from a Matrix 2 | -- 3 | -- http://www.cs.otago.ac.nz/cosc453/student_tutorials/principal_components.pdf 4 | module PCA ( 5 | -- * PCA Computations 6 | pcaSVD, pca', pca'', pca''', 7 | pcaNipals, fastPCA, fastPCA', 8 | -- * Conversions 9 | toMatrix, toLists 10 | ) where 11 | 12 | import Debug.Trace 13 | -- TODO remove 14 | import qualified Model.Repa as T 15 | import Model.Types (Layer) 16 | import Numeric.LinearAlgebra 17 | import Numeric.LinearAlgebra.NIPALS 18 | 19 | type Vec = Vector Double 20 | type Mat = Matrix Double 21 | 22 | -- | Turn a Layer into a Matrix for purpose of PCA. 23 | toMatrix :: Int -> Int -> T.Mat -> Mat 24 | toMatrix r c = (r >< c) . T.layerToList 25 | 26 | ----------------------------------------------------- 27 | -- * Standard (Full) PCA Computation 28 | -- https://github.com/albertoruiz/hmatrix/blob/master/examples/pca1.hs 29 | 30 | -- | Run *Principal Component Analysis* on given Matrix and returns requested number 31 | -- of most significant dimensions. 32 | -- creates the compression and decompression functions from the desired number of components 33 | pcaSVD :: Int -> Mat -> (Mat, Vec, Vec -> Vec , Vec -> Vec) 34 | pcaSVD n dataSet = (vp, m, encode,decode) 35 | where 36 | encode x = vp #> (x - m) 37 | decode x = x <# vp + m 38 | (m,c) = meanCov dataSet 39 | (_,v) = eigSH (trustSym c) 40 | vp = tr $ takeColumns n v 41 | 42 | -- | Return a function that yields the PCA vector for some index of given matrix 43 | pca' :: Int -> Mat -> (Int -> [Double]) 44 | pca' n dataSet = toList . enc . (mat' !!) 
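-- A hedged usage sketch for 'pcaSVD' (the sample rows are borrowed from test/PCASpec.hs):
--
-- >>> let raw = fromLists [[2.5,2.4],[0.5,0.7],[2.2,2.9],[1.9,2.2]]
-- >>> let (_vp, _m, enc, _dec) = pcaSVD 1 raw
-- >>> size (enc (head (toRows raw)))
-- 1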
45 | where 46 | mat' = toRows dataSet 47 | (_,_,enc,_) = pcaSVD n dataSet 48 | 49 | pca'' :: Int -> Mat -> (Int -> Mat -> Mat) -> Mat 50 | pca'' n dataSet pca = tr (pcaMat <> tr dataSet) 51 | where 52 | pcaMat = pca n dataSet 53 | 54 | pca''' :: Int -> Mat -> (Int -> Mat -> [Vec]) -> Mat 55 | pca''' n dataSet pca = tr (pcaMat <> tr dataSet) 56 | where 57 | pcaMat = fromRows $ pca n dataSet 58 | 59 | 60 | -------------------------------------------------- 61 | -- * NIPALS Algorithm 62 | 63 | pcaNipals :: Int -> Mat -> [ Vec ] 64 | pcaNipals 0 _ = [] 65 | pcaNipals n dataSet = let (pc1, _ , residual) = firstPC dataSet 66 | in pc1 : pcaNipals (n - 1) residual 67 | 68 | ----------------------------------------------------- 69 | -- * Fast (Iterative) PCA Computation 70 | -- https://maxwell.ict.griffith.edu.au/spl/publications/papers/prl07_alok_pca.pdf 71 | 72 | -- | Computes a list of the top PCA components for a given matrix 73 | fastPCA :: Int -> Matrix Double -> [ Vector Double ] 74 | fastPCA n dataSet = fastPCARec n dataSet [] 75 | 76 | fastPCARec :: Int -> Matrix Double -> [ Vector Double ] -> [ Vector Double ] 77 | fastPCARec 0 _ _ = [] 78 | fastPCARec n dataSet phis = 79 | let (_,cov) = meanCov dataSet -- compute covariance matrix 80 | 81 | max_iter = 30 82 | 83 | phi_p :: Vector Double 84 | phi_p = unitary $ konst 1 (cols dataSet) 85 | 86 | gram_schmidt :: Vector Double -> [ Vector Double ] -> Vector Double 87 | gram_schmidt phip phis = phip - sum (map (\ phi_j -> cmap (* (phip <.> phi_j)) phi_j) phis) 88 | 89 | go :: Vector Double -> Int -> Vector Double 90 | go phi k | k > max_iter = phi 91 | | otherwise = let phi_p_new = cov #> phi 92 | norm_phi = unitary $ gram_schmidt phi_p_new phis 93 | conv = abs (norm_phi <.> phi - 1) < peps 94 | in if conv 95 | then norm_phi 96 | else go norm_phi (k+1) 97 | 98 | new_phi = go phi_p 0 99 | in new_phi : fastPCARec (n-1) dataSet (new_phi:phis) 100 | 101 | 102 | ---------------------------------------------- 103 | -- * Another Fast PCA algorithm 104 | -- Computes the top k principal components using the power iteration method 105 | -- http://theory.stanford.edu/~tim/s15/l/l8.pdf 106 | 107 | fastPCA' :: Int -> Matrix Double -> [ Vector Double ] 108 | fastPCA' n dataSet = fastPCARec' n seed 109 | where 110 | square = uncurry (==) . size 111 | 112 | seed = if not (square dataSet) 113 | then tr dataSet <> dataSet 114 | else dataSet 115 | 116 | fastPCARec' 0 _ = [] 117 | fastPCARec' k mat = 118 | let v_0 = unitary $ konst 1 (cols mat) 119 | go v = let v' = mat #> v 120 | unit_v = unitary v' 121 | stop = abs (unit_v <.> unitary v - 1) < peps 122 | in if stop 123 | then unit_v 124 | else go v' 125 | new_v = go v_0 126 | mat_v = mat #> new_v 127 | mat' = mat - (mat_v `outer` new_v) 128 | in new_v : fastPCARec' (k-1) mat' 129 | 130 | -------------------------------------------------------------------------------- /src/Step.hs: -------------------------------------------------------------------------------- 1 | module Step where 2 | 3 | import Data.Aeson (encode) 4 | import qualified Data.ByteString.Lazy.Char8 as BS8 5 | import Log 6 | 7 | -- | Log each progress event to stdout as one JSON document per line. The level 8 | -- argument is ignored here (assuming the 'Progress' class from "Log" passes a 9 | -- level and an event, as the call sites in Model.IO and Words suggest). 10 | instance Progress IO where 11 | progress _ = BS8.putStrLn . encode 12 | 13 | -------------------------------------------------------------------------------- /src/Window.hs: -------------------------------------------------------------------------------- 1 | -- | Utilities for manipulating windows of lists (e.g.
random subsequence of a list around some element) 2 | module Window(slidingWindows) where 3 | import System.Random(randomR,getStdGen,RandomGen,mkStdGen) 4 | 5 | -- |Select a window of at most the given size around a word 6 | -- 7 | -- >>> selectWindow 1 ("abc",'d', "efg") 8 | -- ('d',"d") 9 | -- >>> selectWindow 2 ("abc",'d', "efg") 10 | -- ('d',"ad") 11 | -- >>> selectWindow 3 ("abc",'d', "efg") 12 | -- ('d',"ade") 13 | -- >>> selectWindow 4 ("abc",'d', "efg") 14 | -- ('d',"bade") 15 | -- >>> selectWindow 5 ("abc",'d', "efg") 16 | -- ('d',"badfe") 17 | -- >>> selectWindow 2 ("",'d', "efg") 18 | -- ('d',"de") 19 | -- >>> selectWindow 2 ("",'d', "") 20 | -- ('d',"d") 21 | -- >>> selectWindow 3 ("abc",'d', "") 22 | -- ('d',"bad") 23 | selectWindow :: Int -> ([a],a,[a]) -> (a,[a]) 24 | selectWindow n (pref,word,suff) = (word, selectWindow' n (pref,word,suff) [] []) 25 | where 26 | selectWindow' 1 (pref,word,suff) prefs suffs = prefs ++ (word:suffs) 27 | selectWindow' n ([],word,[]) prefs suffs = prefs ++ (word:suffs) 28 | selectWindow' n ([],word,a:as) prefs suffs = selectWindow' (n-1) ([],word,as) prefs (a:suffs) 29 | selectWindow' n (b:bs,word,[]) prefs suffs = selectWindow' (n-1) (bs,word,[]) (b:prefs) suffs 30 | selectWindow' 2 (b:bs,word,suff) prefs suffs = selectWindow' 1 (bs,word,suff) (b:prefs) suffs 31 | selectWindow' n (b:bs,word,a:as) prefs suffs = selectWindow' (n-2) (bs,word,as) (b:prefs) (a:suffs) 32 | 33 | 34 | -- |Generate random sliding windows over a given sentence 35 | -- 36 | -- >>> slidingWindows 4 (mkStdGen 0) "abcdefgh" 37 | -- [('a',"adcb"),('b',"abdc"),('c',"bc"),('d',"cde"),('e',"e"),('f',"ef"),('g',"efgh"),('h',"gh")] 38 | slidingWindows :: (RandomGen g) => 39 | Int -- Maximum size of window 40 | -> g -- random generator seed 41 | -> [a] -- 'sentence' to generate windows from 42 | -> [(a,[a])] -- list of (word, window) pairs 43 | slidingWindows w g sentence = snd $ foldl (randomWindow w) (g,[]) (parts sentence) 44 | where 45 | randomWindow :: (RandomGen g) => 46 | Int 47 | -> (g, [(a,[a])]) 48 | -> ([a],a,[a]) 49 | -> (g, [(a,[a])]) 50 | randomWindow w (g,ws) part = 51 | let (n,g') = randomR (1,w) g 52 | in (g', selectWindow n part: ws) 53 | 54 | -- |Iterate over a list, splitting it into 3 parts: prefix, element, suffix 55 | -- 56 | -- >>> reverse $ parts [1,2,3,4] 57 | -- [([],1,[2,3,4]),([1],2,[3,4]),([2,1],3,[4]),([3,2,1],4,[])] 58 | parts :: [a] -> [([a],a,[a])] 59 | parts xs = parts' xs [] 60 | where 61 | parts' [] ps = ps 62 | parts' (x:xs) [] = parts' xs [([],x,xs)] 63 | parts' (x:xs) p@((pref,v,suff):ps) = parts' xs ((v:pref,x,xs):p) 64 | -------------------------------------------------------------------------------- /src/Words.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | module Words where 3 | 4 | import Control.Arrow ((***)) 5 | import Control.DeepSeq (force) 6 | import Control.Exception (evaluate, finally) 7 | import Control.Monad (foldM) 8 | import Control.Monad.Trans (liftIO) 9 | import Data.HashMap.Strict (empty) 10 | import Log 11 | import Prelude hiding (readFile) 12 | import System.IO (IOMode (..), hClose, hGetContents, 13 | hSetEncoding, openFile, utf8) 14 | import Words.Dictionary 15 | 16 | indexFile :: (Progress m) => FilePath -> Index -> m (Index, [String]) 17 | indexFile file dict = do 18 | h <- liftIO $ openFile file ReadMode 19 | progress Middle (TokenizingFile file) 20 | s <- liftIO $ (do hSetEncoding h utf8 21 | s <- hGetContents h 22 | evaluate $ force s 23 | ) `finally` hClose h
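-- Reading with lazy 'hGetContents' and forcing the whole contents via
-- 'evaluate . force' ensures the file is fully read before 'hClose' runs;
-- without it, the lazy read could touch an already-closed handle.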
24 | 25 | let tokens = tokenizeString s 26 | dict' = s `seq` indexString dict tokens 27 | progress Middle (TokenizedFile file tokens) 28 | return (dict', tokens) 29 | 30 | 31 | -- |Encode the words of several files into a dictionary 32 | tokenizeFiles :: (Progress m) 33 | => [String] -- file paths 34 | -> m (Dictionary, [[String]]) 35 | tokenizeFiles files = do 36 | progress Middle $ TokenizingFiles (length files) 37 | !(dict, rtokens) <- (encodeWords *** reverse) <$> foldM tokenizeAndIndex (empty, []) files 38 | progress Middle $ EncodedDictionary dict 39 | progress Middle $ TokenizedFiles rtokens 40 | return (dict, rtokens) 41 | where 42 | tokenizeAndIndex (dict, toks) f = do 43 | (dict', tokens) <- indexFile f dict 44 | return (dict', tokens:toks) 45 | 46 | -------------------------------------------------------------------------------- /src/Words/Dictionary.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | module Words.Dictionary( 3 | -- * Types 4 | Dictionary(Dict), emptyDictionary, Index, 5 | -- * Tokenizing 6 | tokenizeString, 7 | -- * Indexing 8 | indexString, encodeWords, 9 | -- * Querying 10 | dictionary, dictionaryLength, encodingLength, 11 | orderedWords 12 | ) where 13 | 14 | import Data.Aeson (FromJSON, ToJSON) 15 | import Data.Char (toLower) 16 | import Data.HashMap.Strict (HashMap, elems, empty, insertWith, size, 17 | toList) 18 | import Data.List (sortBy) 19 | import Data.Ord (comparing) 20 | import GHC.Generics 21 | import Huffman 22 | import NLP.Tokenize (tokenize) 23 | import Prelude hiding (readFile) 24 | import Text.Regex.TDFA ((=~)) 25 | 26 | type Index = HashMap String Int 27 | 28 | data Dictionary = Dict { 29 | dictionary :: HashMap String Coding, 30 | dictionaryLength :: Int, 31 | encodingLength :: Int } deriving (Eq, Show, Read, Generic) 32 | 33 | instance ToJSON Dictionary 34 | instance FromJSON Dictionary 35 | 36 | emptyDictionary :: Dictionary 37 | emptyDictionary = Dict empty 0 0 38 | 39 | -- | Return a list of all words in dictionary in ascending order of their index. 40 | -- 41 | -- >>> orderedWords (encodeWords $ indexString empty (words "some words for testing words")) 42 | -- ["words","some","for","testing"] 43 | orderedWords :: Dictionary -> [ String ] 44 | orderedWords (Dict d _ _) = map fst $ sortBy (comparing (index.snd)) (toList d) 45 | 46 | -- |Add a single word occurrence to a frequency map 47 | indexWord :: Index -> String -> Index 48 | indexWord m w = insertWith (+) w 1 m 49 | 50 | -- |Tokenize and normalize a string 51 | tokenizeString :: String -> [ String ] 52 | tokenizeString = map (map toLower) . filter (=~ "^[a-zA-Z'-]+$") . tokenize 53 | 54 | -- |Update a frequency map with the given tokens 55 | indexString :: Index -> [ String ] -> Index 56 | indexString = foldl indexWord 57 | 58 | encodeWords :: Index -> Dictionary 59 | encodeWords dict = let encoding = huffmanEncode dict 60 | maxCodeLength = maximum (map (length . unCode . huffman) $ elems encoding) 61 | in Dict encoding (size encoding) maxCodeLength 62 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | packages: 3 | - '.'
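# 'hmatrix-nipals' below is checked out as a git submodule at the repository root (see .gitmodules)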
4 | - 'hmatrix-nipals' 5 | extra-deps: 6 | - tokenize-0.3.0 7 | # - cassava-0.4.3.1 8 | # - gamma-0.9.0.2 9 | # - continued-fractions-0.9.1.1 10 | # - converge-0.1.0.1 11 | # - optparse-generic-1.1.0 12 | resolver: lts-6.2 13 | -------------------------------------------------------------------------------- /test/PCASpec.hs: -------------------------------------------------------------------------------- 1 | module PCASpec where 2 | 3 | import Numeric.LinearAlgebra 4 | import PCA 5 | import Test.Hspec 6 | 7 | spec :: Spec 8 | spec = describe "PCA Computation" $ do 9 | let raw = fromLists [ [2.5, 2.4] 10 | , [0.5, 0.7] 11 | , [2.2, 2.9] 12 | , [1.9, 2.2] 13 | , [3.1, 3.0] 14 | , [2.3, 2.7] 15 | , [2 , 1.6] 16 | , [1 , 1.1] 17 | , [1.5, 1.6] 18 | , [1.1, 0.9] 19 | ] 20 | 21 | it "compute first component of PCA using SVD from known sample" $ do 22 | let (m,_,enc,dec) = pcaSVD 1 raw 23 | 24 | (toRows raw !! 0 <.> toColumns m !! 0) `shouldBe` 3.459112269626609 25 | 26 | it "computes first component of PCA with NIPALS from known sample" $ do 27 | let [ v ] = pcaNipals 1 raw 28 | 29 | (toRows raw !! 0 <.> v) `shouldBe` 3.461356337247943 30 | 31 | it "computes first component of PCA with power iteration from known sample" $ do 32 | let [ v ] = fastPCA' 1 raw 33 | 34 | (toRows raw !! 0 <.> v) `shouldBe` 3.461356337251494 35 | 36 | it "computes first component of fast PCA from known sample" $ do 37 | let [ v ] = fastPCA 1 raw 38 | 39 | (toRows raw !! 0 <.> v) `shouldBe` 3.4591122696276297 40 | -------------------------------------------------------------------------------- /test/word2vec-test.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 2 | -------------------------------------------------------------------------------- /word2vec.cabal: -------------------------------------------------------------------------------- 1 | Name: word2vec 2 | Version: 0.1 3 | Build-type: Simple 4 | Synopsis: A partial and unverified Haskell port of C word2vec 5 | License: BSD3 6 | License-file: LICENSE 7 | Author: Arnaud Bailly 8 | Maintainer: Arnaud Bailly 9 | Cabal-version: >= 1.20 10 | 11 | Library 12 | Build-Depends: base, mtl, aeson, 13 | hmatrix , 14 | heap , 15 | utf8-string , 16 | containers , 17 | regex-tdfa , 18 | tokenize , 19 | random , 20 | array , 21 | time , 22 | process , 23 | unix , 24 | repa, 25 | filepath , 26 | bytestring , 27 | tagsoup , 28 | http-conduit , 29 | network , 30 | HTTP , 31 | directory , 32 | Chart , 33 | Chart-diagrams , 34 | lens , 35 | network-uri , 36 | colour , 37 | data-default-class, 38 | hmatrix-nipals, 39 | unordered-containers, deepseq, vector 40 | hs-source-dirs: src 41 | exposed-modules: Display, Model, Words, Crawl, Log, 42 | Words.Dictionary, Model.Repa, Model.Types, Model.IO, PCA 43 | other-modules: Concurrent, Huffman, Window 44 | ghc-options: -Wall -threaded -rtsopts -O3 -fexcess-precision -fprof-auto 45 | 46 | Executable words 47 | main-is: word2vec.hs 48 | hs-source-dirs: main 49 | default-language: Haskell2010 50 | build-depends: word2vec, 51 | directory , 52 | Chart , aeson, 53 | Chart-diagrams , 54 | base, mtl, 55 | unix , 56 | filepath , 57 | bytestring , 58 | tagsoup , 59 | http-conduit , 60 | HTTP, network-uri , network,optparse-generic 61 | ghc-options: -Wall -threaded -rtsopts -O3 -fprof-auto 62 | 63 | Executable crawl 64 | default-language: Haskell2010 65 | Build-Depends: word2vec, 66 | base, mtl, 67 | process , 68 | unix , 69 | filepath , 70 | bytestring , 71 | tagsoup , 
72 | http-conduit , 73 | HTTP, network-uri , network 74 | 75 | hs-source-dirs: main 76 | main-is: arxiv.hs 77 | ghc-options: -Wall -threaded -rtsopts -O3 78 | 79 | Test-Suite word2vec-test 80 | default-language: Haskell2010 81 | type: exitcode-stdio-1.0 82 | hs-source-dirs: test 83 | main-is: word2vec-test.hs 84 | build-depends: base, doctest, hspec, hmatrix, word2vec 85 | 86 | Benchmark pca-bench 87 | type: exitcode-stdio-1.0 88 | hs-source-dirs: bench 89 | main-is: MainBenchmark.hs 90 | build-depends: base, 91 | criterion, hmatrix, hmatrix-nipals, random, word2vec 92 | ghc-options: -Wall 93 | -O2 --------------------------------------------------------------------------------