├── .gitignore ├── .gitmodules ├── BayesStack ├── DirMulti.hs ├── Dirichlet.hs ├── Gibbs.hs ├── Gibbs │ ├── Concurrent.hs │ └── Simple.hs ├── TupleEnum.hs ├── Types.hs └── UniqueKey.hs ├── Data ├── Binary │ └── EnumMap.hs ├── Random │ └── Sequence.hs └── Sequence │ └── Chunk.hs ├── LICENSE ├── README.mkd ├── Setup.hs ├── SharedTasteTest.hs ├── bayes-stack.cabal ├── doc ├── installation.markdown └── usage.markdown └── network-topic-models ├── BayesStack └── Models │ └── Topic │ ├── CitationInfluence.hs │ ├── CitationInfluenceNoTopics.hs │ ├── LDA.hs │ ├── LDARelevance.hs │ ├── SharedTaste.hs │ ├── Test.hs │ └── Types.hs ├── BenchLDA.hs ├── BenchST.hs ├── Benchmark.hs ├── DumpCI.hs ├── DumpCINT.hs ├── DumpLDA.hs ├── DumpLDARelevance.hs ├── DumpST.hs ├── FormatMultinom.hs ├── LICENSE ├── README.mkd ├── ReadData.hs ├── ReadRelevanceData.hs ├── RunCI.hs ├── RunCINT.hs ├── RunLDA.hs ├── RunLDARelevance.hs ├── RunST.hs ├── RunSampler.hs ├── SerializeText.hs ├── Setup.hs ├── TopicLanguageModel.hs ├── doc └── models │ ├── model_ci.tex │ ├── model_ci_nt.tex │ ├── model_lda.tex │ ├── model_st.tex │ ├── models.tex │ └── tikzlibrarybayesnet.code.tex └── network-topic-models.cabal /.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | *.o 3 | *.hi 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "network-topic-models/doc/models/tikz-bayesnet"] 2 | path = network-topic-models/doc/models/tikz-bayesnet 3 | url = git://github.com/jluttine/tikz-bayesnet.git 4 | -------------------------------------------------------------------------------- /BayesStack/DirMulti.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, FlexibleInstances, ConstraintKinds, DeriveGeneric, DefaultSignatures #-} 2 | 3 | module BayesStack.DirMulti ( -- * Dirichlet/multinomial pair 4 | Multinom, dirMulti, symDirMulti, multinom 5 | -- | Do not do record updates with these 6 | , dmTotal, dmAlpha, dmDomain 7 | , setMultinom, SetUnset (..) 8 | , addMultinom, subMultinom 9 | , decMultinom, incMultinom 10 | , prettyMultinom 11 | , updatePrior 12 | , obsProb 13 | -- * Parameter estimation 14 | , estimatePrior, reestimatePriors, reestimateSymPriors 15 | -- * Convenience functions 16 | , probabilities, decProbabilities 17 | ) where 18 | 19 | import Data.EnumMap (EnumMap) 20 | import qualified Data.EnumMap as EM 21 | 22 | import Data.Sequence (Seq) 23 | import qualified Data.Sequence as SQ 24 | 25 | import qualified Data.Foldable as Foldable 26 | import Data.Foldable (toList, Foldable, foldMap) 27 | import Data.Function (on) 28 | 29 | import Text.PrettyPrint 30 | import Text.Printf 31 | 32 | import GHC.Generics (Generic) 33 | import Data.Binary 34 | import Data.Binary.EnumMap () 35 | 36 | import BayesStack.Types 37 | import BayesStack.Dirichlet 38 | 39 | import Numeric.Log hiding (sum) 40 | import Numeric.Digamma 41 | import Math.Gamma hiding (p) 42 | 43 | -- | Make error handling a bit easier 44 | checkNaN :: RealFloat a => String -> a -> a 45 | checkNaN loc x | isNaN x = error $ "BayesStack.DirMulti."++loc++": Not a number" 46 | checkNaN loc x | isInfinite x = error $ "BayesStack.DirMulti."++loc++": Infinity" 47 | checkNaN _ x = x 48 | 49 | maybeInc, maybeDec :: (Num a, Eq a) => Maybe a -> Maybe a 50 | maybeInc Nothing = Just 1 51 | maybeInc (Just n) = Just (n+1) 52 | maybeDec Nothing = error "Can't decrement zero count" 53 | maybeDec (Just 1) = Nothing 54 | maybeDec (Just n) = Just (n-1) 55 | 56 | {-# INLINEABLE decMultinom #-} 57 | {-# INLINEABLE incMultinom #-} 58 | decMultinom, incMultinom :: (Num w, Eq w, Ord a, Enum a) 59 | => a -> Multinom w a -> Multinom w a 60 | decMultinom k = subMultinom 1 k 61 | incMultinom k = addMultinom 1 k 62 | 63 | subMultinom, addMultinom :: (Num w, Eq w, Ord a, Enum a) 64 | => w -> a -> Multinom w a -> Multinom w a 65 | subMultinom w k dm = dm { dmCounts = EM.alter maybeDec k $ dmCounts dm 66 | , dmTotal = dmTotal dm - w } 67 | addMultinom w k dm = dm { dmCounts = EM.alter maybeInc k $ dmCounts dm 68 | , dmTotal = dmTotal dm + w } 69 | 70 | data SetUnset = Set | Unset 71 | 72 | setMultinom :: (Num w, Eq w, Enum a, Ord a) => SetUnset -> a -> Multinom w a -> Multinom w a 73 | setMultinom Set s = incMultinom s 74 | setMultinom Unset s = decMultinom s 75 | 76 | -- | 'Multinom a' represents multinomial distribution over domain 'a'. 77 | -- Optionally, this can include a collapsed Dirichlet prior. 78 | -- 'Multinom alpha count total' is a multinomial with Dirichlet prior 79 | -- with symmetric parameter 'alpha', ... 80 | data Multinom w a = DirMulti { dmAlpha :: !(Alpha a) 81 | , dmCounts :: !(EnumMap a w) 82 | , dmTotal :: !w 83 | , dmDomain :: !(Seq a) 84 | } 85 | | Multinom { dmProbs :: !(EnumMap a Double) 86 | , dmCounts :: !(EnumMap a w) 87 | , dmTotal :: !w 88 | , dmDomain :: !(Seq a) 89 | } 90 | deriving (Show, Eq, Generic) 91 | instance (Enum a, Binary a, Binary w) => Binary (Multinom w a) 92 | 93 | -- | 'symMultinomFromPrecision d p' is a symmetric Dirichlet/multinomial over a 94 | -- domain 'd' with precision 'p' 95 | symDirMultiFromPrecision :: (Num w, Enum a) => [a] -> DirPrecision -> Multinom w a 96 | symDirMultiFromPrecision domain prec = symDirMulti (0.5*prec) domain 97 | 98 | -- | 'dirMultiFromMeanPrecision m p' is an asymmetric Dirichlet/multinomial 99 | -- over a domain 'd' with mean 'm' and precision 'p' 100 | dirMultiFromPrecision :: (Num w, Enum a) => DirMean a -> DirPrecision -> Multinom w a 101 | dirMultiFromPrecision m p = dirMultiFromAlpha $ meanPrecisionToAlpha m p 102 | 103 | -- | Create a symmetric Dirichlet/multinomial 104 | symDirMulti :: (Num w, Enum a) => Double -> [a] -> Multinom w a 105 | symDirMulti alpha domain = dirMultiFromAlpha $ symAlpha domain alpha 106 | 107 | -- | A multinomial without a prior 108 | multinom :: (Num w, Enum a) => [(a,Double)] -> Multinom w a 109 | multinom probs = Multinom { dmProbs = EM.fromList probs 110 | , dmCounts = EM.empty 111 | , dmTotal = 0 112 | , dmDomain = SQ.fromList $ map fst probs 113 | } 114 | 115 | -- | Create an asymmetric Dirichlet/multinomial from items and alphas 116 | dirMulti :: (Num w, Enum a) => [(a,Double)] -> Multinom w a 117 | dirMulti domain = dirMultiFromAlpha $ asymAlpha $ EM.fromList domain 118 | 119 | -- | Create a Dirichlet/multinomial with a given prior 120 | dirMultiFromAlpha :: (Enum a, Num w) => Alpha a -> Multinom w a 121 | dirMultiFromAlpha alpha = DirMulti { dmAlpha = alpha 122 | , dmCounts = EM.empty 123 | , dmTotal = 0 124 | , dmDomain = alphaDomain alpha 125 | } 126 | 127 | data Acc w = Acc !w !Probability 128 | 129 | obsProb :: (Enum a, Real w, Functor f, Foldable f) 130 | => Multinom w a -> f (a, w) -> Probability 131 | obsProb (Multinom {dmProbs=prob}) obs = 132 | Foldable.product $ fmap (\(k,w)->(realToFrac $ prob EM.! k)^^w) obs 133 | where (^^) :: Real w => Log Double -> w -> Log Double 134 | x ^^ y = Exp $ realToFrac y * ln x 135 | obsProb (DirMulti {dmAlpha=alpha}) obs = 136 | let go (Acc w p) (k',w') = Acc (w+w') (p*p') 137 | where p' = Exp $ checkNaN "obsProb" 138 | $ lnGamma (realToFrac w' + alpha `alphaOf` k') 139 | in case Foldable.foldl' go (Acc 0 1) obs of 140 | Acc w p -> p / alphaNormalizer alpha 141 | / Exp (lnGamma $ realToFrac w + sumAlpha alpha) 142 | {-# INLINE obsProb #-} 143 | 144 | dmGetCounts :: (Enum a, Num w) => Multinom w a -> a -> w 145 | dmGetCounts dm k = 146 | EM.findWithDefault 0 k (dmCounts dm) 147 | 148 | instance HasLikelihood (Multinom w) where 149 | type LContext (Multinom w) a = (Real w, Ord a, Enum a) 150 | likelihood dm = obsProb dm $ EM.assocs $ dmCounts dm 151 | {-# INLINEABLE likelihood #-} 152 | 153 | instance FullConditionable (Multinom w) where 154 | type FCContext (Multinom w) a = (Real w, Ord a, Enum a) 155 | sampleProb (Multinom {dmProbs=prob}) k = prob EM.! k 156 | sampleProb dm@(DirMulti {dmAlpha=a}) k = 157 | let alpha = a `alphaOf` k 158 | n = realToFrac $ dmGetCounts dm k 159 | total = realToFrac $ dmTotal dm 160 | in (n + alpha) / (total + sumAlpha a) 161 | {-# INLINEABLE sampleProb #-} 162 | 163 | {-# INLINEABLE probabilities #-} 164 | probabilities :: (Real w, Ord a, Enum a) => Multinom w a -> Seq (Double, a) 165 | probabilities dm = fmap (\a->(sampleProb dm a, a)) $ dmDomain dm -- FIXME 166 | 167 | -- | Probabilities sorted decreasingly 168 | decProbabilities :: (Real w, Ord a, Enum a, Num w) => Multinom w a -> Seq (Double, a) 169 | decProbabilities = SQ.sortBy (flip (compare `on` fst)) . probabilities 170 | 171 | prettyMultinom :: (Real w, Ord a, Enum a) => Int -> (a -> String) -> Multinom w a -> Doc 172 | prettyMultinom _ _ (Multinom {}) = error "TODO: prettyMultinom" 173 | prettyMultinom n showA dm@(DirMulti {}) = 174 | text "DirMulti" <+> parens (text "alpha=" <> prettyAlpha showA (dmAlpha dm)) 175 | $$ nest 5 (fsep $ punctuate comma 176 | $ map (\(p,a)->text (showA a) <> parens (text $ printf "%1.2e" p)) 177 | $ take n $ Data.Foldable.toList $ decProbabilities dm) 178 | 179 | -- | Update the prior of a Dirichlet/multinomial 180 | updatePrior :: (Alpha a -> Alpha a) -> Multinom w a -> Multinom w a 181 | updatePrior _ (Multinom {}) = error "TODO: updatePrior" 182 | updatePrior f dm = dm {dmAlpha=f $ dmAlpha dm} 183 | 184 | -- | Relative tolerance in precision for prior estimation 185 | estimationTol = 1e-8 186 | 187 | reestimatePriors :: (Foldable f, Functor f, Real w, Enum a) 188 | => f (Multinom w a) -> f (Multinom w a) 189 | reestimatePriors dms = 190 | let usableDms = filter (\dm->dmTotal dm > 5) $ toList dms 191 | alpha = case () of 192 | _ | length usableDms <= 3 -> id 193 | otherwise -> const $ estimatePrior estimationTol usableDms 194 | in fmap (updatePrior alpha) dms 195 | 196 | reestimateSymPriors :: (Foldable f, Functor f, Real w, Enum a) 197 | => f (Multinom w a) -> f (Multinom w a) 198 | reestimateSymPriors dms = 199 | let usableDms = filter (\dm->dmTotal dm > 5) $ toList dms 200 | alpha = case () of 201 | _ | length usableDms <= 3 -> id 202 | otherwise -> const $ symmetrizeAlpha $ estimatePrior estimationTol usableDms 203 | in fmap (updatePrior alpha) dms 204 | 205 | -- | Estimate the prior alpha from a set of Dirichlet/multinomials 206 | estimatePrior' :: (Real w, Enum a) => [Multinom w a] -> Alpha a -> Alpha a 207 | estimatePrior' dms alpha = 208 | let domain = toList $ dmDomain $ head dms 209 | f k = let num = sum $ map (\i->digamma (realToFrac (dmGetCounts i k) + alphaOf alpha k) 210 | - digamma (alphaOf alpha k) 211 | ) 212 | $ filter (\i->dmGetCounts i k > 0) dms 213 | total i = realToFrac $ sum $ map (\k->dmGetCounts i k) domain 214 | sumAlpha = sum $ map (alphaOf alpha) domain 215 | denom = sum $ map (\i->digamma (total i + sumAlpha) - digamma sumAlpha) dms 216 | in case () of 217 | _ | isNaN num -> error $ "BayesStack.DirMulti.estimatePrior': num = NaN: "++show (map (\i->(digamma (realToFrac (dmGetCounts i k) + alphaOf alpha k), digamma (alphaOf alpha k))) dms) 218 | _ | denom == 0 -> error "BayesStack.DirMulti.estimatePrior': denom=0" 219 | _ | isInfinite num -> error "BayesStack.DirMulti.estimatePrior': num is infinity " 220 | _ | isNaN (alphaOf alpha k * num / denom) -> error $ "NaN"++show (num, denom) 221 | otherwise -> alphaOf alpha k * num / denom 222 | in asymAlpha $ foldMap (\k->EM.singleton k (f k)) domain 223 | 224 | estimatePrior :: (Real w, Enum a) => Double -> [Multinom w a] -> Alpha a 225 | estimatePrior tol dms = iter $ dmAlpha $ head dms 226 | where iter alpha = let alpha' = estimatePrior' dms alpha 227 | (_, prec) = alphaToMeanPrecision alpha 228 | (_, prec') = alphaToMeanPrecision alpha' 229 | in if abs ((prec' - prec) / prec) > tol 230 | then iter alpha' 231 | else alpha' 232 | -------------------------------------------------------------------------------- /BayesStack/Dirichlet.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | 3 | module BayesStack.Dirichlet ( -- * Dirichlet parameter 4 | Alpha 5 | , symAlpha, asymAlpha 6 | , alphaDomain, alphaNormalizer, sumAlpha 7 | , DirMean, DirPrecision 8 | , alphaOf, setAlphaOf, setSymAlpha 9 | , alphaToMeanPrecision, meanPrecisionToAlpha 10 | , symmetrizeAlpha 11 | , prettyAlpha 12 | ) where 13 | 14 | import Data.Foldable (toList, Foldable, fold) 15 | 16 | import Data.EnumMap (EnumMap) 17 | import qualified Data.EnumMap as EM 18 | 19 | import Data.Sequence (Seq) 20 | import qualified Data.Sequence as SQ 21 | 22 | import Numeric.Log 23 | import Math.Gamma 24 | 25 | import Text.Printf 26 | import Text.PrettyPrint 27 | 28 | import Data.Binary 29 | import Data.Binary.EnumMap () 30 | import GHC.Generics (Generic) 31 | 32 | -- | Make error handling a bit easier 33 | checkNaN :: RealFloat a => String -> a -> a 34 | checkNaN loc x | isNaN x = error $ "BayesStack.Dirichlet."++loc++": Not a number" 35 | checkNaN loc x | isInfinite x = error $ "BayesStack.Dirichlet."++loc++": Infinity" 36 | checkNaN _ x = x 37 | 38 | -- | A Dirichlet prior 39 | data Alpha a = SymAlpha { aDomain :: Seq a 40 | , aAlpha :: !Double 41 | , aNorm :: Log Double 42 | } 43 | | Alpha { aAlphas :: EnumMap a Double 44 | , aSumAlphas :: !Double 45 | , aNorm :: Log Double 46 | } 47 | deriving (Show, Eq, Generic) 48 | instance (Enum a, Binary a) => Binary (Alpha a) 49 | 50 | type DirMean a = EnumMap a Double 51 | type DirPrecision = Double 52 | 53 | symAlpha :: Enum a => [a] -> Double -> Alpha a 54 | symAlpha domain _ | null domain = error "Dirichlet over null domain is undefined" 55 | symAlpha domain alpha = SymAlpha { aDomain = SQ.fromList domain 56 | , aAlpha = alpha 57 | , aNorm = alphaNorm $ symAlpha domain alpha 58 | } 59 | 60 | -- | Construct an asymmetric Alpha 61 | asymAlpha :: Enum a => EnumMap a Double -> Alpha a 62 | asymAlpha alphas | EM.null alphas = error "Dirichlet over null domain is undefined" 63 | asymAlpha alphas = Alpha { aAlphas = alphas 64 | , aSumAlphas = Prelude.sum $ EM.elems alphas 65 | , aNorm = alphaNorm $ asymAlpha alphas 66 | } 67 | 68 | setSymAlpha :: Enum a => Double -> Alpha a -> Alpha a 69 | setSymAlpha alpha a = let b = (symmetrizeAlpha a) { aAlpha = alpha 70 | , aNorm = alphaNorm b 71 | } 72 | in b 73 | 74 | -- | Compute the normalizer of the likelihood involving alphas, 75 | -- (product_k gamma(alpha_k)) / gamma(sum_k alpha_k) 76 | alphaNorm :: Enum a => Alpha a -> Log Double 77 | alphaNorm alpha = normNum / normDenom 78 | where dim = realToFrac $ SQ.length $ aDomain alpha 79 | normNum = case alpha of 80 | Alpha {} -> product $ map (\a->Exp $ checkNaN ("alphaNorm.normNum(asym) alpha="++show a) $ lnGamma a) 81 | $ EM.elems $ aAlphas alpha 82 | SymAlpha {} -> Exp $ checkNaN "alphaNorm.normNum(sym)" $ dim * lnGamma (aAlpha alpha) 83 | normDenom = Exp $ checkNaN "alphaNorm.normDenom" $ lnGamma $ sumAlpha alpha 84 | 85 | -- | 'alphaDomain a' is the domain of prior 'a' 86 | alphaDomain :: Enum a => Alpha a -> Seq a 87 | alphaDomain (SymAlpha {aDomain=d}) = d 88 | alphaDomain (Alpha {aAlphas=a}) = SQ.fromList $ EM.keys a 89 | {-# INLINE alphaDomain #-} 90 | 91 | alphaNormalizer :: Enum a => Alpha a -> Log Double 92 | alphaNormalizer = aNorm 93 | {-# INLINE alphaNormalizer #-} 94 | 95 | -- | 'alphaOf alpha k' is the value of element 'k' in prior 'alpha' 96 | alphaOf :: Enum a => Alpha a -> a -> Double 97 | alphaOf (SymAlpha {aAlpha=alpha}) = const alpha 98 | alphaOf (Alpha {aAlphas=alphas}) = (alphas EM.!) 99 | {-# INLINE alphaOf #-} 100 | 101 | -- | 'sumAlpha alpha' is the sum of all alphas 102 | sumAlpha :: Enum a => Alpha a -> Double 103 | sumAlpha (SymAlpha {aDomain=domain, aAlpha=alpha}) = realToFrac (SQ.length domain) * alpha 104 | sumAlpha (Alpha {aSumAlphas=sum}) = sum 105 | {-# INLINE sumAlpha #-} 106 | 107 | -- | Set a particular alpha element 108 | setAlphaOf :: Enum a => a -> Double -> Alpha a -> Alpha a 109 | setAlphaOf k a alpha@(SymAlpha {}) = setAlphaOf k a $ asymmetrizeAlpha alpha 110 | setAlphaOf k a (Alpha {aAlphas=alphas}) = asymAlpha $ EM.insert k a alphas 111 | {-# INLINE setAlphaOf #-} 112 | 113 | -- | 'alphaToMeanPrecision a' is the mean/precision representation of the prior 'a' 114 | alphaToMeanPrecision :: Enum a => Alpha a -> (DirMean a, DirPrecision) 115 | alphaToMeanPrecision (SymAlpha {aDomain=dom, aAlpha=alpha}) = 116 | let prec = realToFrac (SQ.length dom) * alpha 117 | in (EM.fromList $ map (\a->(a, alpha/prec)) $ toList dom, prec) 118 | alphaToMeanPrecision (Alpha {aAlphas=alphas, aSumAlphas=prec}) = 119 | (fmap (/prec) alphas, prec) 120 | {-# INLINE alphaToMeanPrecision #-} 121 | 122 | -- | 'meanPrecisionToAlpha m p' is a prior with mean 'm' and precision 'p' 123 | meanPrecisionToAlpha :: Enum a => DirMean a -> DirPrecision -> Alpha a 124 | meanPrecisionToAlpha mean prec = asymAlpha $ fmap (*prec) mean 125 | {-# INLINE meanPrecisionToAlpha #-} 126 | 127 | -- | Symmetrize a Dirichlet prior (such that mean=0) 128 | symmetrizeAlpha :: Enum a => Alpha a -> Alpha a 129 | symmetrizeAlpha alpha@(SymAlpha {}) = alpha 130 | symmetrizeAlpha alpha@(Alpha {}) = 131 | SymAlpha { aDomain = alphaDomain alpha 132 | , aAlpha = sumAlpha alpha / realToFrac (EM.size $ aAlphas alpha) 133 | , aNorm = alphaNorm $ symmetrizeAlpha alpha 134 | } 135 | 136 | -- | Turn a symmetric alpha into an asymmetric alpha. For internal use. 137 | asymmetrizeAlpha :: Enum a => Alpha a -> Alpha a 138 | asymmetrizeAlpha (SymAlpha {aDomain=domain, aAlpha=alpha}) = 139 | asymAlpha $ fold $ fmap (\k->EM.singleton k alpha) domain 140 | asymmetrizeAlpha alpha@(Alpha {}) = alpha 141 | 142 | -- | Pretty-print a Dirichlet prior 143 | prettyAlpha :: Enum a => (a -> String) -> Alpha a -> Doc 144 | prettyAlpha showA (SymAlpha {aAlpha=alpha}) = text "Symmetric" <+> double alpha 145 | prettyAlpha showA (Alpha {aAlphas=alphas}) = 146 | text "Assymmetric" 147 | <+> fsep (punctuate comma 148 | $ map (\(a,alpha)->text (showA a) <> parens (text $ printf "%1.2e" alpha)) 149 | $ take 100 $ EM.toList $ alphas) 150 | -------------------------------------------------------------------------------- /BayesStack/Gibbs.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, FlexibleInstances, FlexibleContexts, 2 | ExistentialQuantification, GADTs, CPP #-} 3 | 4 | module BayesStack.Gibbs ( UpdateUnit(..) 5 | , WrappedUpdateUnit(..) 6 | ) where 7 | 8 | import Control.DeepSeq 9 | import Data.Random 10 | 11 | class (Show (Setting uu), Show uu) => UpdateUnit uu where 12 | type ModelState uu 13 | type Setting uu 14 | fetchSetting :: uu -> ModelState uu -> Setting uu 15 | evolveSetting :: ModelState uu -> uu -> RVar (Setting uu) 16 | updateSetting :: uu -> Setting uu -> Setting uu -> ModelState uu -> ModelState uu 17 | 18 | data WrappedUpdateUnit ms = forall uu. (UpdateUnit uu, ModelState uu ~ ms, 19 | NFData (Setting uu), Eq (Setting uu)) 20 | => WrappedUU uu 21 | 22 | -------------------------------------------------------------------------------- /BayesStack/Gibbs/Concurrent.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, FlexibleInstances, FlexibleContexts, GADTs, CPP #-} 2 | 3 | module BayesStack.Gibbs.Concurrent ( gibbsUpdate 4 | , module BayesStack.Gibbs 5 | ) where 6 | 7 | import BayesStack.Gibbs 8 | import Control.Concurrent 9 | import Control.Concurrent.STM 10 | import Control.DeepSeq 11 | import Control.Monad (replicateM_, when, forever) 12 | import Control.Monad.State hiding (lift) 13 | import Data.IORef 14 | import Data.Random 15 | import Data.Random.Lift 16 | import Debug.Trace (traceEventIO) 17 | import GHC.Conc.Sync (labelThread) 18 | import System.Random.MWC (withSystemRandom) 19 | 20 | updateUnit :: WrappedUpdateUnit ms -> IORef ms -> TBQueue (ms -> ms) -> RVarT IO () 21 | updateUnit (WrappedUU unit) stateRef diffQueue = do 22 | modelState <- lift $ readIORef stateRef 23 | let s = fetchSetting unit modelState 24 | s' <- lift $ evolveSetting modelState unit 25 | (s,s') `deepseq` return () 26 | when (s /= s') $ 27 | lift $ atomically $ writeTBQueue diffQueue (updateSetting unit s s') 28 | 29 | updateWorker :: TQueue (WrappedUpdateUnit ms) -> IORef ms -> TBQueue (ms -> ms) -> RVarT IO () 30 | updateWorker unitsQueue stateRef diffQueue = do 31 | unit <- lift $ atomically $ tryReadTQueue unitsQueue 32 | case unit of 33 | Just unit' -> do updateUnit unit' stateRef diffQueue 34 | updateWorker unitsQueue stateRef diffQueue 35 | Nothing -> return () 36 | 37 | #if __GLASGOW_HASKELL__ < 706 38 | atomicModifyIORef' = atomicModifyIORef 39 | #endif 40 | 41 | diffWorker :: IORef ms -> TBQueue (ms -> ms) -> Int -> IO () 42 | diffWorker stateRef diffQueue updateBlock = forever $ do 43 | s <- readIORef stateRef 44 | s' <- execStateT (replicateM_ updateBlock $ do 45 | diff <- lift $ atomically $ readTBQueue diffQueue 46 | modify diff 47 | ) s 48 | atomicWriteIORef stateRef $! s' 49 | traceEventIO "diffWorker: State updated" 50 | 51 | labelMyThread :: String -> IO () 52 | labelMyThread label = myThreadId >>= \id->labelThread id label 53 | 54 | gibbsUpdate :: Int -> Int -> ms -> [WrappedUpdateUnit ms] -> IO ms 55 | gibbsUpdate nUpdateWorkers updateBlock modelState units = do 56 | unitsQueue <- atomically $ do q <- newTQueue 57 | mapM_ (writeTQueue q) units 58 | return q 59 | diffQueue <- atomically $ newTBQueue $ 2*updateBlock -- FIXME 60 | stateRef <- newIORef modelState 61 | diffThread <- forkIO $ do labelMyThread "diff worker" 62 | diffWorker stateRef diffQueue updateBlock 63 | 64 | runningWorkers <- atomically $ newTVar (0 :: Int) 65 | done <- atomically $ newEmptyTMVar :: IO (TMVar ()) 66 | replicateM_ nUpdateWorkers $ forkIO $ withSystemRandom $ \mwc->do 67 | labelMyThread "update worker" 68 | atomically $ modifyTVar' runningWorkers (+1) 69 | runRVarT (updateWorker unitsQueue stateRef diffQueue) mwc 70 | atomically $ do 71 | modifyTVar' runningWorkers (+(-1)) 72 | running <- readTVar runningWorkers 73 | when (running == 0) $ putTMVar done () 74 | 75 | atomically $ takeTMVar done 76 | readIORef stateRef -------------------------------------------------------------------------------- /BayesStack/Gibbs/Simple.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, FlexibleInstances, FlexibleContexts, GADTs, CPP #-} 2 | 3 | module BayesStack.Gibbs.Simple ( gibbsUpdate 4 | , module BayesStack.Gibbs 5 | ) where 6 | 7 | import BayesStack.Gibbs 8 | import Control.Monad.State hiding (lift) 9 | import Control.DeepSeq 10 | import Data.Random 11 | import Data.Random.Lift 12 | import System.Random.MWC 13 | 14 | updateUnit :: WrappedUpdateUnit ms -> StateT ms RVar () 15 | updateUnit (WrappedUU unit) = do 16 | ms <- get 17 | let s = fetchSetting unit ms 18 | s' <- lift $ evolveSetting ms unit 19 | (s,s') `deepseq` return () 20 | put $ updateSetting unit s s' ms 21 | 22 | gibbsUpdate :: ms -> [WrappedUpdateUnit ms] -> IO ms 23 | gibbsUpdate modelState units = withSystemRandom $ asGenIO $ \mwc-> 24 | runRVar (execStateT (mapM_ updateUnit units) modelState) mwc -------------------------------------------------------------------------------- /BayesStack/TupleEnum.hs: -------------------------------------------------------------------------------- 1 | module BayesStack.TupleEnum where 2 | 3 | -- This is probably a bad idea 4 | -- Perhaps some TH magic to automatically derive this would make it more tolerable 5 | instance (Enum a, Enum b) => Enum (a,b) where 6 | fromEnum (a,b) = 2^32 * fromEnum a + fromEnum b 7 | toEnum n = let (na, nb) = n `quotRem` (2^32) 8 | in (toEnum na, toEnum nb) 9 | -------------------------------------------------------------------------------- /BayesStack/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, KindSignatures, ConstraintKinds #-} 2 | 3 | module BayesStack.Types ( Probability 4 | , HasLikelihood(..) 5 | , FullConditionable(..) 6 | ) where 7 | 8 | import GHC.Prim (Constraint) 9 | import Numeric.Log 10 | 11 | type Probability = Log Double 12 | 13 | class HasLikelihood p where 14 | type LContext p a :: Constraint 15 | type LContext p a = () 16 | likelihood :: LContext p a => p a -> Probability 17 | 18 | -- | A distribution for which a full conditional factor can be produced 19 | class FullConditionable p where 20 | type FCContext p a :: Constraint 21 | type FCContext p a = () 22 | sampleProb :: FCContext p a => p a -> a -> Double 23 | -------------------------------------------------------------------------------- /BayesStack/UniqueKey.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GeneralizedNewtypeDeriving, CPP #-} 2 | 3 | module BayesStack.UniqueKey ( getUniqueKey 4 | , getValueMap, getKeyMap 5 | , mapTraversable 6 | , UniqueKey, UniqueKeyT 7 | , runUniqueKey, runUniqueKeyT 8 | , runUniqueKey', runUniqueKeyT' 9 | ) where 10 | 11 | import Prelude hiding (mapM) 12 | import Control.Applicative (Applicative, (<$>)) 13 | import Data.Traversable (Traversable, mapM) 14 | import Data.Tuple 15 | import Data.Functor.Identity 16 | 17 | import Control.Monad.Trans 18 | import Control.Monad.State.Strict hiding (mapM) 19 | 20 | #if __GLASGOW_HASKELL__ >= 706 21 | import Data.Map.Strict (Map) 22 | import qualified Data.Map.Strict as M 23 | #else 24 | import Data.Map (Map) 25 | import qualified Data.Map as M 26 | #endif 27 | 28 | -- | 'UniqueKey val key' is a monad for a calculation of a mapping unique keys 29 | -- 'key' onto values 'val' 30 | type UniqueKey val key = UniqueKeyT val key Identity 31 | newtype UniqueKeyT val key m a = UniqueKeyT (StateT ([key], Map val key) m a) 32 | deriving (Monad, Applicative, Functor, MonadTrans) 33 | 34 | -- | Get map of unique keys to values 35 | getKeyMap :: (Monad m, Applicative m, Ord key, Ord val) => UniqueKeyT val key m (Map key val) 36 | getKeyMap = M.fromList . map swap . M.toList <$> getValueMap 37 | 38 | -- | Get map of values to unique keys 39 | getValueMap :: (Monad m, Applicative m, Ord key, Ord val) => UniqueKeyT val key m (Map val key) 40 | getValueMap = snd <$> UniqueKeyT get 41 | 42 | popUniqueKey :: Monad m => UniqueKeyT val key m key 43 | popUniqueKey = do 44 | (keys, a) <- UniqueKeyT get 45 | case keys of 46 | key:rest -> UniqueKeyT (put $! (rest, a)) >> return key 47 | [] -> error "Ran out of unique keys" 48 | 49 | -- | Find the unique key for value 'val' or 'Nothing' if the value is unknown 50 | findUniqueKey :: (Monad m, Applicative m, Ord key, Ord val) => val -> UniqueKeyT val key m (Maybe key) 51 | findUniqueKey value = M.lookup value <$> getValueMap 52 | 53 | getUniqueKey :: (Monad m, Applicative m, Ord key, Ord val) => val -> UniqueKeyT val key m key 54 | getUniqueKey x = do 55 | key <- findUniqueKey x 56 | case key of 57 | Just k -> return k 58 | Nothing -> do k <- popUniqueKey 59 | UniqueKeyT $ modify $ \(keys, keyMap)->(keys, M.insert x k keyMap) 60 | return k 61 | 62 | runUniqueKey :: (Ord key) => [key] -> UniqueKey val key a -> a 63 | runUniqueKey keys = runIdentity . runUniqueKeyT keys 64 | 65 | runUniqueKeyT :: (Monad m, Ord key) => [key] -> UniqueKeyT val key m a -> m a 66 | runUniqueKeyT keys (UniqueKeyT a) = evalStateT a (keys, M.empty) 67 | 68 | -- | Run a `UniqueKeyT`, returning the result and the associated key map 69 | runUniqueKeyT' :: (Monad m, Applicative m, Ord key, Ord val) => [key] -> UniqueKeyT val key m a -> m (a, Map key val) 70 | runUniqueKeyT' keys action = 71 | runUniqueKeyT keys $ do result <- action 72 | keyMap <- getKeyMap 73 | return (result, keyMap) 74 | 75 | -- | Run a `UniqueKey`, returning the result and the associated key map 76 | runUniqueKey' :: (Ord key, Ord val) => [key] -> UniqueKey val key a -> (a, Map key val) 77 | runUniqueKey' keys action = 78 | runUniqueKey keys $ do result <- action 79 | keyMap <- getKeyMap 80 | return (result, keyMap) 81 | 82 | mapTraversable :: (Traversable t, Ord key, Ord val) => [key] -> t val -> (t key, Map key val) 83 | mapTraversable keys xs = runUniqueKey' keys $ mapM getUniqueKey xs 84 | -------------------------------------------------------------------------------- /Data/Binary/EnumMap.hs: -------------------------------------------------------------------------------- 1 | module Data.Binary.EnumMap where 2 | 3 | import Data.Binary 4 | import Data.EnumMap 5 | 6 | instance (Enum k, Binary k, Binary v) => Binary (EnumMap k v) where 7 | get = do a <- get 8 | return $ fromList a 9 | put = put . toList 10 | -------------------------------------------------------------------------------- /Data/Random/Sequence.hs: -------------------------------------------------------------------------------- 1 | module Data.Random.Sequence (randomElementT) where 2 | 3 | import Data.Random 4 | import Data.Sequence as SQ 5 | 6 | randomElementT :: Seq a -> RVarT m a 7 | randomElementT xs | SQ.null xs = error "randomElementT: empty seq!" 8 | randomElementT xs = do 9 | n <- uniformT 0 (SQ.length xs - 1) 10 | return (xs `index` n) 11 | -------------------------------------------------------------------------------- /Data/Sequence/Chunk.hs: -------------------------------------------------------------------------------- 1 | module Data.Sequence.Chunk (chunk) where 2 | 3 | import Data.Sequence as SQ 4 | 5 | -- | 'chunk n xs' splits 'xs' into 'n' chunks 6 | chunk :: Int -> Seq a -> Seq (Seq a) 7 | chunk n xs = let m = ceiling $ realToFrac (SQ.length xs) / realToFrac n 8 | f xs | SQ.null xs = SQ.empty 9 | f xs = SQ.take m xs <| (f $ SQ.drop m xs) 10 | in f xs 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c)2012, Ben Gamari, Laura Dietz 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Ben Gamari nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.mkd: -------------------------------------------------------------------------------- 1 | # bayes-stack: Parallel MCMC inference on graphical models 2 | 3 | `Bayes-stack` is a framework for parallel probabilistic inference on 4 | graphical models. The framework provides infrastructure for easily 5 | implementing MCMC/Gibbs sampling methods capable of scaling to dozens 6 | of cores. 7 | 8 | Along with the framework itself, several models using blocked Gibbs 9 | sampling are provided in `network-topic-models/`. See documentation: [blob/stable/doc/usage.markdown](http://github.com/bgamari/bayes-stack/blob/stable/doc/usage.markdown) 10 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /SharedTasteTest.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TupleSections #-} 2 | 3 | import BayesStack.Core 4 | import BayesStack.Models.Topic.SharedTaste 5 | 6 | import qualified Data.Map as M 7 | import qualified Data.Sequence as SQ 8 | import qualified Data.Set as S 9 | import qualified Data.EnumMap as EM 10 | import Data.Foldable (forM_) 11 | import Data.List 12 | import Data.Function 13 | import Data.Tuple (swap) 14 | import Text.Printf 15 | import Text.PrettyPrint 16 | import Data.Random 17 | import System.Random.MWC 18 | import Control.Monad.IO.Class 19 | import Data.Number.LogFloat hiding (realToFrac) 20 | 21 | -- | Shared taste/LDA interpolation parameter 22 | alphaGammaShared = 0.9 23 | 24 | -- | Number of topics to train 25 | topics = S.fromList $ [Topic i | i <- [1..3]] 26 | 27 | -- | Number of Gibbs sweeps 28 | nIter = 1000 29 | 30 | vocabulary :: EM.EnumMap Item String 31 | vocabulary = EM.fromList $ zip (map Item [1..]) 32 | $ [ "cats" 33 | , "crafts" 34 | , "birds" 35 | , "dogs" 36 | , "christmas" 37 | , "Scotland" 38 | , "flowers" 39 | , "survival" 40 | , "dragons" 41 | , "vampires" 42 | , "angels" 43 | , "animals" 44 | , "autism" 45 | , "thriller" 46 | , "historical" 47 | , "horror" 48 | , "non-fiction" 49 | , "fantasy" 50 | , "mystery" 51 | ] 52 | 53 | revVocabulary :: M.Map String Item 54 | revVocabulary = M.fromList $ map swap $ EM.assocs vocabulary 55 | 56 | -- | Input data to the model 57 | stdata = STData { -- Content-enriched network 58 | stNodes = S.fromList $ map Node [1..5] 59 | , stFriendships = S.fromList $ map Friendship $ 60 | [ (Node 1, Node 2) 61 | , (Node 2, Node 4) 62 | , (Node 2, Node 3) 63 | , (Node 4, Node 5) 64 | , (Node 5, Node 1) 65 | ] 66 | , stItems = S.fromList $ EM.keys vocabulary 67 | , stNodeItems = setupNodeItems 68 | $ concatMap (\(n,items)->map (\i->(n, revVocabulary M.! i)) items) 69 | $ [ (Node 1, [ "cats", "crafts", "birds" 70 | , "christmas", "flowers" 71 | ] 72 | ) 73 | , (Node 2, [ "cats", "birds", "animals" 74 | , "angels", "dragons", "horror" 75 | , "vampires" 76 | ] 77 | ) 78 | , (Node 3, [ "autism", "survival", "dragons" 79 | , "vampires", "angels", "animals" 80 | , "survival" 81 | ] 82 | ) 83 | , (Node 4, [ "thriller", "historical", "horror" 84 | , "non-fiction", "vampires", "fantasy" 85 | ] 86 | ) 87 | , (Node 5, [ "vampires", "dragons", "angels" 88 | , "horror", "fantasy", "thriller" 89 | ] 90 | ) 91 | ] 92 | 93 | -- Hyper-parameters and such 94 | , stAlphaGammaShared = alphaGammaShared 95 | , stAlphaGammaOwn = 1 - alphaGammaShared 96 | , stAlphaPsi = 0.1 97 | , stAlphaLambda = 0.1 98 | , stAlphaPhi = 0.1 99 | , stAlphaOmega = 0.1 100 | , stTopics = topics 101 | } 102 | 103 | main :: IO () 104 | main = do 105 | state <- withSystemRandom $ runModel run 106 | let maybeInc (Just n) = Just $ n+1 107 | maybeInc Nothing = Just 1 108 | wordCounts = foldl' (\a (n,x)->EM.alter maybeInc x a) EM.empty 109 | $ EM.elems $ stNodeItems $ msData state 110 | totalCounts = EM.size $ stNodeItems $ msData state 111 | 112 | liftIO $ putStr "\nTopics:\n" 113 | forM_ topics $ \t -> do 114 | let phi = msPhis state EM.! t 115 | probs = map (sampleProb phi) $ S.toList $ stItems $ msData state 116 | liftIO $ print $ text (show t) <+> colon 117 | <+> hsep ( punctuate comma 118 | $ map (\(x,p)->text (vocabulary EM.! x) <> parens (text $ printf "%1.2e" p)) 119 | $ take 10 $ sortBy (flip (compare `on` snd)) 120 | $ zip (S.toList $ stItems $ msData state) probs 121 | ) 122 | 123 | liftIO $ putStr "\nFriendship weights:\n" 124 | forM_ (stFriendships stdata) $ \(Friendship (a,b)) -> do 125 | let psi = msPsis state EM.! a 126 | liftIO $ putStr $ printf "%s\t%s\t\t%e\t%e\n" 127 | (show a) (show b) 128 | (friendInfluence state a b) 129 | (sampleProb psi b) 130 | 131 | liftIO $ putStr "\nShared topic mixtures:\n" 132 | forM_ (stFriendships stdata) $ \fs@(Friendship (a,b)) -> do 133 | let lambda = msLambdas state EM.! fs 134 | liftIO $ putStr $ printf "%s\t%s\t\t" (show a) (show b) 135 | liftIO $ putStr $ intercalate "\t" 136 | $ map (\t->printf "%s(%e)\t" (show t) (sampleProb lambda t)) 137 | $ S.toList topics 138 | liftIO $ putStr "\n" 139 | 140 | liftIO $ putStr "\nOwn topic mixtures:\n" 141 | forM_ (stNodes stdata) $ \n -> do 142 | let omega = msOmegas state EM.! n 143 | liftIO $ putStr $ printf "%s\t\t" (show n) 144 | liftIO $ putStr $ intercalate "\t" 145 | $ map (\t->printf "%s(%e)\t" (show t) (sampleProb omega t)) 146 | $ S.toList topics 147 | liftIO $ putStr "\n" 148 | 149 | run :: ModelMonad STModelState 150 | run = do 151 | initial <- liftRVar $ randomInitialize stdata 152 | (ius, model) <- model stdata initial 153 | 154 | state <- getModelState model 155 | liftIO $ putStrLn $ printf "Model log likelihood after initialization: %e" 156 | (logFromLogFloat $ modelLikelihood state :: Double) 157 | 158 | liftIO $ putStrLn $ printf "Created %d update units" (SQ.length ius) 159 | liftIO $ putStrLn "Starting inference..." 160 | forM_ [1..nIter::Int] $ \sweepN -> do 161 | concurrentFullGibbsUpdate 10 ius 162 | 163 | state <- getModelState model 164 | liftIO $ putStrLn $ printf "Model log likelihood after %d iterations: %e" 165 | nIter (logFromLogFloat $ modelLikelihood state :: Double) 166 | return state 167 | -------------------------------------------------------------------------------- /bayes-stack.cabal: -------------------------------------------------------------------------------- 1 | Name: bayes-stack 2 | 3 | Version: 0.2.0.1 4 | Synopsis: Framework for inferring generative probabilistic models 5 | with Gibbs sampling 6 | Description: bayes-stack is a framework for inference on generative 7 | probabilistic models. The framework uses Gibbs sampling, 8 | although is suitable for other iterative update methods. 9 | homepage: https://github.com/bgamari/bayes-stack 10 | License: BSD3 11 | License-file: LICENSE 12 | Author: Ben Gamari 13 | Maintainer: bgamari.foss@gmail.com 14 | copyright: Copyright (c) 2012 Ben Gamari 15 | Category: Math 16 | 17 | Build-type: Simple 18 | Cabal-version: >=1.6 19 | 20 | source-repository head 21 | type: git 22 | location: https://github.com/bgamari/bayes-stack.git 23 | 24 | Library 25 | Exposed-modules: BayesStack.Types, 26 | BayesStack.Gibbs, BayesStack.Gibbs.Concurrent, 27 | BayesStack.Gibbs.Simple, 28 | BayesStack.DirMulti, BayesStack.Dirichlet, 29 | BayesStack.UniqueKey, 30 | BayesStack.TupleEnum, 31 | Data.Binary.EnumMap, 32 | Data.Random.Sequence, Data.Sequence.Chunk 33 | 34 | Build-depends: base >=4 && <5, 35 | stm, 36 | transformers, 37 | mtl, 38 | deepseq, 39 | random-source, 40 | random-fu, 41 | rvar, 42 | containers, 43 | enummapset, 44 | ghc-prim, 45 | vector, 46 | mwc-random, 47 | pretty, 48 | binary >= 0.7 && < 0.8, 49 | log-domain >= 0.6, 50 | digamma, 51 | gamma, 52 | statistics 53 | -------------------------------------------------------------------------------- /doc/installation.markdown: -------------------------------------------------------------------------------- 1 | 2 | Bayes-Stack and Network Topic Models 3 | ===================================== 4 | 5 | Installing Haskell 7.6.1 6 | ------------------------- 7 | 8 | 9 | The model implementation is in `BayesStack/Models/Topic/SharedTaste.hs` 10 | 11 | For an example on how to use is, see `SharedTasteTest.hs` 12 | 13 | 1. Install Haskell Platform 7.4.1 from [http://hackage.haskell.org/platform/](http://hackage.haskell.org/platform/) 14 | 2. Ensure that package libgmp3c2 is available, for instance with `sudo apt-get install libgmp3c2` 15 | 2. Install Haskell GHC 7.6.1 from [http://www.haskell.org/ghc/download_ghc_7_6_1](http://www.haskell.org/ghc/download_ghc_7_6_1) 16 | - Ignore the large "STOP" note on the website. Simply installing Haskell Platform is not sufficient since Bayes-Stack relies on library features that are currently not available in the GHC version provided by Haskell Platform. Scroll down to "Binary Packages" and download the tarball for your platform. 17 | - Unzip tarball and cd into the directory 18 | - If you have root access, install GHC with `./configure; sudo make install` 19 | - If you do not have root access, you can install GHC to directory of your choice (let's call it ``) with `./configure --prefix=; make install` then add `/bin` to your `$PATH` 20 | - check that you have the right version with `ghc -V` respond with version 7.6.1 - *not* 7.4.1! 21 | - Update the Haskell package list with `cabal update` 22 | 23 | 24 | 25 | Installing Bayes-Stack 26 | ---------------------------- 27 | 28 | - Clone from github by calling `git clone git://github.com/bgamari/bayes-stack.git` 29 | - `cd` into the bayes-stack source directory 30 | - Call `cabal install ./ network-topic-models/` 31 | - It should finish with "Installing executable(s) in ~/.cabal/bin" 32 | - ensure that `~/.cabal/bin/` is permanently your `$PATH` variable 33 | 34 | 35 | Testing the Installation 36 | ------------------------ 37 | 38 | - Run `bayes-stack-lda` 39 | - If you see the usage information, you are all set. 40 | 41 | 42 | Continue to read about [how to use bayes-stack topic models..](usage.html) 43 | 44 | 45 | Bug Reports 46 | ------------ 47 | If you come across any issues/bugs/trouble, please submit a bug report to our issue tracker: 48 | [https://github.com/bgamari/bayes-stack/issues/new](https://github.com/bgamari/bayes-stack/issues/new) 49 | 50 | -------------------------------------------------------------------------------- /doc/usage.markdown: -------------------------------------------------------------------------------- 1 | Using Bayes-Stack Network Topic Models 2 | ======================================= 3 | 4 | Bayes-stack comes with implementations of a various topic models for social network analysis: 5 | - Shared Taste Model, to analyze topics shared by friends. [Dietz2012] 6 | - Citation Influence Model, to analyze topics for which a paper is cited. [Dietz2007] 7 | - Latent Dirichlet allocation, a baseline topic model that only incorporates text, but ignores the network structure. [Blei2003] 8 | 9 | [Dietz2012]: http://people.cs.umass.edu/~dietz/delayer/dietz-cameraready.pdf "Laura Dietz, Ben Gamari, John Guiver, Edward Snelson, Ralf Herbrich. De-Layering Social Networks by Shared Tastes of Friendships. ICWSM 2012." 10 | [Dietz2007]: http://www.machinelearning.org/proceedings/icml2007/papers/257.pdf "Laura Dietz, Steffen Bickel, Tobias Scheffer. "Unsupervised Prediction of Citation Influences. ICML 2007." 11 | [Blei2003]: http://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf "David Blei, Andrew Ng, Michael Jordan. Latent Dirichlet Allocation. JMLR 2003." 12 | 13 | Probabistic topic models and its seminal basic variant latent Dirichlet allocation [Blei2003] are unsupervised methods for clustering words into so-called topics, taking their context into account. The original work was motivated from a perspective of documents; here we assume that each node as a set of words associated. We therefore use the term `node` instead of `document`. Striving towards applications on tags, songs, books, we also use the `item` instead of `word`. 14 | 15 | 16 | ### Gibbs sampling in Bayes-Stack 17 | 18 | All models are implemented using the bayes-stack framework, which makes it very easy to implement multi-threaded blocked collapsed Gibbs samplers for latent variable and parameter estimation. Gibbs sampling starts with a random initialization of unknown variables, which are iteratively updated in sweeps over all variables for a number of iterations. Every `lag` iterations, bayes-stack will dump the model likelihood to `sweeps/likelihood.log` - a diagnostic measure which should increase on average. Bayes-stack will dump a state of latent variables to the `sweeps` directory whenever a new "best" likelihood is achieved. As early iterations are dominated by the random initialization, the first `burnin` iterations won't lead to a dump. Bayes-stack provides support for updating sets of variables from their joint distribution (aka blocked Gibbs sampler) via the concept of an update unit. 19 | 20 | ### Parallel inference in Bayes-Stack 21 | 22 | Bayes-stack aims at multi-core environments, parallelizing the inference across multiple threads. In bayes-stack each worker thread will repeatedly pick an update unit, fetch the current model state, and compute a new setting for variables in the update unit. It then prepares instructions for updating the model state with this new setting (called a `diff`). One global diff-worker will apply the diffs in batches of size `diff-batch`. Notice that the model state may have advanced in the mean time, but only for a fraction of an iteration. We leverage that Gibbs samplers are robust towards mildy out-of-date state. Bayes-stack ensures the consistency of count statistics. 23 | 24 | Bayes-stack is different to other parallel topic model frameworks in that *it does not* update disjoint sets of variables in isolation for several iterations. It further supports integrating out parameters with conjugative priors (collapsing). Bayes-stack is applicable to any generative model (not only LDA), especially if strong interdependencies between variables and plates exist. Bayes-stack supports arbitrary nesting of plates and conditional draws (aka gates). 25 | 26 | 27 | [gates]:http://research.microsoft.com/apps/pubs/default.aspx?id=78857 "Gates" 28 | 29 | ### Hyper-parameter optimization 30 | 31 | Bayes-stack supports optimizing symmetric Dirichlet hyper-parameters (`alpha` in LDA) using Tom Minka's fixed point method. If enabled, an optimization phase is inserted every `hyper-lag` iterations (after `hyper-burnin` iterations). Diagnostic information about new settins of hyper-parameters as well as model likelihood before and after the change are written to `sweeps/hyperparams.log`. It is highly recommended to inspect this diagnostic information before proceeding. 32 | 33 | 34 | ### Model inference and analysis 35 | 36 | For each bayes-stack model, two binaries are provided: 37 | 38 | bayes-stack-<model> 39 | : program to run the Gibbs sampler and dump settings to the sweeps directory 40 | 41 | bayes-stack-dump-<model> <parameter> 42 | : program to run after the Gibbs sampler finished to analyse the sweeps directory and compute point-estimates for parameters (e.g. `thetas` and `phis`) or posterior inferences (e.g. `influences`) and output as csv files. 43 | 44 | 45 | Network Topic models 46 | -------------------- 47 | 48 | ### Latent Dirichlet allocation 49 | 50 | The intuition of topic model is that two tokens are likely about the same `topic` if they represent the same word, and/or are in the same node. The model does not consider edge structure among the nodes. 51 | 52 | Nomenclature: 53 | 54 | *thetas* 55 | : for all nodes, the mixture of topics (e.g. node 1 is one third about topic 1, two thirds about topic 5) 56 | 57 | *phis* 58 | : for all topics, the mixture of items. (e.g. topic 1 has item "soccer" with 0.1 and item "ball" with 0.05) 59 | 60 | Each of the mixtures are represented by a multinomial distribution with a symmetric Dirichlet prior. 61 | 62 | To run an LDA topic model with 10 topics, priors for theta and phi of 0.1, using 5 parallel threads call, 63 | 64 | bayes-stack-lda --nodes FILE -t10 --prior-theta=0.1 --prior-phi=0.1 --sweeps=ldasweeps --threads=5 65 | 66 | The nodes file must be in the format of, 67 | 68 | `node id` \t all items (e.g. words) on one line \n 69 | 70 | Words (white-space separated) listed in the stopwords file (if given) are ignored from the nodes file. 71 | 72 | 73 | After the Gibbs sampler has finished, inspect `likelihood.log` file (in the sweeps directory) to confirm that the model likelihood converged. If hyperparameter estimation is enabled, also inspect `hyperparams.log` to ensure that parameters are in a reasonable range. 74 | 75 | To output the top 20 items for each topic call 76 | 77 | bayes-stack-dump-lda phis -n20 --sweeps ldasweeps 78 | 79 | To output the topic mixtures for each node/document call 80 | 81 | bayes-stack-dump-lda thetas --sweeps ldasweeps 82 | 83 | The output format for `N` multinomial parameters over support `S1`, `S2`, ... is 84 | 85 | 86 | 87 | 88 | 89 | 90 |
N  
 S1probability
 S2probability
 ... 
91 | 92 | ### Shared Taste Model 93 | 94 | The shared taste model is a probabilistic network topic model to understand shared topics that underlie a friendship. The intuition is that shared taste is indicated when two friends are using the same items. If the friends use different items that also have been mutually used in other friendships, it is also likely that they represent the shared taste. This is modeled by introducing topic mixtures of friendships (i.e. edges, not nodes!). Each node associates their items with one of their friends, then draws a topic from the shared topic mixture of that friendship to generate the item. In order to be robust against nodes with individual (that is, non-shared) interests, an item can also be associated with its node's own topic mixture. 95 | 96 | Nomenclature: 97 | 98 | *lamdas* 99 | : for all edges, the mixture of topics (equivalent to theta in LDA, but shared by two nodes). 100 | 101 | *phis* 102 | : for all topics, mixture of items (as in LDA). 103 | 104 | *psis* 105 | : one global mixture over all edges (for convenience, it is projected onto the set of friends for each user). 106 | 107 | *omegas* 108 | : for each node, its own mixture of topics. 109 | 110 | *gammas* 111 | : for each node, a Bernoulli distribution sharing versus own topics. The current version does not support estimating gammas from data. The parameter is fixed and set to the mean of its given Beta prior. 112 | 113 | 114 | To run the shared taste model with 10 topics using 5 parallel threads call, 115 | 116 | bayes-stack-st --edges FILE --nodes FILE -t10 --sweeps=stsweeps --threads=5 117 | 118 | To set Dirichlet priors for all mixture distribution use command line arguments such as `--prior-lambda=0.1`. 119 | 120 | 121 | Two kinds of input data are required. The nodes file has to be in the format, 122 | 123 | `node id` \t all items (e.g. words) on one line \n 124 | 125 | If a stopwords file is given, those items are ignored from the input. 126 | 127 | The edges file has to list each edge as the two nodes it connects. All edges are undirected, so giving one direction is sufficient, duplicates are ignored. Follow the format: 128 | 129 | `node id` \t `node id` \n 130 | 131 | 132 | After the Gibbs sampler finished, inspect `likelihood.log` file (in the sweeps directory) to confirm that the model likelihood converged. If hyperparameter estimation is enabled, also inspect `hyperparams.log` to ensure that parameters are in a reasonable range. 133 | 134 | To output any of the multinomial or Bernoulli parameters listed above (say lambdas) call, 135 | 136 | bayes-stack-dump-st lambdas --sweeps stsweeps 137 | 138 | For multinomial distributions with many dimentions (such as phis) it is advisable to restrict the range, e.g. "-n20". 139 | 140 | To identify the influencial friends for a particular user call, 141 | 142 | bayes-stack-dump-st influences --sweeps stsweeps 143 | 144 | Notice that `influences` is different from `psis`: If two or more friends share the same taste user, they will have a high `influence`. But in `psis` those friends have to compete for items. We recommend to use `influences` for any social network analysis. 145 | 146 | 147 | 148 | ### Citation influence model 149 | 150 | The citation influence model is designed to analyze for which topics a document was cited and the strength of its influence on citing papers. Notice that even seminal papers may be cited by papers on which they only have a marginal influence. We treat each document as a node, and each citation as an arc from the citing to the cited paper. 151 | 152 | The citation influence model captures the topics of a node emphasizing for this topics it is cited (followed, subscribed to, ...). It therefore models shared topics of a cited node and all its citing nodes. (This in unlike the shared taste model, which models sharing across one edge only.) 153 | 154 | The graph is converted into a bipartite graph of casting each node as a cited node and a separate citing node. Items in citing nodes are explained by topic mixtures of its citations, together with its own topic mixture which captures innovation. 155 | 156 | Those local influences on a citing node are modeled by mixture over citations `psi`. Each cited node has a mixture over topics `lambda`. Each item in a citing node are associated with one of its citations, and a topic drawn from that cited documents's `lambda`. Further, each item in a cited node are drawn from its `lambda`. The consequence is that a cited node's topic mixture `lambda` is a shared topic mixture, estimated not only from the node's items, but also some items in citing nodes. This is a crucial distinction to LDA. `psi` and `lambda` influence each other: The more likely items in a citing document fit to a topic mixture, the higher its probability under `psi`; The more items in citing documents are associated with the cited document, the more `lamba` will be representing it. 157 | 158 | 159 | Each node in the citation graph will be represented once as a cited document, and once as a citing document. Topics in both duplicates are synchronized via joint influence of `phi`. As some research papers include more novel ideas than others, each citing document also has a topic mixture `omega` for own topics. The propensity to re-use topics from citations versus the introduction of new topics is captured in the Bernoulli parameter `gamma`. 160 | 161 | Nomenclature: 162 | 163 | *lambdas* 164 | : for each cited node, the mixture of topics (equivalent to theta in LDA, but modeling shared across the cited node and its citations). 165 | 166 | *phis* 167 | : for each topic, the mixture of items (as in LDA). 168 | 169 | *psis* 170 | : for each citing node, the mixture over its cited nodes. 171 | 172 | *omegas* 173 | : for each citing node, the own mixture of topics. 174 | 175 | *gammas* 176 | : for each citing node, a Bernoulli distribution for sharing versus own topics. The current version does not support estimating gammas from data. The parameter is fixed and set to the mean of its given Beta prior. 177 | 178 | 179 | To run the shared taste model with 10 topics using 5 parallel threads call, 180 | 181 | bayes-stack-ci --arcs FILE --nodes FILE -t10 --sweeps=cisweeps --threads=5 182 | 183 | To set Dirichlet priors for all mixture distribution use command line arguments such as `--prior-lambda=0.1`. 184 | 185 | Two kinds of input data are required. The nodes file has to be in the format, 186 | 187 | `node id` \t all items (e.g. words) on one line \n 188 | 189 | This id is used both as a cited node id as well as a citing node id. If a stopwords file is given, those items are ignored from the input. 190 | 191 | The arcs file has to list each arc as the source node and sink node. All arcs are directed. You *can* add cycles. Follow the format, 192 | 193 | `citing node id` \t `cited node id` \n 194 | 195 | 196 | After the Gibbs sampler finished, inspect `likelihood.log` file (in the sweeps directory) to confirm that the model likelihood converged. If hyperparameter estimation is turned on, also inspect `hyperparams.log` to ensure that parameters are in a reasonable range. 197 | 198 | To output any of the multinomial or Bernoulli parameters listed above (say lambdas) call, 199 | 200 | bayes-stack-dump-ci lambdas --sweeps cisweeps 201 | 202 | For multinomial distributions with many dimentions (such as `phis`) it is advisable to restrict the range, e.g. "-n20". 203 | 204 | To identify the influencial cited nodes for a particular citing node call, 205 | 206 | bayes-stack-dump-ci influences --sweeps cisweeps 207 | 208 | Notice that `influences` is different from `psis`. Any citation that shares a frequent topic with the node will have a high `influence`. But for generating items, those citations would have to compete with each other, which is reflected in `psis`. We recommend to use `influences` for any social network analysis. 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /network-topic-models/BayesStack/Models/Topic/CitationInfluenceNoTopics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, GeneralizedNewtypeDeriving, DeriveGeneric, TupleSections, RecordWildCards, TemplateHaskell, RankNTypes, FlexibleContexts #-} 2 | 3 | module BayesStack.Models.Topic.CitationInfluenceNoTopics 4 | ( -- * Primitives 5 | NetData 6 | , dHypers, dArcs, dItems, dNodeItems, dCitingNodes, dCitedNodes 7 | , netData 8 | , HyperParams(..) 9 | , MState 10 | , stGammas, stOmegas, stPsis, stCiting, stLambdas 11 | , CitingUpdateUnit 12 | , ItemSource(..) 13 | , CitedNode(..), CitedNodeItem(..) 14 | , CitingNode(..), CitingNodeItem(..) 15 | , Citing(..), Cited(..) 16 | , Item(..), Topic(..), NodeItem(..), Node(..), Arc(..) 17 | , setupNodeItems 18 | -- * Initialization 19 | , verifyNetData, cleanNetData 20 | , ModelInit 21 | , randomInitialize 22 | , model 23 | , updateUnits 24 | -- * Diagnostics 25 | , modelLikelihood 26 | ) where 27 | 28 | import qualified Data.Vector as V 29 | import Statistics.Sample (mean) 30 | 31 | import Prelude hiding (mapM_, sum) 32 | import Data.Maybe (fromMaybe) 33 | 34 | import Control.Lens hiding (Setting) 35 | import Data.Set (Set) 36 | import qualified Data.Set as S 37 | 38 | import Data.Map.Strict (Map) 39 | import qualified Data.Map.Strict as M 40 | 41 | import Data.Foldable hiding (product) 42 | import Control.Applicative ((<$>), (<*>)) 43 | import Control.Monad (when) 44 | import Control.Monad.Trans.State.Strict 45 | import Control.Monad.Trans.Writer.Strict 46 | 47 | import Data.Random 48 | import Data.Random.Lift (lift) 49 | import Data.Random.Distribution.Categorical (categorical) 50 | import Numeric.Log hiding (sum) 51 | 52 | import BayesStack.Types 53 | import BayesStack.Gibbs 54 | import BayesStack.DirMulti 55 | import BayesStack.TupleEnum () 56 | import BayesStack.Models.Topic.Types 57 | 58 | import GHC.Generics (Generic) 59 | import Data.Binary (Binary) 60 | import Control.DeepSeq 61 | 62 | at' :: At m => Index m -> IndexedLens' (Index m) m (IxValue m) 63 | at' i = at i . _fromMaybe 64 | where _fromMaybe = iso (fromMaybe $ error "at': Unexpected Nothing") Just 65 | 66 | data ItemSource = Shared | Own deriving (Show, Eq, Enum, Ord, Generic) 67 | instance Binary ItemSource 68 | instance NFData ItemSource 69 | 70 | newtype Citing a = Citing a deriving (Show, Eq, Enum, Ord, Generic, NFData) 71 | newtype Cited a = Cited a deriving (Show, Eq, Enum, Ord, Generic, NFData) 72 | instance Binary a => Binary (Citing a) 73 | instance Binary a => Binary (Cited a) 74 | 75 | type CitingNode = Citing Node 76 | type CitedNode = Cited Node 77 | type CitingNodeItem = Citing NodeItem 78 | type CitedNodeItem = Cited NodeItem 79 | 80 | -- ^ A directed edge 81 | data Arc = Arc { citingNode :: !CitingNode, citedNode :: !CitedNode } 82 | deriving (Show, Eq, Ord, Generic) 83 | instance Binary Arc 84 | 85 | data HyperParams = HyperParams 86 | { _alphaPsi :: Double 87 | , _alphaLambda :: Double 88 | , _alphaOmega :: Double 89 | , _alphaGammaShared :: Double 90 | , _alphaGammaOwn :: Double 91 | , _alphaBetaFG :: Double 92 | , _alphaBetaBG :: Double 93 | } 94 | deriving (Show, Eq, Generic) 95 | instance Binary HyperParams 96 | makeLenses ''HyperParams 97 | 98 | data NetData = NetData { _dHypers :: !(HyperParams) 99 | , _dArcs :: !(Set Arc) 100 | , _dItems :: !(Map Item Double) 101 | , _dNodeItems :: !(Map NodeItem (Node, Item)) 102 | , _dCitingNodes :: !(Map CitingNode (Set CitedNode)) 103 | -- ^ Maps each citing node to the set of nodes cited by it 104 | , _dCitedNodes :: !(Map CitedNode (Set CitingNode)) 105 | -- ^ Maps each cited node to the set of nodes citing it 106 | } 107 | deriving (Show, Eq, Generic) 108 | instance Binary NetData 109 | makeLenses ''NetData 110 | 111 | netData :: HyperParams -> Set Arc -> Map Item Double -> Map NodeItem (Node,Item) -> NetData 112 | netData hypers arcs items nodeItems = 113 | NetData { _dHypers = hypers 114 | , _dArcs = arcs 115 | , _dItems = items 116 | , _dNodeItems = nodeItems 117 | , _dCitingNodes = M.unionsWith S.union 118 | $ map (\(Arc a b)->M.singleton a $ S.singleton b) 119 | $ S.toList arcs 120 | , _dCitedNodes = M.unionsWith S.union 121 | $ map (\(Arc a b)->M.singleton b $ S.singleton a) 122 | $ S.toList arcs 123 | } 124 | 125 | dCitingNodeItems :: NetData -> Map CitingNodeItem (CitingNode, Item) 126 | dCitingNodeItems nd = 127 | M.mapKeys Citing 128 | $ M.map (\(n,i)->(Citing n, i)) 129 | $ M.filter (\(n,i)->Citing n `M.member` (nd^.dCitingNodes)) 130 | $ nd^.dNodeItems 131 | 132 | itemsOfCitingNode :: NetData -> CitingNode -> [Item] 133 | itemsOfCitingNode d (Citing u) = 134 | map snd $ M.elems $ M.filter (\(n,_)->n==u) $ d^.dNodeItems 135 | 136 | connectedNodes :: Set Arc -> Set Node 137 | connectedNodes arcs = 138 | S.map ((\(Cited n)->n) . citedNode) arcs `S.union` S.map ((\(Citing n)->n) . citingNode) arcs 139 | 140 | cleanNetData :: NetData -> NetData 141 | cleanNetData d = 142 | let nodesWithItems = S.fromList $ map fst $ M.elems $ d^.dNodeItems 143 | nodesWithArcs = connectedNodes $ d^.dArcs 144 | keptNodes = nodesWithItems `S.intersection` nodesWithArcs 145 | keepArc (Arc (Citing citing) (Cited cited)) = 146 | citing `S.member` keptNodes && cited `S.member` keptNodes 147 | go = do dArcs %= S.filter keepArc 148 | dNodeItems %= M.filter (\(n,i)->n `S.member` keptNodes) 149 | in execState go d 150 | 151 | verifyNetData :: (Node -> String) -> NetData -> [String] 152 | verifyNetData showNode d = execWriter $ do 153 | let nodesWithItems = S.fromList $ map fst $ M.elems $ d^.dNodeItems 154 | forM_ (d^.dArcs) $ \(Arc (Citing citing) (Cited cited))->do 155 | when (cited `S.notMember` nodesWithItems) 156 | $ tell [showNode cited++" has arc yet has no items"] 157 | when (citing `S.notMember` nodesWithItems) 158 | $ tell [showNode citing++" has arc yet has no items"] 159 | 160 | -- Citing Update unit (Shared Taste-like) 161 | data CitingUpdateUnit = CitingUpdateUnit { _uuNI :: CitingNodeItem 162 | , _uuN :: CitingNode 163 | , _uuX :: Item 164 | , _uuCites :: Set CitedNode 165 | , _uuItemWeight :: Double 166 | } 167 | deriving (Show, Generic) 168 | instance Binary CitingUpdateUnit 169 | makeLenses ''CitingUpdateUnit 170 | 171 | citingUpdateUnits :: NetData -> [CitingUpdateUnit] 172 | citingUpdateUnits d = 173 | map (\(ni,(n,x))->CitingUpdateUnit { _uuNI = ni 174 | , _uuN = n 175 | , _uuX = x 176 | , _uuCites = d^.dCitingNodes . at' n 177 | , _uuItemWeight = (d ^. dItems . at' x) 178 | } 179 | ) $ M.assocs $ dCitingNodeItems d 180 | 181 | updateUnits :: NetData -> [WrappedUpdateUnit MState] 182 | updateUnits d = map WrappedUU (citingUpdateUnits d) 183 | 184 | -- | Model State 185 | data CitingSetting = OwnSetting 186 | | SharedSetting !CitedNode 187 | deriving (Show, Eq, Generic) 188 | instance Binary CitingSetting 189 | instance NFData CitingSetting where 190 | rnf (OwnSetting) = () 191 | rnf (SharedSetting c) = rnf c `seq` () 192 | 193 | data MState = MState { -- Citing model state 194 | _stGammas :: !(Map CitingNode (Multinom Int ItemSource)) 195 | , _stOmegas :: !(Map CitingNode (Multinom Int Item)) 196 | , _stPsis :: !(Map CitingNode (Multinom Int CitedNode)) 197 | 198 | , _stCiting :: !(Map CitingNodeItem CitingSetting) 199 | 200 | -- Cited model state 201 | , _stLambdas :: !(Map CitedNode (Multinom Int Item)) 202 | } 203 | deriving (Show, Generic) 204 | instance Binary MState 205 | makeLenses ''MState 206 | 207 | -- | Model initialization 208 | type ModelInit = Map CitingNodeItem (Setting CitingUpdateUnit) 209 | 210 | modify' :: Monad m => (a -> a) -> StateT a m () 211 | modify' f = do x <- get 212 | put $! f x 213 | 214 | randomInitializeCiting :: NetData -> ModelInit -> RVar ModelInit 215 | randomInitializeCiting d init = execStateT doInit init 216 | where doInit :: StateT ModelInit RVar () 217 | doInit = let unset = M.keysSet (dCitingNodeItems d) `S.difference` M.keysSet init 218 | in mapM_ (randomInitCitingUU d) (S.toList unset) 219 | 220 | randomInitCitingUU :: NetData -> CitingNodeItem -> StateT ModelInit RVar () 221 | randomInitCitingUU d cni@(Citing ni) = 222 | let (n,_) = d ^. dNodeItems . at' ni 223 | in case d ^. dCitingNodes . at' (Citing n) of 224 | a | S.null a -> do 225 | modify' $ M.insert cni OwnSetting 226 | 227 | citedNodes -> do 228 | s <- lift $ randomElement [Shared, Own] 229 | c <- lift $ randomElement $ toList citedNodes 230 | modify' $ M.insert cni $ 231 | case s of Shared -> SharedSetting c 232 | Own -> OwnSetting 233 | 234 | randomInitialize :: NetData -> RVar ModelInit 235 | randomInitialize d = randomInitializeCiting d M.empty 236 | 237 | model :: NetData -> ModelInit -> MState 238 | model d citingInit = 239 | let citingNodes = M.keys $ d^.dCitingNodes 240 | hp = d^.dHypers 241 | s = MState { -- Citing model 242 | _stPsis = let dist n = case d ^. dCitingNodes . at' n . to toList of 243 | [] -> M.empty 244 | nodes -> M.singleton n 245 | $ symDirMulti (hp^.alphaPsi) nodes 246 | in foldMap dist citingNodes 247 | , _stGammas = let dist = multinom [ (Shared, hp^.alphaGammaShared) 248 | , (Own, hp^.alphaGammaOwn) ] 249 | in foldMap (\t->M.singleton t dist) citingNodes 250 | , _stOmegas = let dist = symDirMulti (hp^.alphaOmega) (M.keys $ d^.dItems) 251 | in foldMap (\t->M.singleton t dist) citingNodes 252 | , _stCiting = M.empty 253 | 254 | -- Cited model 255 | , _stLambdas = let dist = symDirMulti (hp^.alphaLambda) (M.keys $ d^.dItems) 256 | lambdas0 = foldMap (\n->M.singleton n dist) $ M.keys $ d^.dCitedNodes 257 | in foldl' (\dms (n,x)->M.adjust (incMultinom x) (Cited n) dms) lambdas0 (M.elems $ d^.dNodeItems) 258 | } 259 | 260 | initCitingUU :: CitingUpdateUnit -> State MState () 261 | initCitingUU uu = do 262 | let err = error $ "CitationInference: Initial value for "++show uu++" not given\n" 263 | s = maybe err id $ M.lookup (uu^.uuNI) citingInit 264 | modify' $ setCitingUU uu (Just s) 265 | 266 | in execState (mapM_ initCitingUU $ citingUpdateUnits d) s 267 | 268 | modelLikelihood :: MState -> Probability 269 | modelLikelihood model = 270 | product $ (model ^.. stGammas . folded . to likelihood) 271 | ++ (model ^.. stLambdas . folded . to likelihood) 272 | ++ (model ^.. stOmegas . folded . to likelihood) 273 | ++ (model ^.. stPsis . folded . to likelihood) 274 | 275 | instance UpdateUnit CitingUpdateUnit where 276 | type ModelState CitingUpdateUnit = MState 277 | type Setting CitingUpdateUnit = CitingSetting 278 | fetchSetting uu ms = ms ^. stCiting . at' (uu^.uuNI) 279 | evolveSetting ms uu = categorical $ citingFullCond (setCitingUU uu Nothing ms) uu 280 | updateSetting uu _ s' = setCitingUU uu (Just s') . setCitingUU uu Nothing 281 | 282 | citingProb :: MState -> CitingUpdateUnit -> Setting CitingUpdateUnit -> Double 283 | citingProb st (CitingUpdateUnit {_uuN=n, _uuX=x}) setting = 284 | let gamma = st ^. stGammas . at' n 285 | omega = st ^. stOmegas . at' n 286 | psi = st ^. stPsis . at' n 287 | in case setting of 288 | SharedSetting c -> let lambda = st ^. stLambdas . at' c 289 | in sampleProb gamma Shared 290 | * sampleProb psi c 291 | * sampleProb lambda x 292 | OwnSetting -> sampleProb gamma Own 293 | * sampleProb omega x 294 | 295 | citingFullCond :: MState -> CitingUpdateUnit -> [(Double, Setting CitingUpdateUnit)] 296 | citingFullCond ms uu = map (\s->(citingProb ms uu s, s)) $ citingDomain ms uu 297 | 298 | citingDomain :: MState -> CitingUpdateUnit -> [Setting CitingUpdateUnit] 299 | citingDomain ms uu = do 300 | s <- [Own, Shared] 301 | case s of 302 | Shared -> do c <- uu ^. uuCites . to S.toList 303 | return $ SharedSetting c 304 | Own -> do return $ OwnSetting 305 | 306 | setCitingUU :: CitingUpdateUnit -> Maybe (Setting CitingUpdateUnit) -> MState -> MState 307 | setCitingUU uu@(CitingUpdateUnit {_uuNI=ni, _uuN=n, _uuX=x}) setting ms = execState go ms 308 | where 309 | set = maybe Unset (const Set) setting 310 | go = case maybe (fetchSetting uu ms) id setting of 311 | SharedSetting c -> do stPsis . at' n %= setMultinom set c 312 | stLambdas . at' c %= setMultinom set x 313 | stGammas . at' n %= setMultinom set Shared 314 | stCiting . at ni .= setting 315 | 316 | OwnSetting -> do stOmegas . at' n %= setMultinom set x 317 | stGammas . at' n %= setMultinom set Own 318 | stCiting . at ni .= setting 319 | -------------------------------------------------------------------------------- /network-topic-models/BayesStack/Models/Topic/LDA.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, GeneralizedNewtypeDeriving, DeriveGeneric, RecordWildCards #-} 2 | 3 | module BayesStack.Models.Topic.LDA 4 | ( -- * Primitives 5 | NetData(..) 6 | , HyperParams(..) 7 | , MState(..) 8 | , LDAUpdateUnit 9 | , Node(..), Item(..), Topic(..) 10 | , NodeItem(..), setupNodeItems 11 | -- * Initialization 12 | , ModelInit 13 | , randomInitialize 14 | , model, updateUnits 15 | -- * Hyperparameter estimation 16 | , reestimate, reestimatePhis, reestimateThetas 17 | -- * Diagnostics 18 | , modelLikelihood 19 | ) where 20 | 21 | import Prelude hiding (mapM_) 22 | 23 | import Data.Set (Set) 24 | import qualified Data.Set as S 25 | 26 | import Data.Map.Strict (Map) 27 | import qualified Data.Map.Strict as M 28 | 29 | import Data.Traversable 30 | import Data.Foldable hiding (product) 31 | import Data.Monoid 32 | 33 | import Control.Monad (liftM) 34 | import Control.Monad.Trans.State.Strict 35 | import Data.Random 36 | import Data.Random.Distribution.Categorical (categorical) 37 | 38 | import BayesStack.Types 39 | import BayesStack.Gibbs 40 | import BayesStack.DirMulti 41 | import BayesStack.TupleEnum () 42 | import BayesStack.Models.Topic.Types 43 | 44 | import GHC.Generics 45 | import Data.Binary (Binary) 46 | 47 | data HyperParams = HyperParams 48 | { alphaTheta :: Double 49 | , alphaPhi :: Double 50 | } 51 | deriving (Show, Eq, Generic) 52 | instance Binary HyperParams 53 | 54 | data NetData = NetData { dHypers :: !HyperParams 55 | , dNodes :: !(Set Node) 56 | , dItems :: !(Set Item) 57 | , dTopics :: !(Set Topic) 58 | , dNodeItems :: !(Map NodeItem (Node, Item)) 59 | } 60 | deriving (Show, Eq, Generic) 61 | instance Binary NetData 62 | 63 | type ModelInit = Map NodeItem Topic 64 | 65 | randomInitialize' :: NetData -> ModelInit -> RVar ModelInit 66 | randomInitialize' d init = 67 | let unset = M.keysSet (dNodeItems d) `S.difference` M.keysSet init 68 | topics = S.toList $ dTopics d 69 | randomInit :: NodeItem -> RVar ModelInit 70 | randomInit ni = liftM (M.singleton ni) $ randomElement topics 71 | in liftM mconcat $ forM (S.toList unset) randomInit 72 | 73 | randomInitialize :: NetData -> RVar ModelInit 74 | randomInitialize = (flip randomInitialize') M.empty 75 | 76 | updateUnits :: NetData -> [WrappedUpdateUnit MState] 77 | updateUnits = map WrappedUU . updateUnits' 78 | 79 | updateUnits' :: NetData -> [LDAUpdateUnit] 80 | updateUnits' = 81 | map (\(ni,(n,x))->LDAUpdateUnit {uuNI=ni, uuN=n, uuX=x}) . M.assocs . dNodeItems 82 | 83 | model :: NetData -> ModelInit -> MState 84 | model d init = 85 | let uus = updateUnits' d 86 | s = MState { stThetas = foldMap (\n->M.singleton n (symDirMulti alphaTheta (toList $ dTopics d))) 87 | $ dNodes d 88 | , stPhis = foldMap (\t->M.singleton t (symDirMulti alphaPhi (toList $ dItems d))) 89 | $ dTopics d 90 | , stT = M.empty 91 | } 92 | HyperParams {..} = dHypers d 93 | in execState (mapM_ (\uu->modify' $ setUU uu (Just $ M.findWithDefault (Topic 0) (uuNI uu) init)) uus) s 94 | 95 | modify' :: (s -> s) -> State s () 96 | modify' f = do 97 | a <- get 98 | put $! f a 99 | 100 | data MState = MState { stThetas :: !(Map Node (Multinom Int Topic)) 101 | , stPhis :: !(Map Topic (Multinom Int Item)) 102 | , stT :: !(Map NodeItem Topic) 103 | } 104 | deriving (Show, Generic) 105 | instance Binary MState 106 | 107 | data LDAUpdateUnit = LDAUpdateUnit { uuNI :: NodeItem 108 | , uuN :: Node 109 | , uuX :: Item 110 | } 111 | deriving (Show, Generic) 112 | instance Binary LDAUpdateUnit 113 | 114 | setUU :: LDAUpdateUnit -> Maybe Topic -> MState -> MState 115 | setUU uu@(LDAUpdateUnit {uuN=n, uuNI=ni, uuX=x}) setting ms = 116 | let t = maybe (fetchSetting uu ms) id setting 117 | set = maybe Unset (const Set) setting 118 | in ms { stPhis = M.adjust (setMultinom set x) t (stPhis ms) 119 | , stThetas = M.adjust (setMultinom set t) n (stThetas ms) 120 | , stT = case setting of Just _ -> M.insert ni t $ stT ms 121 | Nothing -> stT ms 122 | } 123 | 124 | instance UpdateUnit LDAUpdateUnit where 125 | type ModelState LDAUpdateUnit = MState 126 | type Setting LDAUpdateUnit = Topic 127 | fetchSetting (LDAUpdateUnit {uuNI=ni}) ms = stT ms M.! ni 128 | evolveSetting ms uu = categorical $ ldaFullCond (setUU uu Nothing ms) uu 129 | updateSetting uu _ s' = setUU uu (Just s') . setUU uu Nothing 130 | 131 | uuProb :: MState -> LDAUpdateUnit -> Topic -> Double 132 | uuProb state (LDAUpdateUnit {uuN=n, uuX=x}) t = 133 | let theta = stThetas state M.! n 134 | phi = stPhis state M.! t 135 | in realToFrac $ sampleProb theta t * sampleProb phi x 136 | 137 | ldaFullCond :: MState -> LDAUpdateUnit -> [(Double, Topic)] 138 | ldaFullCond ms uu = do 139 | t <- uuDomain ms uu 140 | return (uuProb ms uu t, t) 141 | 142 | uuDomain :: MState -> LDAUpdateUnit -> [Topic] 143 | uuDomain ms uu = M.keys $ stPhis ms 144 | 145 | modelLikelihood :: MState -> Probability 146 | modelLikelihood model = 147 | product (map likelihood $ M.elems $ stThetas model) 148 | * product (map likelihood $ M.elems $ stPhis model) 149 | 150 | -- | Re-estimate phi hyperparameter 151 | reestimatePhis :: MState -> MState 152 | reestimatePhis ms = ms { stPhis = reestimateSymPriors $ stPhis ms } 153 | 154 | -- | Re-estimate theta hyperparameter 155 | reestimateThetas :: MState -> MState 156 | reestimateThetas ms = ms { stThetas = reestimateSymPriors $ stThetas ms } 157 | 158 | reestimate :: MState -> MState 159 | reestimate = reestimatePhis . reestimateThetas 160 | -------------------------------------------------------------------------------- /network-topic-models/BayesStack/Models/Topic/LDARelevance.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, GeneralizedNewtypeDeriving, DeriveGeneric, FlexibleInstances, RecordWildCards #-} 2 | 3 | module BayesStack.Models.Topic.LDARelevance 4 | ( -- * Primitives 5 | NetData(..) 6 | , HyperParams(..) 7 | , MState(..) 8 | , LDAUpdateUnit 9 | , ItemWeight 10 | , Node(..), Item(..), Topic(..) 11 | , NodeItem(..), setupNodeItems 12 | -- * Initialization 13 | , ModelInit 14 | , randomInitialize 15 | , model, updateUnits 16 | -- * Hyperparameter estimation 17 | , reestimate, reestimatePhis, reestimateThetas 18 | -- * Diagnostics 19 | , modelLikelihood 20 | ) where 21 | 22 | import Prelude hiding (mapM) 23 | 24 | import Data.Set (Set) 25 | import qualified Data.Set as S 26 | 27 | import Data.Map.Strict (Map) 28 | import qualified Data.Map.Strict as M 29 | 30 | import Data.Traversable 31 | import Data.Foldable hiding (product) 32 | import Data.Monoid 33 | 34 | import Control.Monad (liftM) 35 | import Control.Monad.Trans.State 36 | import Data.Random 37 | import Data.Random.Distribution.Categorical (categorical) 38 | 39 | import BayesStack.Types 40 | import BayesStack.Gibbs 41 | import BayesStack.DirMulti 42 | import BayesStack.TupleEnum () 43 | import BayesStack.Models.Topic.Types 44 | 45 | import GHC.Generics 46 | import Data.Binary as B 47 | import Data.Fixed 48 | 49 | type ItemWeight = Micro 50 | 51 | data HyperParams = HyperParams 52 | { alphaTheta :: Double 53 | , alphaPhi :: Double 54 | } 55 | deriving (Show, Eq, Generic) 56 | instance Binary HyperParams 57 | 58 | data NetData = NetData { dHypers :: !HyperParams 59 | , dNodes :: !(Set Node) 60 | , dItems :: !(Map Item ItemWeight) 61 | , dTopics :: !(Set Topic) 62 | , dNodeItems :: !(Map NodeItem (Node, Item)) 63 | } 64 | deriving (Show, Eq, Generic) 65 | instance Binary NetData 66 | 67 | type ModelInit = Map NodeItem Topic 68 | 69 | randomInitialize' :: NetData -> ModelInit -> RVar ModelInit 70 | randomInitialize' d init = 71 | let unset = M.keysSet (dNodeItems d) `S.difference` M.keysSet init 72 | topics = S.toList $ dTopics d 73 | randomInit :: NodeItem -> RVar ModelInit 74 | randomInit ni = liftM (M.singleton ni) $ randomElement topics 75 | in liftM mconcat $ forM (S.toList unset) randomInit 76 | 77 | randomInitialize :: NetData -> RVar ModelInit 78 | randomInitialize = (flip randomInitialize') M.empty 79 | 80 | updateUnits :: NetData -> [WrappedUpdateUnit MState] 81 | updateUnits = map WrappedUU . updateUnits' 82 | 83 | updateUnits' :: NetData -> [LDAUpdateUnit] 84 | updateUnits' nd = 85 | map (\(ni,(n,x))->LDAUpdateUnit { uuNI=ni, uuN=n, uuX=x 86 | , uuW=dItems nd M.! x 87 | }) 88 | $ M.assocs $ dNodeItems nd 89 | 90 | model :: NetData -> ModelInit -> MState 91 | model d init = 92 | let uus = updateUnits' d 93 | s = MState { stThetas = foldMap (\n->M.singleton n (symDirMulti alphaTheta (toList $ dTopics d))) 94 | $ dNodes d 95 | , stPhis = foldMap (\t->M.singleton t (symDirMulti alphaPhi (M.keys $ dItems d))) 96 | $ dTopics d 97 | , stT = M.empty 98 | } 99 | HyperParams {..} = dHypers d 100 | in execState (mapM (\uu->modify $ setUU uu (Just $ M.findWithDefault (Topic 0) (uuNI uu) init)) uus) s 101 | 102 | data MState = MState { stThetas :: !(Map Node (Multinom Int Topic)) 103 | , stPhis :: !(Map Topic (Multinom ItemWeight Item)) 104 | , stT :: !(Map NodeItem Topic) 105 | } 106 | deriving (Show, Generic) 107 | instance Binary MState 108 | 109 | data LDAUpdateUnit = LDAUpdateUnit { uuNI :: NodeItem 110 | , uuN :: Node 111 | , uuX :: Item 112 | , uuW :: ItemWeight 113 | } 114 | deriving (Show, Generic) 115 | instance Binary LDAUpdateUnit 116 | 117 | instance Binary (Fixed E6) where 118 | get = do a <- B.get :: Get Int 119 | return $ fromIntegral a / 1000000 120 | put = (B.put :: Int -> Put) . round . (*1000000) 121 | 122 | setUU :: LDAUpdateUnit -> Maybe Topic -> MState -> MState 123 | setUU uu@(LDAUpdateUnit {uuN=n, uuNI=ni, uuX=x, uuW=w}) setting ms = 124 | let t = maybe (fetchSetting uu ms) id setting 125 | set = maybe Unset (const Set) setting 126 | setPhi = case setting of 127 | Just _ -> addMultinom w x 128 | Nothing -> subMultinom w x 129 | in ms { stPhis = M.adjust setPhi t (stPhis ms) 130 | , stThetas = M.adjust (setMultinom set t) n (stThetas ms) 131 | , stT = case setting of Just _ -> M.insert ni t $ stT ms 132 | Nothing -> stT ms 133 | } 134 | 135 | instance UpdateUnit LDAUpdateUnit where 136 | type ModelState LDAUpdateUnit = MState 137 | type Setting LDAUpdateUnit = Topic 138 | fetchSetting (LDAUpdateUnit {uuNI=ni}) ms = stT ms M.! ni 139 | evolveSetting ms uu = categorical $ ldaFullCond (setUU uu Nothing ms) uu 140 | updateSetting uu _ s' = setUU uu (Just s') . setUU uu Nothing 141 | 142 | uuProb :: MState -> LDAUpdateUnit -> Topic -> Double 143 | uuProb state (LDAUpdateUnit {uuN=n, uuX=x}) t = 144 | let theta = stThetas state M.! n 145 | phi = stPhis state M.! t 146 | in realToFrac $ sampleProb theta t * sampleProb phi x 147 | 148 | ldaFullCond :: MState -> LDAUpdateUnit -> [(Double, Topic)] 149 | ldaFullCond ms uu = do 150 | t <- uuDomain ms uu 151 | return (uuProb ms uu t, t) 152 | 153 | uuDomain :: MState -> LDAUpdateUnit -> [Topic] 154 | uuDomain ms uu = M.keys $ stPhis ms 155 | 156 | modelLikelihood :: MState -> Probability 157 | modelLikelihood model = 158 | product (map likelihood $ M.elems $ stThetas model) 159 | * product (map likelihood $ M.elems $ stPhis model) 160 | 161 | -- | Re-estimate phi hyperparameter 162 | reestimatePhis :: MState -> MState 163 | reestimatePhis ms = ms { stPhis = reestimateSymPriors $ stPhis ms } 164 | 165 | -- | Re-estimate theta hyperparameter 166 | reestimateThetas :: MState -> MState 167 | reestimateThetas ms = ms { stThetas = reestimateSymPriors $ stThetas ms } 168 | 169 | reestimate :: MState -> MState 170 | reestimate = reestimatePhis . reestimateThetas 171 | -------------------------------------------------------------------------------- /network-topic-models/BayesStack/Models/Topic/SharedTaste.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, GeneralizedNewtypeDeriving, DeriveGeneric, RecordWildCards #-} 2 | 3 | module BayesStack.Models.Topic.SharedTaste 4 | ( -- * Primitives 5 | NetData(..) 6 | , HyperParams(..) 7 | , MState(..) 8 | , STUpdateUnit 9 | , ItemSource(..) 10 | , Node(..), Item(..), Topic(..), Edge(..) 11 | , NodeItem(..), setupNodeItems 12 | -- * Initialization 13 | , ModelInit 14 | , randomInitialize 15 | , model, updateUnits 16 | -- * Diagnostics 17 | , modelLikelihood 18 | , influence 19 | ) where 20 | 21 | import Prelude hiding (mapM, sum) 22 | 23 | import Data.Set (Set) 24 | import qualified Data.Set as S 25 | 26 | import Data.Map.Strict (Map) 27 | import qualified Data.Map.Strict as M 28 | 29 | import Data.Traversable 30 | import Data.Foldable hiding (product) 31 | import Data.Monoid 32 | 33 | import Control.DeepSeq 34 | import Control.Monad (liftM) 35 | import Control.Monad.Trans.State 36 | import Data.Random 37 | import Data.Random.Distribution.Categorical (categorical) 38 | import Numeric.Log hiding (sum) 39 | 40 | import BayesStack.Types 41 | import BayesStack.Gibbs 42 | import BayesStack.DirMulti 43 | import BayesStack.TupleEnum () 44 | import BayesStack.Models.Topic.Types 45 | 46 | import GHC.Generics 47 | import Data.Binary 48 | 49 | data ItemSource = Shared | Own 50 | deriving (Show, Eq, Generic, Enum, Ord) 51 | instance Binary ItemSource 52 | instance NFData ItemSource 53 | 54 | data HyperParams = HyperParams 55 | { alphaPsi :: Double 56 | , alphaLambda :: Double 57 | , alphaPhi :: Double 58 | , alphaOmega :: Double 59 | , alphaGammaShared :: Double 60 | , alphaGammaOwn :: Double 61 | } 62 | deriving (Show, Eq, Generic) 63 | instance Binary HyperParams 64 | 65 | data NetData = NetData { dHypers :: !HyperParams 66 | , dEdges :: !(Set Edge) 67 | , dItems :: !(Set Item) 68 | , dTopics :: !(Set Topic) 69 | , dNodeItems :: !(Map NodeItem (Node, Item)) 70 | } 71 | deriving (Show, Eq, Generic) 72 | instance Binary NetData 73 | 74 | dNodes :: NetData -> Set Node 75 | dNodes = S.fromList . map fst . M.elems . dNodeItems 76 | 77 | dAdjNodes :: NetData -> Node -> Set Node 78 | dAdjNodes nd n = S.fromList $ getFriends (S.toList $ dEdges nd) n 79 | 80 | dItemsOfNode :: NetData -> Node -> [Item] 81 | dItemsOfNode nd u = map snd $ filter (\(n,_)->u==n) $ M.elems $ dNodeItems nd 82 | 83 | type ModelInit = Map NodeItem (Setting STUpdateUnit) 84 | 85 | randomInit :: NetData -> Map Node (Set Node) -> NodeItem -> RVar ModelInit 86 | randomInit d friends ni = do 87 | let topics = S.toList $ dTopics d 88 | t <- randomElement topics 89 | s <- randomElement [Shared, Own] 90 | let (u,_) = dNodeItems d M.! ni 91 | f <- randomElement $ S.toList 92 | $ maybe (error "SharedTaste.randomInit: No friends") id $ M.lookup u friends 93 | return $ M.singleton ni $ 94 | case s of Shared -> SharedSetting t (Edge (u,f)) 95 | Own -> OwnSetting t 96 | 97 | randomInitialize' :: NetData -> ModelInit -> RVar ModelInit 98 | randomInitialize' d init = 99 | let unset = M.keysSet (dNodeItems d) `S.difference` M.keysSet init 100 | friends = M.unionsWith S.union $ map (\(Edge (a,b))-> M.singleton a (S.singleton b) 101 | <> M.singleton b (S.singleton a) 102 | ) $ S.toList $ dEdges d 103 | in liftM mconcat $ forM (S.toList unset) $ randomInit d friends 104 | 105 | 106 | randomInitialize :: NetData -> RVar ModelInit 107 | randomInitialize = (flip randomInitialize') M.empty 108 | 109 | updateUnits' :: NetData -> [STUpdateUnit] 110 | updateUnits' d = 111 | map (\(ni,(n,x)) -> 112 | STUpdateUnit { uuNI = ni 113 | , uuN = n 114 | , uuX = x 115 | , uuFriends = getFriends (S.toList $ dEdges d) n 116 | } 117 | ) 118 | $ M.assocs $ dNodeItems d 119 | 120 | updateUnits :: NetData -> [WrappedUpdateUnit MState] 121 | updateUnits = map WrappedUU . updateUnits' 122 | 123 | model :: NetData -> ModelInit -> MState 124 | model d init = 125 | let uus = updateUnits' d 126 | s = MState { stPsis = let dist n = symDirMulti alphaPsi (toList $ getFriends (toList $ dEdges d) n) 127 | in foldMap (\n->M.singleton n $ dist n) $ dNodes d 128 | , stPhis = let dist = symDirMulti alphaPhi (toList $ dItems d) 129 | in foldMap (\t->M.singleton t dist) $ dTopics d 130 | , stGammas = let dist = multinom [ (Shared, alphaGammaShared) 131 | , (Own, alphaGammaOwn) ] 132 | in foldMap (\t->M.singleton t dist) $ dNodes d 133 | , stOmegas = let dist = symDirMulti alphaOmega (toList $ dTopics d) 134 | in foldMap (\t->M.singleton t dist) $ dNodes d 135 | , stLambdas = let dist = symDirMulti alphaLambda (toList $ dTopics d) 136 | in foldMap (\t->M.singleton t dist) $ dEdges d 137 | , stVars = M.empty 138 | } 139 | HyperParams {..} = dHypers d 140 | initUU uu = do 141 | let s = maybe (error "Incomplete initialization") id $ M.lookup (uuNI uu) init 142 | modify $ setUU uu (Just s) 143 | in execState (mapM initUU uus) s 144 | 145 | data STSetting = OwnSetting !Topic 146 | | SharedSetting !Topic !Edge 147 | deriving (Show, Eq, Generic) 148 | 149 | instance Binary STSetting 150 | instance NFData STSetting where 151 | rnf (OwnSetting t) = rnf t `seq` () 152 | rnf (SharedSetting t f) = rnf t `seq` rnf f `seq` () 153 | 154 | data MState = MState { stGammas :: !(Map Node (Multinom Int ItemSource)) 155 | , stOmegas :: !(Map Node (Multinom Int Topic)) 156 | , stPsis :: !(Map Node (Multinom Int Node)) 157 | , stLambdas :: !(Map Edge (Multinom Int Topic)) 158 | , stPhis :: !(Map Topic (Multinom Int Item)) 159 | 160 | , stVars :: !(Map NodeItem STSetting) 161 | } 162 | deriving (Show, Generic) 163 | instance Binary MState 164 | 165 | data STUpdateUnit = STUpdateUnit { uuNI :: NodeItem 166 | , uuN :: Node 167 | , uuX :: Item 168 | , uuFriends :: [Node] 169 | } 170 | deriving (Show, Generic) 171 | instance Binary STUpdateUnit 172 | 173 | setUU :: STUpdateUnit -> Maybe (Setting STUpdateUnit) -> MState -> MState 174 | setUU uu@(STUpdateUnit {uuNI=ni, uuN=n, uuX=x}) setting ms = 175 | let set = maybe Unset (const Set) setting 176 | ms' = case maybe (fetchSetting uu ms) id setting of 177 | SharedSetting t fship -> 178 | let f = maybe (error "Node isn't part of friendship") id 179 | $ otherFriend n fship 180 | in ms { stPsis = M.adjust (setMultinom set n) f 181 | $ M.adjust (setMultinom set f) n $ stPsis ms 182 | , stLambdas = M.adjust (setMultinom set t) fship $ stLambdas ms 183 | , stPhis = M.adjust (setMultinom set x) t $ stPhis ms 184 | , stGammas = M.adjust (setMultinom set Shared) n $ stGammas ms 185 | } 186 | OwnSetting t -> 187 | ms { stOmegas = M.adjust (setMultinom set t) n $ stOmegas ms 188 | , stPhis = M.adjust (setMultinom set x) t $ stPhis ms 189 | , stGammas = M.adjust (setMultinom set Own) n $ stGammas ms 190 | } 191 | in ms' { stVars = M.alter (const setting) ni $ stVars ms' } 192 | 193 | instance UpdateUnit STUpdateUnit where 194 | type ModelState STUpdateUnit = MState 195 | type Setting STUpdateUnit = STSetting 196 | fetchSetting uu ms = stVars ms M.! uuNI uu 197 | evolveSetting ms uu = categorical $ stFullCond (setUU uu Nothing ms) uu 198 | updateSetting uu _ s' = setUU uu (Just s') . setUU uu Nothing 199 | 200 | uuProb :: MState -> STUpdateUnit -> Setting STUpdateUnit -> Double 201 | uuProb st (STUpdateUnit {uuN=n, uuX=x}) setting = 202 | let gamma = stGammas st M.! n 203 | omega = stOmegas st M.! n 204 | psi = stPsis st M.! n 205 | in case setting of 206 | SharedSetting t fship -> let phi = stPhis st M.! t 207 | lambda = stLambdas st M.! fship 208 | f = maybe (error "Friend isn't friends with node") id 209 | $ otherFriend n fship 210 | in sampleProb gamma Shared 211 | * sampleProb psi f 212 | * sampleProb lambda t 213 | * sampleProb phi x 214 | OwnSetting t -> let phi = stPhis st M.! t 215 | in sampleProb gamma Own 216 | * sampleProb omega t 217 | * sampleProb phi x 218 | 219 | stFullCond :: MState -> STUpdateUnit -> [(Double, Setting STUpdateUnit)] 220 | stFullCond ms uu = map (\s->(uuProb ms uu s, s)) $ stDomain ms uu 221 | 222 | stDomain :: MState -> STUpdateUnit -> [Setting STUpdateUnit] 223 | stDomain ms uu = do 224 | s <- [Own, Shared] 225 | t <- M.keys $ stPhis ms 226 | case s of 227 | Shared -> do f <- uuFriends uu 228 | return $ SharedSetting t (Edge (uuN uu, f)) 229 | Own -> do return $ OwnSetting t 230 | 231 | modelLikelihood :: MState -> Probability 232 | modelLikelihood model = 233 | product (map likelihood $ M.elems $ stGammas model) 234 | * product (map likelihood $ M.elems $ stPhis model) 235 | * product (map likelihood $ M.elems $ stLambdas model) 236 | * product (map likelihood $ M.elems $ stOmegas model) 237 | * product (map likelihood $ M.elems $ stPsis model) 238 | 239 | -- | The probability of a collections of items under a given topic mixture. 240 | topicCompatibility :: MState -> [Item] -> Multinom Int Topic -> Probability 241 | topicCompatibility m items lambda = 242 | product $ do t <- toList $ dmDomain lambda 243 | let phi = stPhis m M.! t 244 | itemObs = zip items (repeat 1) 245 | return $ realToFrac (sampleProb lambda t) * obsProb phi itemObs 246 | 247 | topicCompatibilities :: (Functor f, Foldable f) 248 | => MState -> [Item] -> f (Multinom Int Topic) -> f Probability 249 | topicCompatibilities m items topics = 250 | let scores = fmap (topicCompatibility m items) topics 251 | in fmap (/sum scores) scores 252 | 253 | -- | The influence of adjacent nodes on a node. 254 | influence :: NetData -> MState -> Node -> Map Node Probability 255 | influence d m u = 256 | let lambdas = foldMap (\f->M.singleton f $ stLambdas m M.! Edge (u,f)) 257 | $ dAdjNodes d u 258 | in topicCompatibilities m (dItemsOfNode d u) lambdas 259 | -------------------------------------------------------------------------------- /network-topic-models/BayesStack/Models/Topic/Test.hs: -------------------------------------------------------------------------------- 1 | import qualified Data.Map as M 2 | 3 | main = do 4 | let m :: M.Map Int Int 5 | m = M.fromList [ (1,1), (2,2) ] 6 | print $ M.findWithDefault 6 1 m 7 | print $ M.findWithDefault 6 2 m 8 | 9 | print $ M.lookup 1 m 10 | print $ M.lookup 2 m 11 | -------------------------------------------------------------------------------- /network-topic-models/BayesStack/Models/Topic/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies, GeneralizedNewtypeDeriving, DeriveGeneric #-} 2 | 3 | module BayesStack.Models.Topic.Types where 4 | 5 | import Control.DeepSeq 6 | import GHC.Generics 7 | import Data.Binary 8 | import Data.Function 9 | import Data.Maybe 10 | 11 | import Data.Map (Map) 12 | import qualified Data.Map as M 13 | 14 | newtype Node = Node Int deriving (Show, Eq, Ord, Enum, Generic, NFData) 15 | newtype Item = Item Int deriving (Show, Eq, Ord, Enum, Generic, NFData) 16 | newtype Topic = Topic Int deriving (Show, Eq, Ord, Enum, Generic, NFData) 17 | newtype NodeItem = NodeItem Int deriving (Show, Eq, Ord, Enum, Generic, NFData) 18 | newtype Edge = Edge (Node, Node) deriving (Show, Generic, NFData) 19 | 20 | instance Binary Node 21 | instance Binary Item 22 | instance Binary Topic 23 | instance Binary NodeItem 24 | instance Binary Edge 25 | 26 | instance Eq Edge where 27 | (Edge (a,b)) == (Edge (c,d)) = (a == c && b == d) || (a == d && b == c) 28 | instance Enum Edge where 29 | fromEnum (Edge (a,b)) = let a' = min a b 30 | b' = max a b 31 | in 2^32 * fromEnum a' + fromEnum b' 32 | toEnum n = let (na, nb) = n `quotRem` (2^32) 33 | in Edge (toEnum na, toEnum nb) 34 | instance Ord Edge where 35 | compare = compare `on` fromEnum 36 | 37 | otherFriend :: Node -> Edge -> Maybe Node 38 | otherFriend u (Edge (a,b)) 39 | | u == a = Just b 40 | | u == b = Just a 41 | | otherwise = Nothing 42 | 43 | isFriend :: Node -> Edge -> Bool 44 | isFriend u fs = isJust $ otherFriend u fs 45 | 46 | getFriends :: [Edge] -> Node -> [Node] 47 | getFriends fs u = mapMaybe (otherFriend u) fs 48 | 49 | setupNodeItems :: [(Node,Item)] -> Map NodeItem (Node, Item) 50 | setupNodeItems nodeItems = M.fromList $ zipWith (\idx (n,i)->(NodeItem idx, (n,i))) [0..] nodeItems 51 | -------------------------------------------------------------------------------- /network-topic-models/BenchLDA.hs: -------------------------------------------------------------------------------- 1 | module BenchLDA where 2 | 3 | import Control.Monad.Trans.State 4 | import Control.Monad (replicateM, forM) 5 | import Control.Applicative ((<$>)) 6 | import Text.Printf 7 | import Control.Concurrent (setNumCapabilities) 8 | 9 | import Criterion 10 | import Data.Random 11 | 12 | import qualified Data.Set as S 13 | import BayesStack.Core.Gibbs 14 | import BayesStack.Models.Topic.LDA 15 | 16 | data NetParams = NetParams { nNodes :: Int 17 | , nItems :: Int 18 | , nTopics :: Int 19 | , nItemsPerNode :: Int 20 | } 21 | deriving (Show, Eq, Ord) 22 | 23 | netParams = NetParams { nNodes = 50000 24 | , nItems = nItemsPerNode netParams * nNodes netParams `div` 10 25 | , nTopics = 100 26 | , nItemsPerNode = 200 27 | } 28 | 29 | randomNetwork :: NetParams -> RVar NetData 30 | randomNetwork net = do 31 | let nodes = [Node i | i <- [1..nNodes net]] 32 | items = [Item i | i <- [1..nItems net]] 33 | nodeItem = do node <- randomElement nodes 34 | item <- randomElement items 35 | return (node, item) 36 | edges <- replicateM (nItemsPerNode net) nodeItem 37 | return $! NetData { dAlphaTheta = 0.1 38 | , dAlphaPhi = 0.1 39 | , dNodes = S.fromList nodes 40 | , dItems = S.fromList items 41 | , dTopics = S.fromList [Topic i | i <- [1..nTopics net]] 42 | , dNodeItems = setupNodeItems edges 43 | } 44 | 45 | benchmarksForNetwork :: NetParams -> NetData -> ModelInit -> [Benchmark] 46 | benchmarksForNetwork np net init = do 47 | let sweeps = 100 48 | updateBlock <- [10, 100, 1000] 49 | threads <- [1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26] 50 | let name = printf "%d topics, %d threads, %d block, %d items per node" (nTopics np) threads updateBlock (nItemsPerNode np) 51 | return $ bench name $ do 52 | setNumCapabilities threads 53 | gibbsUpdate threads updateBlock (model net init) 54 | $ concat $ replicate sweeps (updateUnits net) 55 | 56 | benchmarksForNetParams :: NetParams -> RVar [Benchmark] 57 | benchmarksForNetParams np = do 58 | net <- randomNetwork np 59 | init <- randomInitialize net 60 | return $ benchmarksForNetwork np net init 61 | 62 | ldaBenchmarkParams :: RVar [[Benchmark]] 63 | ldaBenchmarkParams = 64 | mapM benchmarksForNetParams 65 | $ do topics <- [100, 500, 1000] 66 | return netParams {nTopics=topics} 67 | 68 | ldaBenchmarks :: RVar Benchmark 69 | ldaBenchmarks = bgroup "LDA" . concat <$> ldaBenchmarkParams 70 | -------------------------------------------------------------------------------- /network-topic-models/BenchST.hs: -------------------------------------------------------------------------------- 1 | module BenchST where 2 | 3 | import Control.Monad.Trans.State 4 | import Control.Monad (replicateM, forM, guard) 5 | import Control.Applicative ((<$>)) 6 | import Text.Printf 7 | import Control.Concurrent (setNumCapabilities) 8 | 9 | import Criterion 10 | import Data.Random 11 | 12 | import qualified Data.Set as S 13 | import Data.List ((\\)) 14 | import BayesStack.Core.Gibbs 15 | import BayesStack.Models.Topic.SharedTaste 16 | 17 | data NetParams = NetParams { nNodes :: Int 18 | , nEdgesPerNode :: Int 19 | , nItems :: Int 20 | , nTopics :: Int 21 | , nItemsPerNode :: Int 22 | } 23 | 24 | netParams = NetParams { nNodes = 5000 25 | , nEdgesPerNode = 10 26 | , nItems = nItemsPerNode netParams * nNodes netParams `div` 10 27 | , nTopics = 100 28 | , nItemsPerNode = 20 29 | } 30 | 31 | randomNetwork :: NetParams -> RVar NetData 32 | randomNetwork net = do 33 | let nodes = [Node i | i <- [1..nNodes net]] 34 | items = [Item i | i <- [1..nItems net]] 35 | edge a = do b <- randomElement nodes --(nodes \\ [a]) 36 | return $ Edge (a,b) 37 | nodeItem = do node <- randomElement nodes 38 | item <- randomElement items 39 | return (node, item) 40 | edges <- concat <$> forM nodes (replicateM (nEdgesPerNode net) . edge) 41 | nodeItems <- replicateM (nItemsPerNode net) nodeItem 42 | return $ NetData { dAlphaLambda= 0.1 43 | , dAlphaPhi = 0.1 44 | , dAlphaPsi = 0.01 45 | , dAlphaOmega = 0.1 46 | , dAlphaGammaShared = 0.9 47 | , dAlphaGammaOwn = 0.1 48 | , dEdges = S.fromList edges 49 | , dItems = S.fromList items 50 | , dTopics = S.fromList [Topic i | i <- [1..nTopics net]] 51 | , dNodeItems = setupNodeItems nodeItems 52 | } 53 | 54 | data STBenchmark = STBenchmark { bNetParams :: NetParams 55 | , bThreads :: Int 56 | , bUpdateBlock :: Int 57 | , bSweeps :: Int 58 | } 59 | 60 | drawStBenchmark :: STBenchmark -> RVar Benchmark 61 | drawStBenchmark b = do 62 | net <- randomNetwork $ bNetParams b 63 | init <- randomInitialize net 64 | let name = printf "%d topics, %d threads, %d block, %d items per node" (nTopics $ bNetParams b) (bThreads b) (bUpdateBlock b) (nItemsPerNode $ bNetParams b) 65 | return $ bench name $ do 66 | setNumCapabilities (bThreads b) 67 | gibbsUpdate (bThreads b) (bUpdateBlock b) (model net init) 68 | $ concat $ replicate (bSweeps b) (updateUnits net) 69 | 70 | stBenchmarkParams :: [STBenchmark] 71 | stBenchmarkParams = do 72 | updateBlock <- [10, 100, 1000] 73 | topics <- [20, 100, 500] 74 | threads <- [1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26] 75 | return STBenchmark { bNetParams = netParams {nTopics=topics} 76 | , bThreads = threads 77 | , bUpdateBlock = updateBlock 78 | , bSweeps = 2 79 | } 80 | 81 | stBenchmarks :: RVar Benchmark 82 | stBenchmarks = bgroup "ST" `fmap` mapM drawStBenchmark stBenchmarkParams 83 | -------------------------------------------------------------------------------- /network-topic-models/Benchmark.hs: -------------------------------------------------------------------------------- 1 | import Criterion 2 | import Criterion.Config 3 | import Criterion.Main 4 | import Data.Random 5 | import System.Random.MWC 6 | import BenchLDA 7 | import BenchST 8 | 9 | benchmarks :: RVar [Benchmark] 10 | benchmarks = sequence [ldaBenchmarks] --, stBenchmarks] 11 | 12 | withSystemRandomIO :: (GenIO -> IO a) -> IO a 13 | withSystemRandomIO = withSystemRandom 14 | 15 | main = do 16 | bs <- withSystemRandomIO $ runRVar benchmarks 17 | defaultMainWith defaultConfig (return ()) bs 18 | -------------------------------------------------------------------------------- /network-topic-models/DumpCI.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import Data.Monoid 4 | import Data.Foldable 5 | import Data.List 6 | import Data.Function (on) 7 | import Options.Applicative 8 | 9 | import qualified Data.Map as M 10 | import qualified Data.Set as S 11 | import qualified Data.ByteString as BS 12 | 13 | import qualified Data.Text.Lazy.IO as TL 14 | import qualified Data.Text.Lazy.Builder as TB 15 | import Data.Text.Lazy.Builder.Int 16 | import Data.Text.Lazy.Builder.RealFloat 17 | import Data.Binary 18 | 19 | import System.FilePath (()) 20 | import Text.Printf 21 | 22 | import BayesStack.Models.Topic.CitationInfluence 23 | import FormatMultinom 24 | import Numeric.Log 25 | import ReadData 26 | import SerializeText 27 | 28 | data Opts = Opts { nElems :: Maybe Int 29 | , dumper :: Dumper 30 | , sweepDir :: FilePath 31 | , sweepNum :: Maybe Int 32 | } 33 | 34 | type Dumper = Opts -> NetData -> MState 35 | -> (Item -> TB.Builder) -> (Node -> TB.Builder) 36 | -> TB.Builder 37 | 38 | showB :: Show a => a -> TB.Builder 39 | showB = TB.fromString . show 40 | 41 | showTopic :: Topic -> TB.Builder 42 | showTopic (Topic n) = "Topic "<>decimal n 43 | 44 | formatProb = formatRealFloat Exponent (Just 3) . realToFrac 45 | 46 | readDumper :: String -> Maybe Dumper 47 | readDumper "phis" = Just $ \opts nd m showItem showNode -> 48 | formatMultinoms showTopic showItem (nElems opts) (stPhis m) 49 | 50 | readDumper "psis" = Just $ \opts nd m showItem showNode -> 51 | formatMultinoms (\(Citing n)->showNode n) showB (nElems opts) (stPsis m) 52 | 53 | readDumper "lambdas"= Just $ \opts nd m showItem showNode -> 54 | formatMultinoms (\(Cited n)->showNode n) showB (nElems opts) (stLambdas m) 55 | 56 | readDumper "omegas" = Just $ \opts nd m showItem showNode -> 57 | formatMultinoms (\(Citing n)->showNode n) showB (nElems opts) (stOmegas m) 58 | 59 | readDumper "gammas" = Just $ \opts nd m showItem showNode -> 60 | formatMultinoms (\(Citing n)->showNode n) showB (nElems opts) (stGammas m) 61 | 62 | readDumper "influences" = Just $ \opts nd m showItem showNode -> 63 | let formatInfluences u = 64 | foldMap (\(Cited n,p)->"\t" <> showNode n <> "\t" <> formatProb p <> "\n") 65 | $ sortBy (flip (compare `on` snd)) 66 | $ M.assocs $ influence nd m u 67 | in foldMap (\u@(Citing u')->"\n" <> showNode u' <> "\n" <> formatInfluences u) 68 | $ M.keys $ stGammas m 69 | 70 | readDumper "edge-mixtures" = Just $ \opts nd m showItem showNode -> 71 | let showArc (Arc (Citing d) (Cited c)) = showNode d <> " -> " <> showNode c 72 | formatMixture a = 73 | let ps = sortBy (flip compare `on` snd) 74 | $ map (\t->(t, arcTopicMixture nd m a t)) 75 | $ S.toList $ dTopics nd 76 | norm = Numeric.Log.sum $ map snd ps 77 | in foldMap (\(t,p)->"\t" <> showTopic t <> "\t" <> formatProb p <> "\n") 78 | $ maybe id take (nElems opts) 79 | $ map (\(t,p)->(t, p / norm)) ps 80 | in foldMap (\a->"\n" <> showArc a <> "\n" <> formatMixture a) 81 | $ S.toList $ dArcs nd 82 | 83 | readDumper _ = Nothing 84 | 85 | opts = Opts 86 | <$> nullOption ( long "top" 87 | <> short 'n' 88 | <> value Nothing 89 | <> reader (pure . auto) 90 | <> metavar "N" 91 | <> help "Number of elements to output from each distribution" 92 | ) 93 | <*> argument readDumper 94 | ( metavar "STR" 95 | <> help "One of: phis, psis, lambdas, omegas, gammas, influences, edge-mixtures" 96 | ) 97 | <*> strOption ( long "sweeps" 98 | <> short 's' 99 | <> value "sweeps" 100 | <> metavar "DIR" 101 | <> help "The directory of sweeps to dump" 102 | ) 103 | <*> option ( long "sweep-n" 104 | <> short 'N' 105 | <> reader (pure . auto) 106 | <> value Nothing 107 | <> metavar "N" 108 | <> help "The sweep number to dump" 109 | ) 110 | 111 | readSweep :: FilePath -> IO MState 112 | readSweep = decodeFile 113 | 114 | readNetData :: FilePath -> IO NetData 115 | readNetData = decodeFile 116 | 117 | main = do 118 | args <- execParser $ info (helper <*> opts) 119 | ( fullDesc 120 | <> progDesc "Dump distributions from an citation influence model sweep" 121 | <> header "dump-ci - Dump distributions from an citation influence model sweep" 122 | ) 123 | 124 | nd <- readNetData $ sweepDir args "data" 125 | itemMap <- readItemMap $ sweepDir args 126 | nodeMap <- readNodeMap $ sweepDir args 127 | m <- case sweepNum args of 128 | Nothing -> readSweep =<< getLastSweep (sweepDir args) 129 | Just n -> readSweep $ sweepDir args printf "%05d.state" n 130 | 131 | let showItem = showB . (itemMap M.!) 132 | showNode = showB . (nodeMap M.!) 133 | TL.putStr $ TB.toLazyText $ dumper args args nd m showItem showNode 134 | -------------------------------------------------------------------------------- /network-topic-models/DumpCINT.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import Control.Lens hiding (argument) 4 | import Data.Foldable 5 | import Data.Function (on) 6 | import Data.List 7 | import Data.Monoid 8 | import Options.Applicative 9 | 10 | import qualified Data.Map as M 11 | import qualified Data.Set as S 12 | import qualified Data.ByteString as BS 13 | 14 | import qualified Data.Text.Lazy.IO as TL 15 | import qualified Data.Text.Lazy.Builder as TB 16 | import Data.Text.Lazy.Builder.Int 17 | import Data.Text.Lazy.Builder.RealFloat 18 | import Data.Binary 19 | 20 | import System.FilePath (()) 21 | import Text.Printf 22 | 23 | import BayesStack.Models.Topic.CitationInfluenceNoTopics 24 | import FormatMultinom 25 | import Numeric.Log 26 | import ReadData 27 | import SerializeText 28 | 29 | data Opts = Opts { nElems :: Maybe Int 30 | , dumper :: Dumper 31 | , sweepDir :: FilePath 32 | , sweepNum :: Maybe Int 33 | } 34 | 35 | type Dumper = Opts -> NetData -> MState 36 | -> (Item -> TB.Builder) -> (Node -> TB.Builder) 37 | -> TB.Builder 38 | 39 | showB :: Show a => a -> TB.Builder 40 | showB = TB.fromString . show 41 | 42 | showTopic :: Topic -> TB.Builder 43 | showTopic (Topic n) = "Topic "<>decimal n 44 | 45 | formatProb = formatRealFloat Exponent (Just 3) . realToFrac 46 | 47 | readDumper :: String -> Maybe Dumper 48 | readDumper "psis" = Just $ \opts nd m showItem showNode -> 49 | formatMultinoms (\(Citing n)->showNode n) (\(Cited n)->showNode n) (nElems opts) (m^.stPsis) 50 | 51 | readDumper "lambdas"= Just $ \opts nd m showItem showNode -> 52 | formatMultinoms (\(Cited n)->showNode n) showItem (nElems opts) (m^.stLambdas) 53 | 54 | readDumper "omegas" = Just $ \opts nd m showItem showNode -> 55 | formatMultinoms (\(Citing n)->showNode n) showB (nElems opts) (m^.stOmegas) 56 | 57 | readDumper "gammas" = Just $ \opts nd m showItem showNode -> 58 | formatMultinoms (\(Citing n)->showNode n) showB (nElems opts) (m^.stGammas) 59 | 60 | readDumper _ = Nothing 61 | 62 | opts = Opts 63 | <$> nullOption ( long "top" 64 | <> short 'n' 65 | <> value Nothing 66 | <> reader (pure . auto) 67 | <> metavar "N" 68 | <> help "Number of elements to output from each distribution" 69 | ) 70 | <*> argument readDumper 71 | ( metavar "STR" 72 | <> help "One of: phis, psis, lambdas, omegas, gammas, influences, edge-mixtures" 73 | ) 74 | <*> strOption ( long "sweeps" 75 | <> short 's' 76 | <> value "sweeps" 77 | <> metavar "DIR" 78 | <> help "The directory of sweeps to dump" 79 | ) 80 | <*> option ( long "sweep-n" 81 | <> short 'N' 82 | <> reader (pure . auto) 83 | <> value Nothing 84 | <> metavar "N" 85 | <> help "The sweep number to dump" 86 | ) 87 | 88 | readSweep :: FilePath -> IO MState 89 | readSweep = decodeFile 90 | 91 | readNetData :: FilePath -> IO NetData 92 | readNetData = decodeFile 93 | 94 | main = do 95 | args <- execParser $ info (helper <*> opts) 96 | ( fullDesc 97 | <> progDesc "Dump distributions from an citation influence model sweep" 98 | <> header "dump-ci - Dump distributions from an citation influence model sweep" 99 | ) 100 | 101 | nd <- readNetData $ sweepDir args "data" 102 | itemMap <- readItemMap $ sweepDir args 103 | nodeMap <- readNodeMap $ sweepDir args 104 | m <- case sweepNum args of 105 | Nothing -> readSweep =<< getLastSweep (sweepDir args) 106 | Just n -> readSweep $ sweepDir args printf "%05d.state" n 107 | 108 | let showItem = showB . (itemMap M.!) 109 | showNode = showB . (nodeMap M.!) 110 | TL.putStr $ TB.toLazyText $ dumper args args nd m showItem showNode 111 | -------------------------------------------------------------------------------- /network-topic-models/DumpLDA.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import Data.Monoid 4 | import Options.Applicative 5 | 6 | import qualified Data.Map as M 7 | import qualified Data.ByteString as BS 8 | 9 | import qualified Data.Text.Lazy.IO as TL 10 | import Data.Text.Lazy.Builder.Int 11 | import qualified Data.Text.Lazy.Builder as TB 12 | import Data.Binary 13 | 14 | import System.FilePath (()) 15 | import Text.Printf 16 | 17 | import BayesStack.Models.Topic.LDA 18 | import SerializeText 19 | import ReadData 20 | import FormatMultinom 21 | 22 | data Opts = Opts { nElems :: Maybe Int 23 | , dumper :: Dumper 24 | , sweepDir :: FilePath 25 | , sweepNum :: Maybe Int 26 | } 27 | 28 | type Dumper = Opts -> NetData -> MState 29 | -> (Item -> TB.Builder) -> (Node -> TB.Builder) 30 | -> TB.Builder 31 | 32 | showB :: Show a => a -> TB.Builder 33 | showB = TB.fromString . show 34 | 35 | readDumper :: String -> Maybe Dumper 36 | readDumper "thetas" = Just $ \opts nd m showItem showNode -> 37 | formatMultinoms showNode showB (nElems opts) (stThetas m) 38 | 39 | readDumper "phis" = Just $ \opts nd m showItem showNode -> 40 | formatMultinoms (\(Topic n)->"Topic "<>decimal n) showItem (nElems opts) (stPhis m) 41 | 42 | readDumper _ = Nothing 43 | 44 | opts = Opts 45 | <$> nullOption ( long "top" 46 | <> short 'n' 47 | <> value Nothing 48 | <> reader (pure . auto) 49 | <> metavar "N" 50 | <> help "Number of elements to output from each distribution" 51 | ) 52 | <*> argument readDumper 53 | ( metavar "STR" 54 | <> help "One of: thetas, lambdas" 55 | ) 56 | <*> strOption ( long "sweeps" 57 | <> short 's' 58 | <> value "sweeps" 59 | <> metavar "DIR" 60 | <> help "The directory of sweeps to dump" 61 | ) 62 | <*> option ( long "number" 63 | <> short 'N' 64 | <> reader (pure . auto) 65 | <> value Nothing 66 | <> metavar "N" 67 | <> help "The sweep number to dump" 68 | ) 69 | 70 | readSweep :: FilePath -> IO MState 71 | readSweep = decodeFile 72 | 73 | readNetData :: FilePath -> IO NetData 74 | readNetData = decodeFile 75 | 76 | main = do 77 | args <- execParser $ info (helper <*> opts) 78 | ( fullDesc 79 | <> progDesc "Dump distributions from an LDA sweep" 80 | <> header "dump-lda - Dump distributions from an LDA sweep" 81 | ) 82 | 83 | nd <- readNetData $ sweepDir args "data" 84 | itemMap <- readItemMap $ sweepDir args 85 | nodeMap <- readNodeMap $ sweepDir args 86 | m <- case sweepNum args of 87 | Nothing -> readSweep =<< getLastSweep (sweepDir args) 88 | Just n -> readSweep $ sweepDir args printf "%05d.state" n 89 | 90 | let showItem = showB . (itemMap M.!) 91 | showNode = showB . (nodeMap M.!) 92 | TL.putStr $ TB.toLazyText $ dumper args args nd m showItem showNode 93 | -------------------------------------------------------------------------------- /network-topic-models/DumpLDARelevance.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import Data.Monoid 4 | import Options.Applicative 5 | 6 | import qualified Data.Map as M 7 | import qualified Data.ByteString as BS 8 | 9 | import qualified Data.Text.Lazy.IO as TL 10 | import Data.Text.Lazy.Builder.Int 11 | import qualified Data.Text.Lazy.Builder as TB 12 | import Data.Binary 13 | 14 | import System.FilePath (()) 15 | import Text.Printf 16 | 17 | import BayesStack.Models.Topic.LDARelevance 18 | import SerializeText 19 | import ReadData 20 | import FormatMultinom 21 | 22 | data Opts = Opts { nElems :: Maybe Int 23 | , dumper :: Dumper 24 | , sweepDir :: FilePath 25 | , sweepNum :: Maybe Int 26 | } 27 | 28 | type Dumper = Opts -> NetData -> MState 29 | -> (Item -> TB.Builder) -> (Node -> TB.Builder) 30 | -> TB.Builder 31 | 32 | showB :: Show a => a -> TB.Builder 33 | showB = TB.fromString . show 34 | 35 | readDumper :: String -> Maybe Dumper 36 | readDumper "thetas" = Just $ \opts nd m showItem showNode -> 37 | formatMultinoms showNode showB (nElems opts) (stThetas m) 38 | 39 | readDumper "phis" = Just $ \opts nd m showItem showNode -> 40 | formatMultinoms (\(Topic n)->"Topic "<>decimal n) showItem (nElems opts) (stPhis m) 41 | 42 | readDumper _ = Nothing 43 | 44 | opts = Opts 45 | <$> nullOption ( long "top" 46 | <> short 'n' 47 | <> value Nothing 48 | <> reader (pure . auto) 49 | <> metavar "N" 50 | <> help "Number of elements to output from each distribution" 51 | ) 52 | <*> argument readDumper 53 | ( metavar "STR" 54 | <> help "One of: thetas, lambdas" 55 | ) 56 | <*> strOption ( long "sweeps" 57 | <> short 's' 58 | <> value "sweeps" 59 | <> metavar "DIR" 60 | <> help "The directory of sweeps to dump" 61 | ) 62 | <*> option ( long "number" 63 | <> short 'N' 64 | <> reader (pure . auto) 65 | <> value Nothing 66 | <> metavar "N" 67 | <> help "The sweep number to dump" 68 | ) 69 | 70 | readSweep :: FilePath -> IO MState 71 | readSweep = decodeFile 72 | 73 | readNetData :: FilePath -> IO NetData 74 | readNetData = decodeFile 75 | 76 | main = do 77 | args <- execParser $ info (helper <*> opts) 78 | ( fullDesc 79 | <> progDesc "Dump distributions from an LDA sweep" 80 | <> header "dump-lda - Dump distributions from an LDA sweep" 81 | ) 82 | 83 | nd <- readNetData $ sweepDir args "data" 84 | itemMap <- readItemMap $ sweepDir args 85 | nodeMap <- readNodeMap $ sweepDir args 86 | m <- case sweepNum args of 87 | Nothing -> readSweep =<< getLastSweep (sweepDir args) 88 | Just n -> readSweep $ sweepDir args printf "%05d.state" n 89 | 90 | let showItem = showB . (itemMap M.!) 91 | showNode = showB . (nodeMap M.!) 92 | TL.putStr $ TB.toLazyText $ dumper args args nd m showItem showNode 93 | -------------------------------------------------------------------------------- /network-topic-models/DumpST.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import Data.Monoid 4 | import Data.Foldable 5 | import Data.List 6 | import Data.Function (on) 7 | import Options.Applicative 8 | 9 | import qualified Data.Map as M 10 | import qualified Data.ByteString as BS 11 | 12 | import qualified Data.Text.Lazy.IO as TL 13 | import qualified Data.Text.Lazy.Builder as TB 14 | import Data.Text.Lazy.Builder.Int 15 | import Data.Text.Lazy.Builder.RealFloat 16 | import Data.Binary 17 | 18 | import System.FilePath (()) 19 | import Text.Printf 20 | 21 | import BayesStack.Models.Topic.SharedTaste 22 | import SerializeText 23 | import ReadData 24 | import FormatMultinom 25 | 26 | data Opts = Opts { nElems :: Maybe Int 27 | , dumper :: Dumper 28 | , sweepDir :: FilePath 29 | , sweepNum :: Maybe Int 30 | } 31 | 32 | type Dumper = Opts -> NetData -> MState 33 | -> (Item -> TB.Builder) -> (Node -> TB.Builder) 34 | -> TB.Builder 35 | 36 | showB :: Show a => a -> TB.Builder 37 | showB = TB.fromString . show 38 | 39 | readDumper :: String -> Maybe Dumper 40 | readDumper "phis" = Just $ \opts nd m showItem showNode -> 41 | formatMultinoms (\(Topic n)->"Topic "<>decimal n) showItem (nElems opts) (stPhis m) 42 | 43 | readDumper "psis" = Just $ \opts nd m showItem showNode -> 44 | formatMultinoms showNode showB (nElems opts) (stPsis m) 45 | 46 | readDumper "lambdas"= Just $ \opts nd m showItem showNode -> 47 | formatMultinoms showB showB (nElems opts) (stLambdas m) 48 | 49 | readDumper "omegas" = Just $ \opts nd m showItem showNode -> 50 | formatMultinoms showB showB (nElems opts) (stOmegas m) 51 | 52 | readDumper "gammas" = Just $ \opts nd m showItem showNode -> 53 | formatMultinoms showB showB (nElems opts) (stGammas m) 54 | 55 | readDumper "influences" = Just $ \opts nd m showItem showNode -> 56 | let formatProb = formatRealFloat Exponent (Just 3) . realToFrac 57 | formatInfluences u = 58 | foldMap (\(n,p)->"\t" <> showNode n <> "\t" <> formatProb p <> "\n") 59 | $ sortBy (flip (compare `on` snd)) 60 | $ M.assocs $ influence nd m u 61 | in foldMap (\u->"\n" <> showB u <> "\n" <> formatInfluences u) 62 | $ M.keys $ stGammas m 63 | 64 | opts = Opts 65 | <$> nullOption ( long "top" 66 | <> short 'n' 67 | <> value Nothing 68 | <> reader (pure . auto) 69 | <> metavar "N" 70 | <> help "Number of elements to output from each distribution" 71 | ) 72 | <*> argument readDumper 73 | ( metavar "STR" 74 | <> help "One of: phis, psis, lambdas, omegas, gammas, influences" 75 | ) 76 | <*> strOption ( long "sweeps" 77 | <> short 's' 78 | <> value "sweeps" 79 | <> metavar "DIR" 80 | <> help "The directory of sweeps to dump" 81 | ) 82 | <*> option ( long "number" 83 | <> short 'N' 84 | <> reader (pure . auto) 85 | <> value Nothing 86 | <> metavar "N" 87 | <> help "The sweep number to dump" 88 | ) 89 | 90 | readSweep :: FilePath -> IO MState 91 | readSweep = decodeFile 92 | 93 | readNetData :: FilePath -> IO NetData 94 | readNetData = decodeFile 95 | 96 | main = do 97 | args <- execParser $ info (helper <*> opts) 98 | ( fullDesc 99 | <> progDesc "Dump distributions from an shared taste model sweep" 100 | <> header "dump-lda - Dump distributions from an shared taste model sweep" 101 | ) 102 | 103 | nd <- readNetData $ sweepDir args "data" 104 | itemMap <- readItemMap $ sweepDir args 105 | nodeMap <- readNodeMap $ sweepDir args 106 | m <- case sweepNum args of 107 | Nothing -> readSweep =<< getLastSweep (sweepDir args) 108 | Just n -> readSweep $ sweepDir args printf "%05d.state" n 109 | 110 | let showItem = showB . (itemMap M.!) 111 | showNode = showB . (nodeMap M.!) 112 | TL.putStr $ TB.toLazyText $ dumper args args nd m showItem showNode 113 | -------------------------------------------------------------------------------- /network-topic-models/FormatMultinom.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module FormatMultinom ( formatMultinom 4 | , formatMultinoms 5 | ) where 6 | 7 | import Data.Foldable 8 | import Data.Monoid 9 | 10 | import qualified Data.Text.Lazy.IO as TL 11 | import qualified Data.Text.Lazy.Builder as TB 12 | import Data.Text.Lazy.Builder.RealFloat 13 | 14 | import qualified Data.Map as M 15 | 16 | import BayesStack.DirMulti 17 | 18 | formatMultinom :: (Real w, Ord a, Enum a) 19 | => (a -> TB.Builder) -> Maybe Int -> Multinom w a -> TB.Builder 20 | formatMultinom show n = foldMap formatElem . takeTop . toList . decProbabilities 21 | where formatElem (p,x) = 22 | "\t" <> show x <> "\t" <> formatRealFloat Exponent (Just 3) p <> "\n" 23 | takeTop = maybe id take n 24 | 25 | formatMultinoms :: (Real w, Ord k, Ord a, Enum a) 26 | => (k -> TB.Builder) -> (a -> TB.Builder) -> Maybe Int 27 | -> M.Map k (Multinom w a) -> TB.Builder 28 | formatMultinoms showKey showElem n = foldMap go . M.assocs 29 | where go (k,v) = showKey k <> "\n" 30 | <> formatMultinom showElem n v <> "\n" 31 | -------------------------------------------------------------------------------- /network-topic-models/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Ben Gamari 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Ben Gamari nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /network-topic-models/README.mkd: -------------------------------------------------------------------------------- 1 | # Network topic models for `bayes-stack` 2 | 3 | `network-topic-models` implements a few network topic models on top of 4 | `bayes-stack`. These include, 5 | 6 | * LDA [1] 7 | * Shared taste model [2] 8 | * Citation influence model [3] 9 | 10 | We adopt the following nomenclature, 11 | 12 | * Node: a document or user that contains items 13 | * Item: a word, tag, or other content of a node 14 | * Edge: a set of two nodes (undirected edge) 15 | * Arc: a pair of two nodes (directed edge) 16 | 17 | See `doc/models/models.tex` for graphical representation of the 18 | implemented models. 19 | 20 | ## References 21 | 22 | 1. D. Blei, A. Ng, and M. Jordan. "Latent Dirichlet allocation." _Journal of Machine Learning Research_, 3:993–1022, 2003. 23 | 2. L. Dietz, B. Gamari, J. Guiver, E. Snelson, and R. Herbrich. "De-layering social networks by shared tastes of friendships." _Proceedings of the Sixths International AAAI Conference on Weblogs and Social Media_, 2012. 24 | 3. L. Dietz, S. Bickel, and T. Scheffer. "Unsupervised prediction of citation influences." _Proceedings of the 24th international conference on machine learning_, 2007. 25 | 26 | -------------------------------------------------------------------------------- /network-topic-models/ReadData.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE PatternGuards #-} 2 | 3 | module ReadData ( Term, NodeName 4 | , readEdges 5 | , readNodeItems 6 | , getLastSweep 7 | , readItemMap, readNodeMap 8 | ) where 9 | 10 | import BayesStack.Models.Topic.Types 11 | import BayesStack.Models.Topic.CitationInfluence 12 | 13 | import qualified Data.Set as S 14 | import Data.Set (Set) 15 | 16 | import qualified Data.Map as M 17 | 18 | import Data.Maybe (mapMaybe) 19 | import Control.Applicative 20 | 21 | import Data.Char (isAlpha) 22 | import qualified Data.Text as T 23 | import qualified Data.Text.IO as TIO 24 | import Data.Text.Read (decimal) 25 | import Data.Binary 26 | import qualified Data.ByteString as BS 27 | import SerializeText () 28 | 29 | import System.FilePath (()) 30 | import System.Directory 31 | import Data.List 32 | 33 | type Term = T.Text 34 | type NodeName = T.Text 35 | 36 | readEdges :: FilePath -> IO (Set (NodeName, NodeName)) 37 | readEdges fname = 38 | S.fromList . mapMaybe parseLine . T.lines <$> TIO.readFile fname 39 | where parseLine :: T.Text -> Maybe (NodeName, NodeName) 40 | parseLine l = case T.words l of 41 | [a,b] -> Just (a, b) 42 | otherwise -> Nothing 43 | 44 | readNodeItems :: Set Term -> FilePath -> IO (M.Map NodeName [Term]) 45 | readNodeItems stopWords fname = 46 | M.unionsWith (++) . map parseLine . T.lines <$> TIO.readFile fname 47 | where parseLine :: T.Text -> M.Map NodeName [Term] 48 | parseLine l = case T.words l of 49 | n:words -> 50 | M.singleton n 51 | $ filter (`S.notMember` stopWords) words 52 | otherwise -> M.empty 53 | 54 | getLastSweep :: FilePath -> IO FilePath 55 | getLastSweep sweepsDir = 56 | (sweepsDir ) . last . sort . filter (".state" `isSuffixOf`) 57 | <$> getDirectoryContents sweepsDir 58 | 59 | readItemMap :: FilePath -> IO (M.Map Item Term) 60 | readItemMap sweepsDir = 61 | decodeFile $ sweepsDir "item-map" 62 | 63 | readNodeMap :: FilePath -> IO (M.Map Node NodeName) 64 | readNodeMap sweepsDir = 65 | decodeFile $ sweepsDir "node-map" 66 | -------------------------------------------------------------------------------- /network-topic-models/ReadRelevanceData.hs: -------------------------------------------------------------------------------- 1 | module ReadRelevanceData where 2 | 3 | import Control.Applicative 4 | import qualified Data.Map.Strict as M 5 | import Data.Set (Set) 6 | import qualified Data.Text as T 7 | import qualified Data.Text.IO as TIO 8 | 9 | import BayesStack.Models.Topic.LDARelevance 10 | import ReadData hiding (readNodeItems) 11 | import Data.Attoparsec.Text hiding (option) 12 | 13 | readNodeItems :: Set Term -> FilePath 14 | -> IO (Either String (M.Map NodeName [(Term, ItemWeight)])) 15 | readNodeItems stopWords fname = 16 | fmap (M.unionsWith (++)) . parseOnly (many1 line) <$> TIO.readFile fname 17 | where line = do doc <- takeTill isHorizontalSpace 18 | words <- flip manyTill endOfLine $ do 19 | skipSpace 20 | word <- takeTill isHorizontalSpace 21 | skipSpace 22 | weight <- double 23 | return (word, realToFrac weight) 24 | return $ M.singleton doc words 25 | -------------------------------------------------------------------------------- /network-topic-models/RunCI.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, StandaloneDeriving #-} 2 | 3 | import Prelude hiding (mapM) 4 | 5 | import Options.Applicative 6 | import Data.Monoid ((<>)) 7 | import Control.Monad.Trans.Class 8 | 9 | import Data.Vector (Vector) 10 | import qualified Data.Vector.Unboxed as VU 11 | import qualified Data.Vector.Generic as V 12 | import Statistics.Sample (mean) 13 | 14 | import Data.Traversable (mapM) 15 | import qualified Data.Set as S 16 | import Data.Set (Set) 17 | import qualified Data.Map as M 18 | 19 | import ReadData 20 | import SerializeText 21 | import qualified RunSampler as Sampler 22 | import BayesStack.DirMulti 23 | import BayesStack.Models.Topic.CitationInfluence 24 | import BayesStack.UniqueKey 25 | 26 | import qualified Data.Text as T 27 | import qualified Data.Text.IO as TIO 28 | 29 | import System.FilePath.Posix (()) 30 | import Data.Binary 31 | import qualified Data.ByteString as BS 32 | import Text.Printf 33 | 34 | import Data.Random 35 | import System.Random.MWC 36 | 37 | data RunOpts = RunOpts { arcsFile :: FilePath 38 | , nodesFile :: FilePath 39 | , stopwords :: Maybe FilePath 40 | , nTopics :: Int 41 | , samplerOpts :: Sampler.SamplerOpts 42 | , hyperParams :: HyperParams 43 | , noClean :: Bool 44 | } 45 | 46 | runOpts = RunOpts 47 | <$> strOption ( long "arcs" 48 | <> short 'a' 49 | <> metavar "FILE" 50 | <> help "File containing arcs" 51 | ) 52 | <*> strOption ( long "nodes" 53 | <> short 'n' 54 | <> metavar "FILE" 55 | <> help "File containing nodes' items" 56 | ) 57 | <*> nullOption ( long "stopwords" 58 | <> short 's' 59 | <> metavar "FILE" 60 | <> reader (pure . Just) 61 | <> value Nothing 62 | <> help "Stop words list" 63 | ) 64 | <*> option ( long "topics" 65 | <> short 't' 66 | <> metavar "N" 67 | <> value 20 68 | <> help "Number of topics" 69 | ) 70 | <*> Sampler.samplerOpts 71 | <*> hyperOpts 72 | <*> flag False True ( long "no-clean" 73 | <> short 'c' 74 | <> help "Don't attempt to sanitize input data. Among other things, nodes without friends will not be discarded" 75 | ) 76 | 77 | hyperOpts = HyperParams 78 | <$> option ( long "prior-psi" 79 | <> value 1 80 | <> help "Dirichlet parameter for prior on psi" 81 | ) 82 | <*> option ( long "prior-lambda" 83 | <> value 0.1 84 | <> help "Dirichlet parameter for prior on lambda" 85 | ) 86 | <*> option ( long "prior-phi" 87 | <> value 0.01 88 | <> help "Dirichlet parameter for prior on phi" 89 | ) 90 | <*> option ( long "prior-omega" 91 | <> value 0.01 92 | <> help "Dirichlet parameter for prior on omega" 93 | ) 94 | <*> option ( long "prior-gamma-shared" 95 | <> value 0.9 96 | <> help "Beta parameter for prior on gamma (shared)" 97 | ) 98 | <*> option ( long "prior-gamma-own" 99 | <> value 0.1 100 | <> help "Beta parameter for prior on gamma (own)" 101 | ) 102 | 103 | mapMKeys :: (Ord k, Ord k', Monad m, Applicative m) 104 | => (a -> m a') -> (k -> m k') -> M.Map k a -> m (M.Map k' a') 105 | mapMKeys f g x = M.fromList <$> (mapM (\(k,v)->(,) <$> g k <*> f v) $ M.assocs x) 106 | 107 | termsToItems :: M.Map NodeName [Term] -> Set (NodeName, NodeName) 108 | -> ( (M.Map Node [Item], Set (Node, Node)) 109 | , (M.Map Item Term, M.Map Node NodeName)) 110 | termsToItems nodes arcs = 111 | let ((d', nodeMap), itemMap) = 112 | runUniqueKey' [Item i | i <- [0..]] $ 113 | runUniqueKeyT' [Node i | i <- [0..]] $ do 114 | a <- mapMKeys (mapM (lift . getUniqueKey)) getUniqueKey nodes 115 | b <- S.fromList <$> mapM (\(x,y)->(,) <$> getUniqueKey x <*> getUniqueKey y) 116 | (S.toList arcs) 117 | return (a,b) 118 | in (d', (itemMap, nodeMap)) 119 | 120 | makeNetData :: HyperParams -> M.Map Node [Item] -> Set Arc -> Int -> NetData 121 | makeNetData hp nodeItems arcs nTopics = 122 | netData hp arcs nodeItems' topics 123 | where topics = S.fromList [Topic i | i <- [1..nTopics]] 124 | nodeItems' = M.fromList 125 | $ zip [NodeItem i | i <- [0..]] 126 | $ do (n,items) <- M.assocs nodeItems 127 | item <- items 128 | return (n, item) 129 | 130 | opts = info runOpts 131 | ( fullDesc 132 | <> progDesc "Learn citation influence model" 133 | <> header "run-ci - learn citation influence model" 134 | ) 135 | 136 | edgesToArcs :: Set (Node, Node) -> Set Arc 137 | edgesToArcs = S.map (\(a,b)->Arc (Citing a) (Cited b)) 138 | 139 | instance Sampler.SamplerModel MState where 140 | estimateHypers = id -- reestimate -- FIXME 141 | modelLikelihood = modelLikelihood 142 | summarizeHypers ms = "" -- FIXME 143 | 144 | main = do 145 | args <- execParser opts 146 | stopWords <- case stopwords args of 147 | Just f -> S.fromList . T.words <$> TIO.readFile f 148 | Nothing -> return S.empty 149 | printf "Read %d stopwords\n" (S.size stopWords) 150 | 151 | ((nodeItems, a), (itemMap, nodeMap)) <- termsToItems 152 | <$> readNodeItems stopWords (nodesFile args) 153 | <*> readEdges (arcsFile args) 154 | let arcs = edgesToArcs a 155 | 156 | Sampler.createSweeps $ samplerOpts args 157 | let sweepsDir = Sampler.sweepsDir $ samplerOpts args 158 | encodeFile (sweepsDir "item-map") itemMap 159 | encodeFile (sweepsDir "node-map") nodeMap 160 | 161 | let termCounts = V.fromListN (M.size nodeItems) 162 | $ map length $ M.elems nodeItems :: Vector Int 163 | printf "Read %d arcs, %d nodes, %d node-items\n" (S.size arcs) (M.size nodeItems) (V.sum termCounts) 164 | printf "Mean items per node: %1.2f\n" (mean $ V.map realToFrac termCounts) 165 | 166 | withSystemRandom $ \mwc->do 167 | let nd = (if noClean args then id else cleanNetData) 168 | $ makeNetData (hyperParams args) nodeItems arcs (nTopics args) 169 | mapM_ putStrLn $ verifyNetData (\n->maybe (show n) show $ M.lookup n nodeMap) nd 170 | 171 | let nCitingNodes = VU.fromList $ M.elems $ M.unionsWith (+) 172 | $ map (\a->M.singleton (citingNode a) 1) 173 | $ S.toList $ dArcs nd 174 | 175 | nCitedNodes = VU.fromList $ M.elems $ M.unionsWith (+) 176 | $ map (\a->M.singleton (citedNode a) 1) 177 | $ S.toList $ dArcs nd 178 | printf "After cleaning: %d cited nodes, %d citing nodes, %d arcs, %d node-items\n" 179 | (S.size $ S.map citedNode $ dArcs nd) (S.size $ S.map citingNode $ dArcs nd) 180 | (S.size $ dArcs nd) (M.size $ dNodeItems nd) 181 | printf "In degree: mean=%3.1f, maximum=%3.1f\n" 182 | (mean nCitedNodes) (V.maximum nCitedNodes) 183 | printf "Out degree: mean=%3.1f, maximum=%3.1f\n" 184 | (mean nCitingNodes) (V.maximum nCitingNodes) 185 | 186 | encodeFile (sweepsDir "data") nd 187 | mInit <- runRVar (randomInitialize nd) mwc 188 | let m = model nd mInit 189 | Sampler.runSampler (samplerOpts args) m (updateUnits nd) 190 | return () 191 | -------------------------------------------------------------------------------- /network-topic-models/RunCINT.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, StandaloneDeriving #-} 2 | 3 | import Prelude hiding (mapM) 4 | import Control.Lens 5 | 6 | import Options.Applicative 7 | import Data.Monoid ((<>)) 8 | import Control.Monad.Trans.Class 9 | 10 | import Data.Vector (Vector) 11 | import qualified Data.Vector.Unboxed as VU 12 | import qualified Data.Vector.Generic as V 13 | import Statistics.Sample (mean) 14 | 15 | import Data.Traversable (mapM) 16 | import qualified Data.Set as S 17 | import Data.Set (Set) 18 | import qualified Data.Map as M 19 | 20 | import ReadData 21 | import SerializeText 22 | import qualified RunSampler as Sampler 23 | import BayesStack.DirMulti 24 | import BayesStack.Models.Topic.CitationInfluenceNoTopics 25 | import BayesStack.UniqueKey 26 | 27 | import qualified Data.Text as T 28 | import qualified Data.Text.IO as TIO 29 | 30 | import System.FilePath.Posix (()) 31 | import Data.Binary 32 | import qualified Data.ByteString as BS 33 | import Text.Printf 34 | 35 | import Data.Random 36 | import System.Random.MWC 37 | 38 | data RunOpts = RunOpts { arcsFile :: FilePath 39 | , nodesFile :: FilePath 40 | , stopwords :: Maybe FilePath 41 | , samplerOpts :: Sampler.SamplerOpts 42 | , hyperParams :: HyperParams 43 | , noClean :: Bool 44 | } 45 | 46 | runOpts = RunOpts 47 | <$> strOption ( long "arcs" 48 | <> short 'a' 49 | <> metavar "FILE" 50 | <> help "File containing arcs" 51 | ) 52 | <*> strOption ( long "nodes" 53 | <> short 'n' 54 | <> metavar "FILE" 55 | <> help "File containing nodes' items" 56 | ) 57 | <*> nullOption ( long "stopwords" 58 | <> short 's' 59 | <> metavar "FILE" 60 | <> reader (pure . Just) 61 | <> value Nothing 62 | <> help "Stop words list" 63 | ) 64 | <*> Sampler.samplerOpts 65 | <*> hyperOpts 66 | <*> flag False True ( long "no-clean" 67 | <> short 'c' 68 | <> help "Don't attempt to sanitize input data. Among other things, nodes without friends will not be discarded" 69 | ) 70 | 71 | hyperOpts = HyperParams 72 | <$> option ( long "prior-psi" 73 | <> value 1 74 | <> help "Dirichlet parameter for prior on psi" 75 | ) 76 | <*> option ( long "prior-lambda" 77 | <> value 0.01 78 | <> help "Dirichlet parameter for prior on lambda" 79 | ) 80 | <*> option ( long "prior-omega" 81 | <> value 0.01 82 | <> help "Dirichlet parameter for prior on omega" 83 | ) 84 | <*> option ( long "prior-gamma-shared" 85 | <> value 0.9 86 | <> help "Beta parameter for prior on gamma (shared)" 87 | ) 88 | <*> option ( long "prior-gamma-own" 89 | <> value 0.1 90 | <> help "Beta parameter for prior on gamma (own)" 91 | ) 92 | <*> option ( long "prior-beta-fg" 93 | <> value 0.1 94 | <> help "Beta parameter for prior on language background model (foreground)" 95 | ) 96 | <*> option ( long "prior-beta-bg" 97 | <> value 0.1 98 | <> help "Beta parameter for prior on language background model (background)" 99 | ) 100 | 101 | mapMKeys :: (Ord k, Ord k', Monad m, Applicative m) 102 | => (a -> m a') -> (k -> m k') -> M.Map k a -> m (M.Map k' a') 103 | mapMKeys f g x = M.fromList <$> (mapM (\(k,v)->(,) <$> g k <*> f v) $ M.assocs x) 104 | 105 | termsToItems :: M.Map NodeName [Term] -> Set (NodeName, NodeName) 106 | -> ( (M.Map Node [Item], Set (Node, Node)) 107 | , (M.Map Item Term, M.Map Node NodeName)) 108 | termsToItems nodes arcs = 109 | let ((d', nodeMap), itemMap) = 110 | runUniqueKey' [Item i | i <- [0..]] $ 111 | runUniqueKeyT' [Node i | i <- [0..]] $ do 112 | a <- mapMKeys (mapM (lift . getUniqueKey)) getUniqueKey nodes 113 | b <- S.fromList <$> mapM (\(x,y)->(,) <$> getUniqueKey x <*> getUniqueKey y) 114 | (S.toList arcs) 115 | return (a,b) 116 | in (d', (itemMap, nodeMap)) 117 | 118 | makeNetData :: HyperParams -> M.Map Node [Item] -> Set Arc -> NetData 119 | makeNetData hp nodeItems arcs = 120 | netData hp arcs items nodeItems' 121 | where items = M.unions $ concatMap (map $ flip M.singleton 1) $ M.elems nodeItems 122 | nodeItems' = M.fromList 123 | $ zip [NodeItem i | i <- [0..]] 124 | $ do (n,items) <- M.assocs nodeItems 125 | item <- items 126 | return (n, item) 127 | 128 | opts = info runOpts 129 | ( fullDesc 130 | <> progDesc "Learn citation influence model" 131 | <> header "run-ci - learn citation influence model" 132 | ) 133 | 134 | edgesToArcs :: Set (Node, Node) -> Set Arc 135 | edgesToArcs = S.map (\(a,b)->Arc (Citing a) (Cited b)) 136 | 137 | instance Sampler.SamplerModel MState where 138 | estimateHypers = id -- reestimate -- FIXME 139 | modelLikelihood = modelLikelihood 140 | summarizeHypers ms = "" -- FIXME 141 | 142 | main = do 143 | args <- execParser opts 144 | stopWords <- case stopwords args of 145 | Just f -> S.fromList . T.words <$> TIO.readFile f 146 | Nothing -> return S.empty 147 | printf "Read %d stopwords\n" (S.size stopWords) 148 | 149 | ((nodeItems, a), (itemMap, nodeMap)) <- termsToItems 150 | <$> readNodeItems stopWords (nodesFile args) 151 | <*> readEdges (arcsFile args) 152 | let arcs = edgesToArcs a 153 | 154 | Sampler.createSweeps $ samplerOpts args 155 | let sweepsDir = Sampler.sweepsDir $ samplerOpts args 156 | encodeFile (sweepsDir "item-map") itemMap 157 | encodeFile (sweepsDir "node-map") nodeMap 158 | 159 | let termCounts = V.fromListN (M.size nodeItems) 160 | $ map length $ M.elems nodeItems :: Vector Int 161 | printf "Read %d arcs, %d nodes, %d node-items\n" (S.size arcs) (M.size nodeItems) (V.sum termCounts) 162 | printf "Mean items per node: %1.2f\n" (mean $ V.map realToFrac termCounts) 163 | 164 | withSystemRandom $ \mwc->do 165 | let nd = (if noClean args then id else cleanNetData) 166 | $ makeNetData (hyperParams args) nodeItems arcs 167 | mapM_ putStrLn $ verifyNetData (\n->maybe (show n) show $ M.lookup n nodeMap) nd 168 | 169 | let nCitingNodes = VU.fromList $ M.elems $ M.unionsWith (+) 170 | $ map (\a->M.singleton (citingNode a) 1) 171 | $ S.toList $ nd^.dArcs 172 | 173 | nCitedNodes = VU.fromList $ M.elems $ M.unionsWith (+) 174 | $ map (\a->M.singleton (citedNode a) 1) 175 | $ S.toList $ nd^.dArcs 176 | printf "After cleaning: %d cited nodes, %d citing nodes, %d arcs, %d node-items\n" 177 | (S.size $ S.map citedNode $ nd^.dArcs) (S.size $ S.map citingNode $ nd^.dArcs) 178 | (nd ^. dArcs . to S.size) (nd ^. dNodeItems . to M.size) 179 | printf "In degree: mean=%3.1f, maximum=%3.1f\n" 180 | (mean nCitedNodes) (V.maximum nCitedNodes) 181 | printf "Out degree: mean=%3.1f, maximum=%3.1f\n" 182 | (mean nCitingNodes) (V.maximum nCitingNodes) 183 | 184 | encodeFile (sweepsDir "data") nd 185 | mInit <- runRVar (randomInitialize nd) mwc 186 | let m = model nd mInit 187 | Sampler.runSampler (samplerOpts args) m (updateUnits nd) 188 | return () 189 | -------------------------------------------------------------------------------- /network-topic-models/RunLDA.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, StandaloneDeriving #-} 2 | 3 | import Prelude hiding (mapM) 4 | 5 | import Options.Applicative 6 | import Data.Monoid ((<>)) 7 | import Control.Monad.Trans.Class 8 | 9 | import Data.Vector (Vector) 10 | import qualified Data.Vector.Generic as V 11 | import Statistics.Sample (mean) 12 | 13 | import Data.Traversable (mapM) 14 | import qualified Data.Set as S 15 | import Data.Set (Set) 16 | import qualified Data.Map.Strict as M 17 | 18 | import ReadData 19 | import SerializeText 20 | import qualified RunSampler as Sampler 21 | import BayesStack.DirMulti 22 | import BayesStack.Models.Topic.LDA 23 | import BayesStack.UniqueKey 24 | 25 | import qualified Data.Text as T 26 | import qualified Data.Text.IO as TIO 27 | 28 | import System.FilePath.Posix (()) 29 | import Data.Binary 30 | import qualified Data.ByteString as BS 31 | import Text.Printf 32 | 33 | import Data.Random 34 | import System.Random.MWC 35 | 36 | data RunOpts = RunOpts { nodesFile :: FilePath 37 | , stopwords :: Maybe FilePath 38 | , nTopics :: Int 39 | , samplerOpts :: Sampler.SamplerOpts 40 | , hyperParams :: HyperParams 41 | } 42 | 43 | runOpts :: Parser RunOpts 44 | runOpts = RunOpts 45 | <$> strOption ( long "nodes" 46 | <> short 'n' 47 | <> metavar "FILE" 48 | <> help "File containing nodes and their associated items" 49 | ) 50 | <*> nullOption ( long "stopwords" 51 | <> short 's' 52 | <> metavar "FILE" 53 | <> reader (pure . Just) 54 | <> value Nothing 55 | <> help "Stop word list" 56 | ) 57 | <*> option ( long "topics" 58 | <> short 't' 59 | <> metavar "N" 60 | <> value 20 61 | <> help "Number of topics" 62 | ) 63 | <*> Sampler.samplerOpts 64 | <*> hyperOpts 65 | 66 | hyperOpts = HyperParams 67 | <$> option ( long "prior-theta" 68 | <> value 1 69 | <> help "Dirichlet parameter for prior on theta" 70 | ) 71 | <*> option ( long "prior-phi" 72 | <> value 0.1 73 | <> help "Dirichlet parameter for prior on phi" 74 | ) 75 | 76 | mapMKeys :: (Ord k, Ord k', Monad m, Applicative m) 77 | => (a -> m a') -> (k -> m k') -> M.Map k a -> m (M.Map k' a') 78 | mapMKeys f g x = M.fromList <$> (mapM (\(k,v)->(,) <$> g k <*> f v) $ M.assocs x) 79 | 80 | termsToItems :: M.Map NodeName [Term] 81 | -> (M.Map Node [Item], (M.Map Item Term, M.Map Node NodeName)) 82 | termsToItems nodes = 83 | let ((d', nodeMap), itemMap) = 84 | runUniqueKey' [Item i | i <- [0..]] $ 85 | runUniqueKeyT' [Node i | i <- [0..]] $ do 86 | mapMKeys (mapM (lift . getUniqueKey)) getUniqueKey nodes 87 | in (d', (itemMap, nodeMap)) 88 | 89 | netData :: HyperParams -> M.Map Node [Item] -> Int -> NetData 90 | netData hp nodeItems nTopics = 91 | NetData { dHypers = hp 92 | , dItems = S.unions $ map S.fromList $ M.elems nodeItems 93 | , dTopics = S.fromList [Topic i | i <- [1..nTopics]] 94 | , dNodeItems = M.fromList 95 | $ zip [NodeItem i | i <- [0..]] 96 | $ do (n,items) <- M.assocs nodeItems 97 | item <- items 98 | return (n, item) 99 | , dNodes = M.keysSet nodeItems 100 | } 101 | 102 | opts :: ParserInfo RunOpts 103 | opts = info runOpts ( fullDesc 104 | <> progDesc "Learn LDA model" 105 | <> header "run-lda - learn LDA model" 106 | ) 107 | 108 | instance Sampler.SamplerModel MState where 109 | estimateHypers = reestimate 110 | modelLikelihood = modelLikelihood 111 | summarizeHypers ms = 112 | " phi : "++show (dmAlpha $ snd $ M.findMin $ stPhis ms)++"\n"++ 113 | " theta: "++show (dmAlpha $ snd $ M.findMin $ stThetas ms)++"\n" 114 | 115 | main :: IO () 116 | main = do 117 | args <- execParser opts 118 | stopWords <- case stopwords args of 119 | Just f -> S.fromList . T.words <$> TIO.readFile f 120 | Nothing -> return S.empty 121 | printf "Read %d stopwords\n" (S.size stopWords) 122 | 123 | (nodeItems, (itemMap, nodeMap)) <- termsToItems 124 | <$> readNodeItems stopWords (nodesFile args) 125 | 126 | Sampler.createSweeps $ samplerOpts args 127 | let sweepsDir = Sampler.sweepsDir $ samplerOpts args 128 | encodeFile (sweepsDir "item-map") itemMap 129 | encodeFile (sweepsDir "node-map") nodeMap 130 | 131 | let termCounts = V.fromListN (M.size nodeItems) 132 | $ map length $ M.elems nodeItems :: Vector Int 133 | printf "Read %d nodes\n" (M.size nodeItems) 134 | printf "Mean items per node: %1.2f\n" (mean $ V.map realToFrac termCounts) 135 | 136 | withSystemRandom $ \mwc->do 137 | let nd = netData (hyperParams args) nodeItems (nTopics args) 138 | encodeFile (sweepsDir "data") nd 139 | mInit <- runRVar (randomInitialize nd) mwc 140 | let m = model nd mInit 141 | Sampler.runSampler (samplerOpts args) m (updateUnits nd) 142 | return () 143 | -------------------------------------------------------------------------------- /network-topic-models/RunLDARelevance.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, StandaloneDeriving, TupleSections #-} 2 | 3 | import Prelude hiding (mapM) 4 | 5 | import Options.Applicative 6 | import Data.Monoid ((<>)) 7 | import Control.Monad.Trans.Class 8 | 9 | import Data.Vector (Vector) 10 | import qualified Data.Vector.Generic as V 11 | import Statistics.Sample (mean) 12 | 13 | import Data.Traversable (mapM) 14 | import qualified Data.Set as S 15 | import Data.Set (Set) 16 | import qualified Data.Map.Strict as M 17 | 18 | import qualified Data.Text as T 19 | import qualified Data.Text.IO as TIO 20 | import ReadData hiding (readNodeItems) 21 | import ReadRelevanceData (readNodeItems) 22 | import SerializeText 23 | import qualified RunSampler as Sampler 24 | import BayesStack.DirMulti 25 | import BayesStack.Models.Topic.LDARelevance 26 | import BayesStack.UniqueKey 27 | 28 | import System.Directory (createDirectoryIfMissing) 29 | import System.FilePath.Posix (()) 30 | import Data.Binary 31 | import qualified Data.ByteString as BS 32 | import Text.Printf 33 | 34 | import Data.Random 35 | import System.Random.MWC 36 | 37 | data RunOpts = RunOpts { nodesFile :: FilePath 38 | , stopwords :: Maybe FilePath 39 | , nTopics :: Int 40 | , samplerOpts :: Sampler.SamplerOpts 41 | , hyperParams :: HyperParams 42 | } 43 | 44 | runOpts :: Parser RunOpts 45 | runOpts = RunOpts 46 | <$> strOption ( long "nodes" 47 | <> short 'n' 48 | <> metavar "FILE" 49 | <> help "File containing nodes and their associated items" 50 | ) 51 | <*> nullOption ( long "stopwords" 52 | <> short 's' 53 | <> metavar "FILE" 54 | <> reader (pure . Just) 55 | <> value Nothing 56 | <> help "Stop word list" 57 | ) 58 | <*> option ( long "topics" 59 | <> short 't' 60 | <> metavar "N" 61 | <> value 20 62 | <> help "Number of topics" 63 | ) 64 | <*> Sampler.samplerOpts 65 | <*> hyperOpts 66 | 67 | hyperOpts = HyperParams 68 | <$> option ( long "prior-theta" 69 | <> value 1 70 | <> help "Dirichlet parameter for prior on theta" 71 | ) 72 | <*> option ( long "prior-phi" 73 | <> value 0.1 74 | <> help "Dirichlet parameter for prior on phi" 75 | ) 76 | 77 | mapMKeys :: (Ord k, Ord k', Monad m, Applicative m) 78 | => (a -> m a') -> (k -> m k') -> M.Map k a -> m (M.Map k' a') 79 | mapMKeys f g x = M.fromList <$> (mapM (\(k,v)->(,) <$> g k <*> f v) $ M.assocs x) 80 | 81 | termsToItems :: M.Map NodeName [(Term, ItemWeight)] 82 | -> (M.Map Node [(Item, ItemWeight)], (M.Map Item Term, M.Map Node NodeName)) 83 | termsToItems nodes = 84 | let ((d', nodeMap), itemMap) = 85 | runUniqueKey' [Item i | i <- [0..]] $ 86 | runUniqueKeyT' [Node i | i <- [0..]] $ do 87 | mapMKeys (mapM (\(x,w)->(,w) `fmap` lift (getUniqueKey x))) getUniqueKey nodes 88 | in (d', (itemMap, nodeMap)) 89 | 90 | netData :: HyperParams -> M.Map Node [(Item,ItemWeight)] -> Int -> NetData 91 | netData hp nodeItems nTopics = 92 | NetData { dHypers = hp 93 | , dItems = M.unions $ map M.fromList $ M.elems nodeItems 94 | , dTopics = S.fromList [Topic i | i <- [1..nTopics]] 95 | , dNodeItems = M.fromList 96 | $ zip [NodeItem i | i <- [0..]] 97 | $ do (n,items) <- M.assocs nodeItems 98 | (item,weight) <- items 99 | return (n, item) 100 | , dNodes = M.keysSet nodeItems 101 | } 102 | 103 | opts :: ParserInfo RunOpts 104 | opts = info runOpts ( fullDesc 105 | <> progDesc "Learn LDA model" 106 | <> header "run-lda - learn LDA model" 107 | ) 108 | 109 | instance Sampler.SamplerModel MState where 110 | estimateHypers = reestimate 111 | modelLikelihood = modelLikelihood 112 | summarizeHypers ms = 113 | " phi : "++show (dmAlpha $ snd $ M.findMin $ stPhis ms)++"\n"++ 114 | " theta: "++show (dmAlpha $ snd $ M.findMin $ stThetas ms)++"\n" 115 | 116 | main :: IO () 117 | main = do 118 | args <- execParser opts 119 | stopWords <- case stopwords args of 120 | Just f -> S.fromList . T.words <$> TIO.readFile f 121 | Nothing -> return S.empty 122 | printf "Read %d stopwords\n" (S.size stopWords) 123 | 124 | let parseError s = do 125 | putStrLn $ "Error parsing nodes: "++s 126 | putStrLn $ "Expected format is:" 127 | putStrLn $ "(node) (term) (weight) (term) (weight) ..." 128 | fail "" 129 | (nodeItems, (itemMap, nodeMap)) <- either parseError (return . termsToItems) 130 | =<< readNodeItems stopWords (nodesFile args) 131 | 132 | let sweepsDir = Sampler.sweepsDir $ samplerOpts args 133 | Sampler.createSweeps $ samplerOpts args 134 | encodeFile (sweepsDir "item-map") itemMap 135 | encodeFile (sweepsDir "node-map") nodeMap 136 | 137 | let termCounts = V.fromListN (M.size nodeItems) 138 | $ map length $ M.elems nodeItems :: Vector Int 139 | printf "Read %d nodes\n" (M.size nodeItems) 140 | printf "Mean items per node: %1.2f\n" (mean $ V.map realToFrac termCounts) 141 | 142 | withSystemRandom $ \mwc->do 143 | let nd = netData (hyperParams args) nodeItems (nTopics args) 144 | encodeFile (sweepsDir "data") nd 145 | mInit <- runRVar (randomInitialize nd) mwc 146 | let m = model nd mInit 147 | Sampler.runSampler (samplerOpts args) m (updateUnits nd) 148 | return () 149 | -------------------------------------------------------------------------------- /network-topic-models/RunST.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, StandaloneDeriving #-} 2 | 3 | import Prelude hiding (mapM) 4 | 5 | import Options.Applicative 6 | import Data.Monoid ((<>)) 7 | import Control.Monad.Trans.Class 8 | 9 | import Data.Vector (Vector) 10 | import qualified Data.Vector.Generic as V 11 | import Statistics.Sample (mean) 12 | 13 | import Data.Traversable (mapM) 14 | import qualified Data.Set as S 15 | import Data.Set (Set) 16 | import qualified Data.Map as M 17 | 18 | import ReadData 19 | import SerializeText 20 | import qualified RunSampler as Sampler 21 | import BayesStack.DirMulti 22 | import BayesStack.Models.Topic.SharedTaste 23 | import BayesStack.UniqueKey 24 | 25 | import qualified Data.Text as T 26 | import qualified Data.Text.IO as TIO 27 | 28 | import System.FilePath.Posix (()) 29 | import Data.Binary 30 | import qualified Data.ByteString as BS 31 | import Text.Printf 32 | 33 | import Data.Random 34 | import System.Random.MWC 35 | 36 | data RunOpts = RunOpts { arcsFile :: FilePath 37 | , nodesFile :: FilePath 38 | , stopwords :: Maybe FilePath 39 | , nTopics :: Int 40 | , samplerOpts :: Sampler.SamplerOpts 41 | , hyperParams :: HyperParams 42 | } 43 | 44 | runOpts = RunOpts 45 | <$> strOption ( long "edges" 46 | <> short 'e' 47 | <> metavar "FILE" 48 | <> help "File containing edges" 49 | ) 50 | <*> strOption ( long "nodes" 51 | <> short 'n' 52 | <> metavar "FILE" 53 | <> help "File containing nodes' items" 54 | ) 55 | <*> nullOption ( long "stopwords" 56 | <> short 's' 57 | <> metavar "FILE" 58 | <> reader (pure . Just) 59 | <> value Nothing 60 | <> help "Stop words list" 61 | ) 62 | <*> option ( long "topics" 63 | <> short 't' 64 | <> metavar "N" 65 | <> value 20 66 | <> help "Number of topics" 67 | ) 68 | <*> Sampler.samplerOpts 69 | <*> hyperOpts 70 | 71 | hyperOpts = HyperParams 72 | <$> option ( long "prior-psi" 73 | <> value 1 74 | <> help "Dirichlet parameter for prior on psi" 75 | ) 76 | <*> option ( long "prior-lambda" 77 | <> value 0.1 78 | <> help "Dirichlet parameter for prior on lambda" 79 | ) 80 | <*> option ( long "prior-phi" 81 | <> value 0.01 82 | <> help "Dirichlet parameter for prior on phi" 83 | ) 84 | <*> option ( long "prior-omega" 85 | <> value 0.01 86 | <> help "Dirichlet parameter for prior on omega" 87 | ) 88 | <*> option ( long "prior-gamma-shared" 89 | <> value 0.9 90 | <> help "Beta parameter for prior on gamma (shared)" 91 | ) 92 | <*> option ( long "prior-gamma-own" 93 | <> value 0.1 94 | <> help "Beta parameter for prior on gamma (own)" 95 | ) 96 | 97 | mapMKeys :: (Ord k, Ord k', Monad m, Applicative m) 98 | => (a -> m a') -> (k -> m k') -> M.Map k a -> m (M.Map k' a') 99 | mapMKeys f g x = M.fromList <$> (mapM (\(k,v)->(,) <$> g k <*> f v) $ M.assocs x) 100 | 101 | termsToItems :: M.Map NodeName [Term] -> Set (NodeName, NodeName) 102 | -> ( (M.Map Node [Item], Set (Node, Node)) 103 | , (M.Map Item Term, M.Map Node NodeName)) 104 | termsToItems nodes arcs = 105 | let ((d', nodeMap), itemMap) = 106 | runUniqueKey' [Item i | i <- [0..]] $ 107 | runUniqueKeyT' [Node i | i <- [0..]] $ do 108 | a <- mapMKeys (mapM (lift . getUniqueKey)) getUniqueKey nodes 109 | b <- S.fromList <$> mapM (\(x,y)->(,) <$> getUniqueKey x <*> getUniqueKey y) 110 | (S.toList arcs) 111 | return (a,b) 112 | in (d', (itemMap, nodeMap)) 113 | 114 | netData :: HyperParams -> M.Map Node [Item] -> Set Edge -> Int -> NetData 115 | netData hp nodeItems edges nTopics = 116 | NetData { dHypers = hp 117 | , dEdges = edges 118 | , dItems = S.unions $ map S.fromList $ M.elems nodeItems 119 | , dTopics = S.fromList [Topic i | i <- [1..nTopics]] 120 | , dNodeItems = M.fromList 121 | $ zip [NodeItem i | i <- [0..]] 122 | $ do (n,items) <- M.assocs nodeItems 123 | item <- items 124 | return (n, item) 125 | } 126 | 127 | opts = info runOpts 128 | ( fullDesc 129 | <> progDesc "Learn shared taste model" 130 | <> header "run-st - learn shared taste model" 131 | ) 132 | 133 | instance Sampler.SamplerModel MState where 134 | estimateHypers = id -- reestimate -- FIXME 135 | modelLikelihood = modelLikelihood 136 | summarizeHypers ms = "" -- FIXME 137 | 138 | main = do 139 | args <- execParser opts 140 | stopWords <- case stopwords args of 141 | Just f -> S.fromList . T.words <$> TIO.readFile f 142 | Nothing -> return S.empty 143 | printf "Read %d stopwords\n" (S.size stopWords) 144 | 145 | ((nodeItems, a), (itemMap, nodeMap)) <- termsToItems 146 | <$> readNodeItems stopWords (nodesFile args) 147 | <*> readEdges (arcsFile args) 148 | let edges = S.map Edge a 149 | 150 | Sampler.createSweeps $ samplerOpts args 151 | let sweepsDir = Sampler.sweepsDir $ samplerOpts args 152 | encodeFile (sweepsDir "item-map") itemMap 153 | encodeFile (sweepsDir "node-map") nodeMap 154 | 155 | let termCounts = V.fromListN (M.size nodeItems) 156 | $ map length $ M.elems nodeItems :: Vector Int 157 | printf "Read %d edges, %d items\n" (S.size edges) (M.size nodeItems) 158 | printf "Mean items per node: %1.2f\n" (mean $ V.map realToFrac termCounts) 159 | 160 | withSystemRandom $ \mwc->do 161 | let nd = netData (hyperParams args) nodeItems edges 10 162 | encodeFile (sweepsDir "data") nd 163 | mInit <- runRVar (randomInitialize nd) mwc 164 | let m = model nd mInit 165 | Sampler.runSampler (samplerOpts args) m (updateUnits nd) 166 | return () 167 | -------------------------------------------------------------------------------- /network-topic-models/RunSampler.hs: -------------------------------------------------------------------------------- 1 | module RunSampler ( SamplerModel (..) 2 | , SamplerOpts (..), samplerOpts 3 | , runSampler 4 | , createSweeps 5 | ) where 6 | 7 | import Options.Applicative 8 | import Data.Monoid ((<>)) 9 | import System.FilePath.Posix (()) 10 | import System.Directory 11 | 12 | import Control.Monad (when, forM_, void) 13 | import qualified Control.Monad.Trans.State as S 14 | import Control.Monad.IO.Class 15 | 16 | import Data.Binary 17 | import qualified Data.ByteString as BS 18 | import Text.Printf 19 | 20 | import Control.Concurrent 21 | import Control.Concurrent.STM 22 | 23 | import System.Random.MWC 24 | import Data.Random 25 | 26 | import Numeric.Log 27 | import BayesStack.Gibbs.Concurrent 28 | 29 | data SamplerOpts = SamplerOpts { burnin :: Int 30 | , lag :: Int 31 | , iterations :: Maybe Int 32 | , updateBlock :: Int 33 | , sweepsDir :: FilePath 34 | , nCaps :: Int 35 | , hyperEstOpts :: HyperEstOpts 36 | } 37 | 38 | samplerOpts = SamplerOpts 39 | <$> option ( long "burnin" 40 | <> short 'b' 41 | <> metavar "N" 42 | <> value 100 43 | <> help "Number of sweeps to run before taking samples" 44 | ) 45 | <*> option ( long "lag" 46 | <> short 'l' 47 | <> metavar "N" 48 | <> value 10 49 | <> help "Number of sweeps between diagnostic samples" 50 | ) 51 | <*> option ( long "iterations" 52 | <> short 'i' 53 | <> metavar "N" 54 | <> value Nothing 55 | <> reader (pure . auto) 56 | <> help "Number of sweeps to run for" 57 | ) 58 | <*> option ( long "diff-batch" 59 | <> short 'u' 60 | <> metavar "N" 61 | <> value 100 62 | <> help "Number of update diffs to batch before updating global state" 63 | ) 64 | <*> strOption ( long "sweeps" 65 | <> short 'S' 66 | <> metavar "DIR" 67 | <> value "sweeps" 68 | <> help "Directory in which to place model state output" 69 | ) 70 | <*> option ( long "threads" 71 | <> short 'N' 72 | <> value 1 73 | <> metavar "INT" 74 | <> help "Number of worker threads to start" 75 | ) 76 | <*> hyperEstOpts' 77 | 78 | data HyperEstOpts = HyperEstOpts { hyperEst :: Bool 79 | , hyperBurnin :: Int 80 | , hyperLag :: Int 81 | } 82 | 83 | hyperEstOpts' = HyperEstOpts 84 | <$> switch ( long "hyper" 85 | <> short 'H' 86 | <> help "Enable hyperparameter estimation" 87 | ) 88 | <*> option ( long "hyper-burnin" 89 | <> metavar "N" 90 | <> value 10 91 | <> help "Number of sweeps before starting hyperparameter estimations (must be multiple of --lag)" 92 | ) 93 | <*> option ( long "hyper-lag" 94 | <> metavar "L" 95 | <> value 10 96 | <> help "Number of sweeps between hyperparameter estimations (must be multiple of --lag)" 97 | ) 98 | 99 | class Binary ms => SamplerModel ms where 100 | modelLikelihood :: ms -> Log Double 101 | estimateHypers :: ms -> ms 102 | summarizeHypers :: ms -> String 103 | 104 | processSweep :: SamplerModel ms => SamplerOpts -> TVar (Log Double) -> Int -> ms -> IO () 105 | processSweep opts lastMaxV sweepN m = do 106 | let l = modelLikelihood m 107 | putStr $ printf "Sweep %d: %f\n" sweepN (ln l) 108 | appendFile (sweepsDir opts "likelihood.log") 109 | $ printf "%d\t%f\n" sweepN (ln l) 110 | when (sweepN >= burnin opts) $ do 111 | newMax <- atomically $ do oldL <- readTVar lastMaxV 112 | if l > oldL then writeTVar lastMaxV l >> return True 113 | else return False 114 | when (newMax && sweepN >= burnin opts) $ 115 | let fname = sweepsDir opts printf "%05d.state" sweepN 116 | in encodeFile fname m 117 | 118 | doEstimateHypers :: SamplerModel ms => SamplerOpts -> Int -> S.StateT ms IO () 119 | doEstimateHypers opts@(SamplerOpts {hyperEstOpts=HyperEstOpts True burnin lag}) iterN 120 | | iterN >= burnin && iterN `mod` lag == 0 = do 121 | liftIO $ putStrLn "Parameter estimation" 122 | m <- S.get 123 | S.modify estimateHypers 124 | m' <- S.get 125 | void $ liftIO $ forkIO 126 | $ appendFile (sweepsDir opts "hyperparams.log") 127 | $ printf "%5d\t%f\t%f\t%s\n" 128 | iterN 129 | (ln $ modelLikelihood m) 130 | (ln $ modelLikelihood m') 131 | (summarizeHypers m') 132 | doEstimateHypers _ _ = return () 133 | 134 | withSystemRandomIO = withSystemRandom :: (GenIO -> IO a) -> IO a 135 | 136 | samplerIter :: SamplerModel ms => SamplerOpts -> [WrappedUpdateUnit ms] 137 | -> TMVar () -> TVar (Log Double) 138 | -> Int -> S.StateT ms IO () 139 | samplerIter opts uus processSweepRunning lastMaxV lagN = do 140 | let sweepN = lagN * lag opts 141 | shuffledUus <- liftIO $ withSystemRandomIO $ \mwc->runRVar (shuffle uus) mwc 142 | let uus' = concat $ replicate (lag opts) shuffledUus 143 | m <- S.get 144 | S.put =<< liftIO (gibbsUpdate (nCaps opts) (updateBlock opts) m uus') 145 | when (sweepN == burnin opts) $ liftIO $ putStrLn "Burn-in complete" 146 | S.get >>= \m->do liftIO $ atomically $ takeTMVar processSweepRunning 147 | void $ liftIO $ forkIO $ do 148 | processSweep opts lastMaxV sweepN m 149 | liftIO $ atomically $ putTMVar processSweepRunning () 150 | 151 | doEstimateHypers opts sweepN 152 | 153 | checkOpts :: SamplerOpts -> IO () 154 | checkOpts opts = do 155 | let hyperOpts = hyperEstOpts opts 156 | when (burnin opts `mod` lag opts /= 0) 157 | $ error "--burnin must be multiple of --lag" 158 | when (hyperEst hyperOpts && hyperBurnin hyperOpts `mod` lag opts /= 0) 159 | $ error "--hyper-burnin must be multiple of --lag" 160 | when (hyperEst hyperOpts && hyperLag hyperOpts `mod` lag opts /= 0) 161 | $ error "--hyper-lag must be multiple of --lag" 162 | 163 | runSampler :: SamplerModel ms => SamplerOpts -> ms -> [WrappedUpdateUnit ms] -> IO () 164 | runSampler opts m uus = do 165 | checkOpts opts 166 | setNumCapabilities (nCaps opts) 167 | createDirectoryIfMissing False (sweepsDir opts) 168 | putStrLn "Starting sampler..." 169 | putStrLn $ "Initial likelihood = "++show (ln $ modelLikelihood m) 170 | putStrLn $ "Burning in for "++show (burnin opts)++" samples" 171 | let lagNs = maybe [0..] (\n->[0..n `div` lag opts]) $ iterations opts 172 | lastMaxV <- atomically $ newTVar 0 173 | processSweepRunning <- atomically $ newTMVar () 174 | void $ S.runStateT (forM_ lagNs (samplerIter opts uus processSweepRunning lastMaxV)) m 175 | atomically $ takeTMVar processSweepRunning 176 | 177 | createSweeps :: SamplerOpts -> IO () 178 | createSweeps (SamplerOpts {sweepsDir=sweepsDir}) = do 179 | exists <- doesDirectoryExist sweepsDir 180 | if exists 181 | then error "Sweeps directory already exists" 182 | else createDirectory sweepsDir 183 | -------------------------------------------------------------------------------- /network-topic-models/SerializeText.hs: -------------------------------------------------------------------------------- 1 | module SerializeText () where 2 | 3 | import Control.Applicative 4 | import Data.Binary 5 | import qualified Data.Text as T 6 | import qualified Data.Text.Encoding as TE 7 | 8 | -- FIXME: Why isn't there already an instance? 9 | instance Binary T.Text where 10 | put = put . TE.encodeUtf8 11 | get = TE.decodeUtf8 <$> get 12 | -------------------------------------------------------------------------------- /network-topic-models/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /network-topic-models/TopicLanguageModel.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import qualified Data.ByteString as BS 4 | import qualified Data.Text as Text 5 | import qualified Data.Foldable as F 6 | import Data.List (sort, sortBy, isSuffixOf) 7 | import Data.Function 8 | import Data.Map (Map) 9 | import qualified Data.Map as M 10 | import Data.Monoid 11 | import Data.Tuple (swap) 12 | import qualified Data.Traversable as T 13 | import Options.Applicative 14 | 15 | import qualified Data.Text.Lazy.IO as TL 16 | import Data.Text.Lazy.Builder.Int 17 | import qualified Data.Text.Lazy.Builder as TB 18 | import Data.Binary 19 | 20 | import System.Directory 21 | import System.FilePath ((), takeDirectory) 22 | import Text.Printf 23 | 24 | import BayesStack.Types 25 | import BayesStack.DirMulti 26 | import BayesStack.Models.Topic.LDARelevance 27 | import SerializeText 28 | import ReadData 29 | import FormatMultinom 30 | 31 | data Opts = Opts { inputFiles :: [FilePath] 32 | , sweepNum :: Int 33 | , documents :: [NodeName] 34 | , topN :: Maybe Int 35 | } 36 | 37 | opts = Opts 38 | <$> arguments pure 39 | ( metavar "DIR|FILE" 40 | <> help "The directory of sweeps to dump" 41 | ) 42 | <*> option ( long "number" 43 | <> short 'N' 44 | <> value 10 45 | <> metavar "N" 46 | <> help "The number of sweeps to aggregate" 47 | ) 48 | <*> option ( long "document" 49 | <> short 'd' 50 | <> reader (fmap ((:[]) . Text.pack) . str) 51 | <> help "The documents to dump" 52 | ) 53 | <*> option ( long "top" 54 | <> short 'n' 55 | <> value Nothing 56 | <> reader (pure . auto) 57 | <> metavar "N" 58 | <> help "How many items to list for each" 59 | ) 60 | 61 | readSweep :: FilePath -> IO MState 62 | readSweep = decodeFile 63 | 64 | readNetData :: FilePath -> IO NetData 65 | readNetData = decodeFile 66 | 67 | readPhi' :: Node -> FilePath -> IO (Map Item Double) 68 | readPhi' n filePath = f <$> readSweep filePath 69 | where f s = M.unionsWith (+) $ do 70 | (pt,t) <- F.toList $ probabilities $ stThetas s M.! n 71 | (px,x) <- F.toList $ probabilities $ stPhis s M.! t 72 | return $ M.singleton x (pt*px) 73 | 74 | readPhi :: [NodeName] -> FilePath -> IO (Map NodeName (Map Term Double)) 75 | readPhi nodes filePath = do 76 | let sweepDir = takeDirectory filePath 77 | itemMap <- readItemMap sweepDir 78 | nodeMap <- readNodeMap sweepDir 79 | let invNodeMap = M.fromList $ map swap $ M.assocs nodeMap 80 | let f n = let n' = invNodeMap M.! n 81 | in M.singleton n . M.mapKeys (itemMap M.!) <$> readPhi' n' filePath 82 | M.unionsWith (M.unionWith (+)) <$> mapM f nodes 83 | 84 | getSweeps :: FilePath -> IO [FilePath] 85 | getSweeps sweepsDir = 86 | map (sweepsDir ) . sort . filter (".state" `isSuffixOf`) 87 | <$> getDirectoryContents sweepsDir 88 | 89 | main = do 90 | args <- execParser $ info (helper <*> opts) 91 | ( fullDesc 92 | <> progDesc "Dump distributions from an LDA sweep" 93 | <> header "dump-lda - Dump distributions from an LDA sweep" 94 | ) 95 | 96 | --nd <- readNetData $ sweepDir args "data" 97 | 98 | files <- T.forM (inputFiles args) $ \fname->do 99 | isDir <- doesDirectoryExist fname 100 | isFile <- doesFileExist fname 101 | case () of 102 | _ | isFile -> 103 | return [fname] 104 | _ | isDir -> 105 | take (sweepNum args) . reverse <$> getSweeps fname 106 | _ | otherwise -> error "oops" 107 | 108 | sweeps <- T.forM (concat files) $ readPhi (documents args) 109 | let sweeps' = M.unionsWith (M.unionWith (+)) sweeps 110 | :: Map NodeName (Map Term Double) 111 | takeTopN = maybe id take $ topN args 112 | F.forM_ (M.assocs sweeps') $ \(n,items)->do 113 | print n 114 | F.forM_ (takeTopN $ sortBy (flip compare `on` snd) $ M.assocs items) $ \(x,p)->do 115 | printf "\t%s\t%1.2e\n" (show x) p 116 | putStrLn "" 117 | -------------------------------------------------------------------------------- /network-topic-models/doc/models/model_ci.tex: -------------------------------------------------------------------------------- 1 | % model_citation_influence.tex 2 | % 3 | % Copyright (C) 2010,2011 Laura Dietz 4 | % Copyright (C) 2012 Jaakko Luttinen 5 | % 6 | % This file may be distributed and/or modified 7 | % 8 | % 1. under the LaTeX Project Public License and/or 9 | % 2. under the GNU General Public License. 10 | % 11 | % See the files LICENSE_LPPL and LICENSE_GPL for more details. 12 | 13 | % Citation influence model 14 | % Cite this model as 15 | % Laura Dietz, Steffen Bickel, Tobias Scheffer. 16 | % Unsupervised Prediction of Citation Influences. 17 | % In: Proceedings of International Conference on Machine Learning. 2007 18 | 19 | %\beginpgfgraphicnamed{model-citation-influence} 20 | \begin{tikzpicture} 21 | 22 | % Layout the variables 23 | \matrix[row sep=0.5cm, column sep=1.2cm] (LDA) 24 | { % 25 | & % 26 | & % 27 | \node[latent] (psi) {$\psi$} ; & % 28 | \node[latent] (gamma) {$\gamma$} ; & % 29 | \node[latent] (omega) {$\Omega$} ; & % 30 | \\ 31 | \\ 32 | \node[latent] (lambda) {$\lambda$} ; & % 33 | & % 34 | \node[latent] (C) {$C$} ; & % 35 | \node[latent] (S) {$S$} ; & % 36 | \\ 37 | & % 38 | & % 39 | \factor {T-f1} {Multi} {} {}; & % 40 | & % 41 | \factor {T-f2} {below:Multi} {} {}; & % 42 | \\ 43 | \node[latent] (T') {$T'$} ; & % 44 | & % 45 | \node[latent] (T) {$T$} ; & % 46 | \\ 47 | \factor {X'-f} {Multi} {} {}; & % 48 | \node[latent] (phi) {$\phi$} ; & % 49 | \factor {X-f} {Multi} {} {}; % 50 | \\ 51 | \node[obs] (X') {$X'$} ; & % 52 | & % 53 | \node[obs] (X) {$X$} ; 54 | \\ 55 | }; 56 | 57 | % Remaining factors 58 | \factor[above=of T'] {T'-f} {left:Multi} {} {}; % 59 | \factor[above=of lambda] {lambda-f} {left:Dir} {} {}; % 60 | \factor[above=of C] {C-f} {left:Multi} {} {}; % 61 | \factor[above=of S] {S-f} {left:Bern} {} {}; % 62 | \factor[above=of psi] {psi-f} {left:Dir} {} {}; % 63 | \factor[above=of gamma] {gamma-f} {left:Beta} {} {}; % 64 | \factor[above=of omega] {omega-f} {left:Dir} {} {}; % 65 | \factor[above=of phi] {phi-f} {left:Dir} {} {}; % 66 | 67 | % Hyperparameters 68 | \node[const, above=of lambda] (alambda) {$\alpha_\lambda$}; % 69 | \node[const, above=of phi] (aphi) {$\alpha_\phi$}; % 70 | \node[const, above=of psi] (apsi) {$\alpha_\psi$}; % 71 | \node[const, above=of gamma, xshift=-0.5cm] (agamma1) 72 | {$\alpha_{\gamma_\omega}$}; % 73 | \node[const, above=of gamma, xshift=0.5cm] (agamma2) 74 | {$\alpha_{\gamma_\psi}$}; % 75 | \node[const, above=of omega] (aomega) {$\alpha_\omega$}; % 76 | 77 | % Factor connections 78 | \factoredge {phi} {X'-f} {X'} ; % 79 | \factoredge {phi} {X-f} {X} ; % 80 | \factoredge {lambda} {T'-f} {T'} ; % 81 | \factoredge {lambda} {T-f1} {T} ; % 82 | \factoredge {omega} {T-f2} {T} ; % 83 | \factoredge {alambda} {lambda-f} {lambda} ; % 84 | \factoredge {psi} {C-f} {C} ; % 85 | \factoredge {gamma} {S-f} {S} ; % 86 | \factoredge {apsi} {psi-f} {psi} ; % 87 | \factoredge {agamma1,agamma2} {gamma-f} {gamma} ; % 88 | \factoredge {aomega} {omega-f} {omega} ; % 89 | \factoredge {aphi} {phi-f} {phi} ; % 90 | 91 | % Gates 92 | \gate {X'-gate} {(X'-f)(X'-f-caption)} {T'} ; % 93 | \gate {X-gate} {(X-f)(X-f-caption)} {T} ; % 94 | \gate {T-gate} {(T-f1)(T-f1-caption)} {C} ; 95 | \vgate {T-vgate} % 96 | {(T-gate)} {$S=0$} % 97 | {(T-f2)(T-f2-caption)} {$S=1$} % 98 | {S} ; % 99 | 100 | % Plates 101 | \plate {LDA1} { % 102 | (T')(T'-f)(T'-f-caption) % 103 | (X')(X'-gate) % 104 | } {$\forall w' \in c$} ; % 105 | 106 | \plate {LDA2} { % 107 | (LDA1) % 108 | (lambda)(lambda-f)(lambda-f-caption) % 109 | } {$\forall c \in \mathcal{C}$} ; % 110 | 111 | \plate {} { % 112 | (phi)(phi-f)(phi-f-caption) % 113 | } {$\forall t \in \mathcal{T}$} ; % 114 | 115 | \plate {P1} { % 116 | (X)(X-gate) % 117 | (T)(T-vgate) % 118 | (C)(C-f)(C-f-caption) % 119 | (S)(S-f)(S-f-caption) % 120 | } {$\forall w \in d$} ; 121 | 122 | \plate {} { % 123 | (P1) % 124 | (psi)(psi-f)(psi-f-caption) % 125 | (gamma)(gamma-f)(gamma-f-caption) % 126 | (omega)(omega-f)(omega-f-caption) % 127 | } {$\forall d \in \mathcal{D}$} ; % 128 | 129 | \end{tikzpicture} 130 | %\endpgfgraphicnamed 131 | 132 | %%% Local Variables: 133 | %%% mode: tex-pdf 134 | %%% TeX-master: "example" 135 | %%% End: 136 | -------------------------------------------------------------------------------- /network-topic-models/doc/models/model_ci_nt.tex: -------------------------------------------------------------------------------- 1 | % model_citation_influence.tex 2 | % 3 | % Copyright (C) 2010,2011 Laura Dietz 4 | % Copyright (C) 2012 Jaakko Luttinen 5 | % 6 | % This file may be distributed and/or modified 7 | % 8 | % 1. under the LaTeX Project Public License and/or 9 | % 2. under the GNU General Public License. 10 | % 11 | % See the files LICENSE_LPPL and LICENSE_GPL for more details. 12 | 13 | % Citation influence model 14 | % Cite this model as 15 | % Laura Dietz, Steffen Bickel, Tobias Scheffer. 16 | % Unsupervised Prediction of Citation Influences. 17 | % In: Proceedings of International Conference on Machine Learning. 2007 18 | 19 | %\beginpgfgraphicnamed{model-citation-influence} 20 | \begin{tikzpicture} 21 | 22 | % Layout the variables 23 | \matrix[row sep=0.5cm, column sep=1.2cm] (LDA) 24 | { % 25 | & % 26 | & % 27 | \node[latent] (psi) {$\psi$} ; & % 28 | \node[latent] (gamma) {$\gamma$} ; & % 29 | \node[latent] (omega) {$\Omega$} ; & % 30 | \\ 31 | \\ 32 | \node[latent] (lambda) {$\lambda$} ; & % 33 | & % 34 | \node[latent] (C) {$C$} ; & % 35 | \node[latent] (S) {$S$} ; & % 36 | \\ 37 | \factor {X'-f} {left:Multi} {} {}; & % 38 | & % 39 | \factor {X-f1} {Multi} {} {}; & % 40 | & % 41 | \factor {X-f2} {below:Multi} {} {}; & % 42 | \\ 43 | \node[obs] (X') {$X'$} ; & % 44 | & % 45 | \node[obs] (X) {$X$} ; 46 | \\ 47 | }; 48 | 49 | % Remaining factors 50 | \factor[above=of lambda] {lambda-f} {left:Dir} {} {}; % 51 | \factor[above=of C] {C-f} {left:Multi} {} {}; % 52 | \factor[above=of S] {S-f} {left:Bern} {} {}; % 53 | \factor[above=of psi] {psi-f} {left:Dir} {} {}; % 54 | \factor[above=of gamma] {gamma-f} {left:Beta} {} {}; % 55 | \factor[above=of omega] {omega-f} {left:Dir} {} {}; % 56 | 57 | % Hyperparameters 58 | \node[const, above=of lambda] (alambda) {$\alpha_\lambda$}; % 59 | \node[const, above=of psi] (apsi) {$\alpha_\psi$}; % 60 | \node[const, above=of gamma, xshift=-0.5cm] (agamma1) 61 | {$\alpha_{\gamma_\omega}$}; % 62 | \node[const, above=of gamma, xshift=0.5cm] (agamma2) 63 | {$\alpha_{\gamma_\psi}$}; % 64 | \node[const, above=of omega] (aomega) {$\alpha_\omega$}; % 65 | 66 | % Factor connections 67 | \factoredge {lambda} {X'-f} {X'} ; % 68 | \factoredge {lambda} {X-f1} {X} ; % 69 | \factoredge {omega} {X-f2} {X} ; % 70 | \factoredge {alambda} {lambda-f} {lambda} ; % 71 | \factoredge {psi} {C-f} {C} ; % 72 | \factoredge {gamma} {S-f} {S} ; % 73 | \factoredge {apsi} {psi-f} {psi} ; % 74 | \factoredge {agamma1,agamma2} {gamma-f} {gamma} ; % 75 | \factoredge {aomega} {omega-f} {omega} ; % 76 | 77 | % Gates 78 | \gate {X-gate} {(X-f1)(X-f1-caption)} {C} ; 79 | \vgate {X-vgate} % 80 | {(X-gate)} {$S=0$} % 81 | {(X-f2)(X-f2-caption)} {$S=1$} % 82 | {S} ; % 83 | 84 | % Plates 85 | \plate {LDA1} { % 86 | (X'-f)(X'-f-caption) % 87 | (X')(X'-gate) % 88 | } {$\forall w' \in c$} ; % 89 | 90 | \plate {LDA2} { % 91 | (LDA1) % 92 | (lambda)(lambda-f)(lambda-f-caption) % 93 | } {$\forall c \in \mathcal{C}$} ; % 94 | 95 | \plate {P1} { % 96 | (X)(X-gate) % 97 | (T)(T-vgate) % 98 | (C)(C-f)(C-f-caption) % 99 | (S)(S-f)(S-f-caption) % 100 | } {$\forall w \in d$} ; 101 | 102 | \plate {} { % 103 | (P1) % 104 | (psi)(psi-f)(psi-f-caption) % 105 | (gamma)(gamma-f)(gamma-f-caption) % 106 | (omega)(omega-f)(omega-f-caption) % 107 | } {$\forall d \in \mathcal{D}$} ; % 108 | 109 | \end{tikzpicture} 110 | %\endpgfgraphicnamed 111 | 112 | %%% Local Variables: 113 | %%% mode: tex-pdf 114 | %%% TeX-master: "example" 115 | %%% End: 116 | -------------------------------------------------------------------------------- /network-topic-models/doc/models/model_lda.tex: -------------------------------------------------------------------------------- 1 | % model_lda.tex 2 | % 3 | % Copyright (C) 2010,2011 Laura Dietz 4 | % Copyright (C) 2012 Jaakko Luttinen 5 | % 6 | % This file may be distributed and/or modified 7 | % 8 | % 1. under the LaTeX Project Public License and/or 9 | % 2. under the GNU General Public License. 10 | % 11 | % See the files LICENSE_LPPL and LICENSE_GPL for more details. 12 | 13 | % Latent Diriclet allocation model 14 | 15 | %\beginpgfgraphicnamed{model-lda} 16 | \begin{tikzpicture}[x=1.7cm,y=1.8cm] 17 | 18 | % Nodes 19 | 20 | \node[obs] (X) {$X$} ; % 21 | \node[latent, above=of X] (T) {$T$} ; % 22 | \node[latent, above=of T] (theta) {$\theta$}; % 23 | \node[const, above=of theta] (atheta) {$\alpha_\theta$}; 24 | 25 | 26 | % Factors 27 | \factor[above=of X] {X-f} {Multi} {} {} ; % 28 | \factor[above=of T] {T-f} {left:Multi} {} {} ; % 29 | \factor[above=of theta] {theta-f} {left:Dir} {} {} ; % 30 | 31 | % More nodes 32 | \node[latent, right=of X-f] (phi) {$\phi$}; % 33 | \node[const, above=of phi] (aphi) {$\alpha_\phi$}; % 34 | 35 | \factor[above=of phi] {phi-f} {right:Dir} {} {} ; % 36 | 37 | \factoredge {theta} {T-f} {T} ; % 38 | \factoredge {atheta} {theta-f} {theta} ; % 39 | \factoredge {phi} {X-f} {X} ; % 40 | \factoredge {aphi} {phi-f} {phi} ; % 41 | 42 | \gate {X-gate} {(X-f)(X-f-caption)} {T} 43 | 44 | \plate {plate1} { % 45 | (X)(X-gate) % 46 | (T)(T-f)(T-f-caption) % 47 | } {$\forall 1 \leq i \leq n_d$}; % 48 | \plate {} { % 49 | (plate1) % 50 | (theta)(theta-f)(theta-f-caption) % 51 | } {$\forall d \in \mathcal{D}$} ; % 52 | \plate {} { % 53 | (phi)(phi-f)(phi-f-caption) % 54 | } {$\forall t \in \mathcal{T}$} ; % 55 | 56 | \end{tikzpicture} 57 | %\endpgfgraphicnamed 58 | 59 | %%% Local Variables: 60 | %%% mode: tex-pdf 61 | %%% TeX-master: "example" 62 | %%% End: 63 | -------------------------------------------------------------------------------- /network-topic-models/doc/models/model_st.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bgamari/bayes-stack/020df7bb7263104fdea254e57d6c7daf7806da3e/network-topic-models/doc/models/model_st.tex -------------------------------------------------------------------------------- /network-topic-models/doc/models/models.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{article} 2 | 3 | \usepackage{tikz} 4 | \usetikzlibrary{bayesnet} 5 | 6 | \begin{document} 7 | 8 | \section{Latent dirichlet allocation} 9 | \input{model_lda} 10 | 11 | \section{Latent dirichlet allocation (relevance)} 12 | This model is a variant on LDA which allows arbitrary item-wise 13 | weights to be assigned. This can be useful in document relevance 14 | scoring. 15 | 16 | \section{Shared taste model} 17 | \input{model_st} 18 | 19 | \section{Citation influence model} 20 | \input{model_ci} 21 | 22 | \section{Citation influence model with no topics} 23 | \input{model_ci_nt} 24 | 25 | \end{document} 26 | -------------------------------------------------------------------------------- /network-topic-models/doc/models/tikzlibrarybayesnet.code.tex: -------------------------------------------------------------------------------- 1 | % tikzlibrary.code.tex 2 | % 3 | % Copyright 2010-2011 by Laura Dietz 4 | % Copyright 2012 by Jaakko Luttinen 5 | % 6 | % This file may be distributed and/or modified 7 | % 8 | % 1. under the LaTeX Project Public License and/or 9 | % 2. under the GNU General Public License. 10 | % 11 | % See the files LICENSE_LPPL and LICENSE_GPL for more details. 12 | 13 | % Load other libraries 14 | \usetikzlibrary{shapes} 15 | \usetikzlibrary{fit} 16 | \usetikzlibrary{chains} 17 | \usetikzlibrary{arrows} 18 | 19 | % Latent node 20 | \tikzstyle{latent} = [circle,fill=white,draw=black,inner sep=1pt, 21 | minimum size=20pt, font=\fontsize{10}{10}\selectfont, node distance=1] 22 | % Observed node 23 | \tikzstyle{obs} = [latent,fill=gray!25] 24 | % Constant node 25 | \tikzstyle{const} = [rectangle, inner sep=0pt, node distance=1] 26 | % Factor node 27 | \tikzstyle{factor} = [rectangle, fill=black,minimum size=5pt, inner 28 | sep=0pt, node distance=0.4] 29 | % Deterministic node 30 | \tikzstyle{det} = [latent, diamond] 31 | 32 | % Plate node 33 | \tikzstyle{plate} = [draw, rectangle, rounded corners, fit=#1] 34 | % Invisible wrapper node 35 | \tikzstyle{wrap} = [inner sep=0pt, fit=#1] 36 | % Gate 37 | \tikzstyle{gate} = [draw, rectangle, dashed, fit=#1] 38 | 39 | % Caption node 40 | \tikzstyle{caption} = [font=\footnotesize, node distance=0] % 41 | \tikzstyle{plate caption} = [caption, node distance=0, inner sep=0pt, 42 | below left=5pt and 0pt of #1.south east] % 43 | \tikzstyle{factor caption} = [caption] % 44 | \tikzstyle{every label} += [caption] % 45 | 46 | \tikzset{>={triangle 45}} 47 | 48 | %\pgfdeclarelayer{b} 49 | %\pgfdeclarelayer{f} 50 | %\pgfsetlayers{b,main,f} 51 | 52 | % \factoredge [options] {inputs} {factors} {outputs} 53 | \newcommand{\factoredge}[4][]{ % 54 | % Connect all nodes #2 to all nodes #4 via all factors #3. 55 | \foreach \f in {#3} { % 56 | \foreach \x in {#2} { % 57 | \draw[-,#1] (\x) edge[-] (\f) ; % 58 | } ; 59 | \foreach \y in {#4} { % 60 | \draw[->,#1] (\f) -- (\y) ; % 61 | } ; 62 | } ; 63 | } 64 | 65 | % \edge [options] {inputs} {outputs} 66 | \newcommand{\edge}[3][]{ % 67 | % Connect all nodes #2 to all nodes #3. 68 | \foreach \x in {#2} { % 69 | \foreach \y in {#3} { % 70 | \draw[->,#1] (\x) -- (\y) ;% 71 | } ; 72 | } ; 73 | } 74 | 75 | % \factor [options] {name} {caption} {inputs} {outputs} 76 | \newcommand{\factor}[5][]{ % 77 | % Draw the factor node. Use alias to allow empty names. 78 | \node[factor, label={[name=#2-caption]#3}, name=#2, #1, 79 | alias=#2-alias] {} ; % 80 | % Connect all inputs to outputs via this factor 81 | \factoredge {#4} {#2-alias} {#5} ; % 82 | } 83 | 84 | % \plate [options] {name} {fitlist} {caption} 85 | \newcommand{\plate}[4][]{ % 86 | \node[wrap=#3] (#2-wrap) {}; % 87 | \node[plate caption=#2-wrap] (#2-caption) {#4}; % 88 | \node[plate=(#2-wrap)(#2-caption), #1] (#2) {}; % 89 | } 90 | 91 | % \gate [options] {name} {fitlist} {inputs} 92 | \newcommand{\gate}[4][]{ % 93 | \node[gate=#3, name=#2, #1, alias=#2-alias] {}; % 94 | \foreach \x in {#4} { % 95 | \draw [-*,thick] (\x) -- (#2-alias); % 96 | } ;% 97 | } 98 | 99 | % \vgate {name} {fitlist-left} {caption-left} {fitlist-right} 100 | % {caption-right} {inputs} 101 | \newcommand{\vgate}[6]{ % 102 | % Wrap the left and right parts 103 | \node[wrap=#2] (#1-left) {}; % 104 | \node[wrap=#4] (#1-right) {}; % 105 | % Draw the gate 106 | \node[gate=(#1-left)(#1-right)] (#1) {}; % 107 | % Add captions 108 | \node[caption, below left=of #1.north ] (#1-left-caption) 109 | {#3}; % 110 | \node[caption, below right=of #1.north ] (#1-right-caption) 111 | {#5}; % 112 | % Draw middle separation 113 | \draw [-, dashed] (#1.north) -- (#1.south); % 114 | % Draw inputs 115 | \foreach \x in {#6} { % 116 | \draw [-*,thick] (\x) -- (#1); % 117 | } ;% 118 | } 119 | 120 | % \hgate {name} {fitlist-top} {caption-top} {fitlist-bottom} 121 | % {caption-bottom} {inputs} 122 | \newcommand{\hgate}[6]{ % 123 | % Wrap the left and right parts 124 | \node[wrap=#2] (#1-top) {}; % 125 | \node[wrap=#4] (#1-bottom) {}; % 126 | % Draw the gate 127 | \node[gate=(#1-top)(#1-bottom)] (#1) {}; % 128 | % Add captions 129 | \node[caption, above right=of #1.west ] (#1-top-caption) 130 | {#3}; % 131 | \node[caption, below right=of #1.west ] (#1-bottom-caption) 132 | {#5}; % 133 | % Draw middle separation 134 | \draw [-, dashed] (#1.west) -- (#1.east); % 135 | % Draw inputs 136 | \foreach \x in {#6} { % 137 | \draw [-*,thick] (\x) -- (#1); % 138 | } ;% 139 | } 140 | 141 | -------------------------------------------------------------------------------- /network-topic-models/network-topic-models.cabal: -------------------------------------------------------------------------------- 1 | -- Initial bayes-stack-topic-models.cabal generated by cabal init. For 2 | -- further documentation, see http://haskell.org/cabal/users-guide/ 3 | 4 | name: network-topic-models 5 | version: 0.2.0.1 6 | synopsis: A few network topic model implementations for bayes-stack 7 | description: Implementations of a few network topic models build upon bayes-stack. 8 | The package includes Latent Dirichlet Allocation 9 | (LDA), the shared taste model, and the citation 10 | influence model. 11 | homepage: https://github.com/bgamari/bayes-stack 12 | license: BSD3 13 | license-file: LICENSE 14 | author: Ben Gamari 15 | maintainer: bgamari.foss@gmail.com 16 | copyright: Copyright (c) 2012 Ben Gamari 17 | category: Math 18 | build-type: Simple 19 | cabal-version: >=1.8 20 | 21 | source-repository head 22 | type: git 23 | location: https://github.com/bgamari/bayes-stack.git 24 | 25 | executable bayes-stack-lda 26 | main-is: RunLDA.hs 27 | ghc-options: -threaded 28 | build-depends: base >=4.6 && <4.8, 29 | stm >= 2.4 && < 2.5, 30 | containers >= 0.5 && < 0.6, 31 | deepseq >= 1.3 && < 2.0, 32 | bytestring >= 0.10 && < 0.11, 33 | binary >= 0.7 && < 0.8, 34 | vector >=0.9 && <0.11, 35 | transformers >= 0.3 && < 0.4, 36 | filepath >=1.3 && < 1.4, 37 | directory, 38 | text >= 0.11 && < 0.12, 39 | optparse-applicative >=0.5, 40 | statistics ==0.10.*, 41 | log-domain >= 0.6, 42 | mwc-random >= 0.12 && < 0.13, 43 | random-fu ==0.2.*, 44 | bayes-stack >= 0.2 && < 0.3 45 | 46 | executable bayes-stack-lda-rel 47 | main-is: RunLDARelevance.hs 48 | ghc-options: -threaded 49 | build-depends: base >=4.6 && <4.8, 50 | stm >= 2.4 && < 2.5, 51 | containers >= 0.5 && < 0.6, 52 | deepseq >= 1.3 && < 2.0, 53 | bytestring >= 0.10 && < 0.11, 54 | binary >= 0.7 && < 0.8, 55 | vector >=0.9 && <0.11, 56 | transformers >= 0.3 && < 0.4, 57 | filepath >=1.3 && < 1.4, 58 | directory, 59 | text >= 0.11 && < 0.12, 60 | attoparsec >= 0.10 && < 1.0, 61 | optparse-applicative >=0.5, 62 | statistics ==0.10.*, 63 | log-domain >= 0.6, 64 | mwc-random >= 0.12 && < 0.13, 65 | random-fu ==0.2.*, 66 | bayes-stack >= 0.2 && < 0.3 67 | 68 | executable bayes-stack-topic-language-model 69 | main-is: TopicLanguageModel.hs 70 | ghc-options: -threaded 71 | build-depends: base >=4.6 && <4.8, 72 | stm >= 2.4 && < 2.5, 73 | containers >= 0.5 && < 0.6, 74 | deepseq >= 1.3 && < 2.0, 75 | bytestring >= 0.10 && < 0.11, 76 | binary >= 0.7 && < 0.8, 77 | vector >=0.9 && <0.11, 78 | transformers >= 0.3 && < 0.4, 79 | filepath >=1.3 && < 1.4, 80 | directory, 81 | text >= 0.11 && < 0.12, 82 | attoparsec >= 0.10 && < 1.0, 83 | optparse-applicative >=0.5, 84 | statistics ==0.10.*, 85 | log-domain >= 0.6, 86 | mwc-random >= 0.12 && < 0.13, 87 | random-fu ==0.2.*, 88 | bayes-stack >= 0.2 && < 0.3 89 | 90 | executable bayes-stack-st 91 | main-is: RunST.hs 92 | ghc-options: -threaded 93 | build-depends: base >=4.6 && <4.8, 94 | stm >= 2.4 && < 2.5, 95 | containers >= 0.5 && < 0.6, 96 | deepseq >= 1.3 && < 2.0, 97 | bytestring >= 0.10 && < 0.11, 98 | binary >= 0.7 && < 0.8, 99 | vector >=0.9 && <0.11, 100 | transformers >= 0.3 && < 0.4, 101 | filepath >=1.3 && < 1.4, 102 | directory, 103 | text >= 0.11 && < 0.12, 104 | optparse-applicative >=0.5, 105 | statistics ==0.10.*, 106 | log-domain >= 0.6, 107 | mwc-random >= 0.12 && < 0.13, 108 | random-fu ==0.2.*, 109 | bayes-stack >= 0.2 && < 0.3 110 | 111 | executable bayes-stack-ci 112 | main-is: RunCI.hs 113 | ghc-options: -threaded 114 | build-depends: base >=4.6 && <4.8, 115 | stm >= 2.4 && < 2.5, 116 | containers >= 0.5 && < 0.6, 117 | deepseq >= 1.3 && < 2.0, 118 | bytestring >= 0.10 && < 0.11, 119 | binary >= 0.7 && < 0.8, 120 | vector >=0.9 && <0.11, 121 | transformers >= 0.3 && < 0.4, 122 | filepath >=1.3 && < 1.4, 123 | directory, 124 | text >= 0.11 && < 0.12, 125 | optparse-applicative >=0.5, 126 | statistics ==0.10.*, 127 | log-domain >= 0.6, 128 | mwc-random >= 0.12 && < 0.13, 129 | random-fu ==0.2.*, 130 | bayes-stack >= 0.2 && < 0.3 131 | 132 | executable bayes-stack-cint 133 | main-is: RunCINT.hs 134 | ghc-options: -threaded 135 | build-depends: base >=4.6 && <4.8, 136 | stm >= 2.4 && < 2.5, 137 | containers >= 0.5 && < 0.6, 138 | deepseq >= 1.3 && < 2.0, 139 | bytestring >= 0.10 && < 0.11, 140 | binary >= 0.7 && < 0.8, 141 | vector >=0.9 && <0.11, 142 | transformers >= 0.3 && < 0.4, 143 | filepath >=1.3 && < 1.4, 144 | directory, 145 | text >= 0.11 && < 0.12, 146 | optparse-applicative >=0.5, 147 | statistics ==0.10.*, 148 | log-domain >= 0.6, 149 | mwc-random >= 0.12 && < 0.13, 150 | random-fu ==0.2.*, 151 | bayes-stack >= 0.2 && < 0.3, 152 | lens 153 | 154 | executable bayes-stack-dump-lda 155 | main-is: DumpLDA.hs 156 | build-depends: base >=4.6 && <4.8, 157 | stm >= 2.4 && < 2.5, 158 | containers >= 0.5 && < 0.6, 159 | deepseq >= 1.3 && < 2.0, 160 | bytestring >= 0.10 && < 0.11, 161 | binary >= 0.7 && < 0.8, 162 | vector >=0.9 && <0.11, 163 | transformers >= 0.3 && < 0.4, 164 | filepath >=1.3 && < 1.4, 165 | directory, 166 | text >= 0.11 && < 0.12, 167 | optparse-applicative >=0.5, 168 | statistics ==0.10.*, 169 | log-domain >= 0.6, 170 | mwc-random >= 0.12 && < 0.13, 171 | random-fu ==0.2.*, 172 | bayes-stack >= 0.2 && < 0.3 173 | 174 | executable bayes-stack-dump-st 175 | main-is: DumpST.hs 176 | build-depends: base >=4.6 && <4.8, 177 | stm >= 2.4 && < 2.5, 178 | containers >= 0.5 && < 0.6, 179 | deepseq >= 1.3 && < 2.0, 180 | bytestring >= 0.10 && < 0.11, 181 | binary >= 0.7 && < 0.8, 182 | vector >=0.9 && <0.11, 183 | transformers >= 0.3 && < 0.4, 184 | filepath >=1.3 && < 1.4, 185 | directory, 186 | text >= 0.11 && < 0.12, 187 | optparse-applicative >=0.5, 188 | statistics ==0.10.*, 189 | log-domain >= 0.6, 190 | mwc-random >= 0.12 && < 0.13, 191 | random-fu ==0.2.*, 192 | bayes-stack >= 0.2 && < 0.3 193 | 194 | executable bayes-stack-dump-ci 195 | main-is: DumpCI.hs 196 | build-depends: base >=4.6 && <4.8, 197 | stm >= 2.4 && < 2.5, 198 | containers >= 0.5 && < 0.6, 199 | deepseq >= 1.3 && < 2.0, 200 | bytestring >= 0.10 && < 0.11, 201 | binary >= 0.7 && < 0.8, 202 | vector >=0.9 && <0.11, 203 | transformers >= 0.3 && < 0.4, 204 | filepath >=1.3 && < 1.4, 205 | directory, 206 | text >= 0.11 && < 0.12, 207 | optparse-applicative >=0.5, 208 | statistics ==0.10.*, 209 | log-domain >= 0.6, 210 | mwc-random >= 0.12 && < 0.13, 211 | random-fu ==0.2.*, 212 | bayes-stack >= 0.2 && < 0.3 213 | 214 | executable bayes-stack-dump-cint 215 | main-is: DumpCINT.hs 216 | build-depends: base >=4.6 && <4.8, 217 | stm >= 2.4 && < 2.5, 218 | containers >= 0.5 && < 0.6, 219 | deepseq >= 1.3 && < 2.0, 220 | bytestring >= 0.10 && < 0.11, 221 | binary >= 0.7 && < 0.8, 222 | vector >=0.9 && <0.11, 223 | transformers >= 0.3 && < 0.4, 224 | filepath >=1.3 && < 1.4, 225 | directory, 226 | text >= 0.11 && < 0.12, 227 | optparse-applicative >=0.5, 228 | statistics ==0.10.*, 229 | log-domain >= 0.6, 230 | mwc-random >= 0.12 && < 0.13, 231 | random-fu ==0.2.*, 232 | bayes-stack >= 0.2 && < 0.3, 233 | lens 234 | 235 | executable bayes-stack-dump-lda-rel 236 | main-is: DumpLDARelevance.hs 237 | build-depends: base >=4.6 && <4.8, 238 | stm >= 2.4 && < 2.5, 239 | containers >= 0.5 && < 0.6, 240 | deepseq >= 1.3 && < 2.0, 241 | bytestring >= 0.10 && < 0.11, 242 | binary >= 0.7 && < 0.8, 243 | vector >=0.9 && <0.11, 244 | transformers >= 0.3 && < 0.4, 245 | filepath >=1.3 && < 1.4, 246 | directory, 247 | text >= 0.11 && < 0.12, 248 | optparse-applicative >=0.5, 249 | statistics ==0.10.*, 250 | log-domain >= 0.6, 251 | mwc-random >= 0.12 && < 0.13, 252 | random-fu ==0.2.*, 253 | bayes-stack >= 0.2 && < 0.3 254 | 255 | benchmark lda-benchmark 256 | type: exitcode-stdio-1.0 257 | main-is: Benchmark.hs 258 | Ghc-Options: -H512M -A512M -O2 259 | build-depends: base >=4.6 && <4.8, 260 | vector >=0.9 && <0.11, 261 | statistics ==0.10.*, 262 | bimap ==0.2.*, 263 | containers ==0.5.*, 264 | transformers ==0.3.*, 265 | bayes-stack ==0.2.*, 266 | text ==0.11.*, 267 | random-fu ==0.2.*, 268 | mwc-random ==0.12.*, 269 | log-domain >= 0.6, 270 | bytestring ==0.10.*, 271 | binary >=0.7 && < 0.8, 272 | deepseq ==1.3.*, 273 | directory, 274 | criterion 275 | --------------------------------------------------------------------------------