├── data
│   ├── asap.rda
│   └── thesis.rda
├── vignettes
│   ├── asap.pdf
│   ├── PRISMA.pdf
│   ├── PRISMA.bib
│   └── PRISMA.Rnw
├── inst
│   └── extdata
│       ├── asap.tar.gz
│       ├── README
│       └── sallyPreprocessing.py
├── .gitignore
├── README.md
├── man
│   ├── thesis.Rd
│   ├── asap.Rd
│   ├── prismaDuplicatePCA.Rd
│   ├── generics.Rd
│   ├── generics_dimension.Rd
│   ├── getDuplicateData.Rd
│   ├── PRISMA-package.Rd
│   ├── getMatrixFactorizationLabels.Rd
│   ├── corpusToPrisma.Rd
│   ├── prismaHclust.Rd
│   ├── estimateDimension.Rd
│   ├── generics_mf.Rd
│   ├── loadPrismaData.Rd
│   └── prismaNMF.Rd
├── NAMESPACE
├── DESCRIPTION
└── R
    ├── dimensionEstimation.R
    ├── matrixFactorization.R
    └── prisma.R

--------------------------------------------------------------------------------
/data/asap.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/data/asap.rda

--------------------------------------------------------------------------------
/data/thesis.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/data/thesis.rda

--------------------------------------------------------------------------------
/vignettes/asap.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/vignettes/asap.pdf

--------------------------------------------------------------------------------
/vignettes/PRISMA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/vignettes/PRISMA.pdf

--------------------------------------------------------------------------------
/inst/extdata/asap.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/inst/extdata/asap.tar.gz

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | 
4 | # Example code in package build process
5 | *-Ex.R
6 | *~

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PRISMA
2 | ======
3 | 
4 | Protocol Inspection and State Machine Analysis
5 | 
6 | The package PRISMA is hosted on CRAN, so
7 | 
8 |     install.packages("PRISMA")
9 |     library(PRISMA)
10 |     example(PRISMA)
11 |     vignette("PRISMA")
12 | 
13 | will give you a first impression.
14 | 
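To try it on your own sally output, where `mydata` is a hypothetical file
stem (i.e. a `mydata.sally`/`mydata.fsally` pair produced as described in
the package vignette):

    d = loadPrismaData("mydata")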
--------------------------------------------------------------------------------
/man/thesis.Rd:
--------------------------------------------------------------------------------
1 | \name{thesis}
2 | \docType{data}
3 | \alias{thesis}
4 | \title{The Thesis Data Set}
5 | \description{
6 | The 15 sections of a thesis (see references) as a tm-corpus.
7 | }
8 | \usage{thesis}
9 | \format{A tm-corpus.}
10 | \references{
11 | Tammo Krueger. \emph{Probabilistic Methods for Network Security. From Analysis to Response.} PhD thesis,
12 | TU Berlin, 2013. \url{http://opus.kobv.de/tuberlin/volltexte/2013/3881/}
13 | }
14 | \author{
15 | Tammo Krueger
16 | }
17 | \keyword{datasets}

--------------------------------------------------------------------------------
/man/asap.Rd:
--------------------------------------------------------------------------------
1 | \name{asap}
2 | \docType{data}
3 | \alias{asap}
4 | \title{The ASAP Data Set}
5 | \description{
6 | Toy data set to show the capabilities of the PRISMA package.
7 | }
8 | \usage{asap}
9 | \format{A prisma object.}
10 | \references{
11 | Krueger, T., Kraemer, N., Rieck, K. (2011)
12 | ASAP: Automatic Semantics-Aware Analysis of Network Payloads
13 | \emph{Privacy and Security Issues in Data Mining and Machine Learning - International ECML/PKDD Workshop. Lecture Notes in Computer Science 6549}, Springer. 50 - 63
14 | }
15 | \author{
16 | Tammo Krueger
17 | }
18 | \keyword{datasets}

--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | S3method(plot, prisma)
2 | S3method(print, prisma)
3 | S3method(plot, prismaDimension)
4 | S3method(print, prismaDimension)
5 | S3method(plot, prismaMF)
6 | export(prismaHclust, prismaDuplicatePCA, prismaNMF, loadPrismaData, getDuplicateData, corpusToPrisma, estimateDimension, plot.prisma, print.prisma, plot.prismaDimension, print.prismaDimension, plot.prismaMF, getMatrixFactorizationLabels)
7 | import(Matrix, gplots, ggplot2)
8 | importFrom("grDevices", "gray")
9 | importFrom("methods", "new")
10 | importFrom("stats", "as.dendrogram", "cutree", "dist", "hclust", "p.adjust", "pnorm", "prcomp", "qnorm", "rnorm", "var")
11 | importFrom("utils", "packageVersion", "read.table")

--------------------------------------------------------------------------------
/inst/extdata/README:
--------------------------------------------------------------------------------
1 | This folder contains an example file to show the preprocessing step
2 | with the sally toolkit (see http://www.mlsec.org/sally/). Before
3 | executing the examples, please extract asap.tar.gz to find all data
4 | necessary to understand the processing chain from the raw data
5 | (asap.raw) to the sally file (asap.sally) and the optimized file
6 | (asap.fsally). The asap.sally file can be produced as follows:
7 | 
8 | sally -c asap.cfg asap.raw asap.sally
9 | 
10 | This call generates asap.sally from the raw data found in asap.raw. To
11 | speed up the loading of the data in R, one should apply the
12 | sallyPreprocessing.py Python script as follows:
13 | 
14 | python sallyPreprocessing.py asap.sally asap.fsally
15 | 
16 | Now the data is ready to be efficiently loaded and processed in R.

--------------------------------------------------------------------------------
/man/prismaDuplicatePCA.Rd:
--------------------------------------------------------------------------------
1 | \name{prismaDuplicatePCA}
2 | \alias{prismaDuplicatePCA}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Matrix Factorization Based on Replicate-Aware PCA
6 | }
7 | \description{
8 | Efficient implementation of a replicate-aware principal component
9 | analysis (PCA).
10 | }
11 | \usage{
12 | prismaDuplicatePCA(prismaData)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 | \item{prismaData}{
17 | PRISMA data for which a PCA should be calculated
18 | }
19 | }
20 | \value{
21 | \item{prismaPCA}{Matrix factorization object \eqn{A = B C}, in which the
22 | factors are calculated by a replicate-aware PCA}
23 | }
24 | \author{
25 | Tammo Krueger
26 | }
27 | \examples{
28 | # please see the vignette for examples
29 | }
30 | 
--------------------------------------------------------------------------------
/man/generics.Rd:
--------------------------------------------------------------------------------
1 | \name{plot.prisma}
2 | \alias{plot.prisma}
3 | \alias{print.prisma}
4 | %- Also NEED an '\alias' for EACH other topic documented here.
5 | \title{
6 | Generics For PRISMA Objects
7 | }
8 | \description{
9 | Print and plot generics for PRISMA objects.
10 | }
11 | \usage{
12 | \method{print}{prisma}(x, ...)
13 | \method{plot}{prisma}(x, ...)
14 | }
15 | %- maybe also 'usage' for other objects documented here.
16 | \arguments{
17 | \item{x}{
18 | PRISMA data loaded via \code{\link{loadPrismaData}}
19 | }
20 | 
21 | \item{...}{
22 | not used
23 | }
24 | }
25 | \author{
26 | Tammo Krueger
27 | }
28 | \seealso{
29 | \code{\link{estimateDimension}}, \code{\link{prismaHclust}}, \code{\link{prismaDuplicatePCA}}, \code{\link{prismaNMF}}
30 | }
31 | \examples{
32 | data(asap)
33 | print(asap)
34 | plot(asap)
35 | 
36 | }

--------------------------------------------------------------------------------
/man/generics_dimension.Rd:
--------------------------------------------------------------------------------
1 | \name{plot.prismaDimension}
2 | \alias{plot.prismaDimension}
3 | \alias{print.prismaDimension}
4 | %- Also NEED an '\alias' for EACH other topic documented here.
5 | \title{
6 | Generics For PRISMA Dimension Objects
7 | }
8 | \description{
9 | Print and plot generics for PRISMA dimension objects.
10 | }
11 | \usage{
12 | \method{print}{prismaDimension}(x, ...)
13 | \method{plot}{prismaDimension}(x, ...)
14 | }
15 | %- maybe also 'usage' for other objects documented here.
16 | \arguments{
17 | \item{x}{
18 | PRISMA dimension object generated via \code{\link{estimateDimension}}
19 | }
20 | \item{...}{
21 | not used
22 | }
23 | }
24 | \author{
25 | Tammo Krueger
26 | }
27 | \seealso{
28 | \code{\link{estimateDimension}}, \code{\link{prismaHclust}}, \code{\link{prismaDuplicatePCA}}, \code{\link{prismaNMF}}
29 | }
30 | \examples{
31 | # please see the vignette for examples
32 | }

--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: PRISMA
2 | Type: Package
3 | Title: Protocol Inspection and State Machine Analysis
4 | Version: 0.2-7
5 | Date: 2018-05-26
6 | Depends:
7 |     Matrix,
8 |     gplots,
9 |     methods,
10 |     ggplot2
11 | Suggests:
12 |     tm (>= 0.6)
13 | Author: Tammo Krueger, Nicole Kraemer
14 | Maintainer: Tammo Krueger
15 | Description: Loads and processes huge text
16 |     corpora processed with the sally toolbox (<http://www.mlsec.org/sally/>).
17 |     sally acts as a very fast preprocessor which splits the text files into
18 |     tokens or n-grams. These output files can then be read with the PRISMA
19 |     package, which applies testing-based token selection and offers
20 |     replicate-aware, highly tuned non-negative matrix factorization and
21 |     principal component analysis implementations which allow the processing of
22 |     very big data sets even on desktop machines.
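URL: https://github.com/tammok/PRISMA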
23 | License: GPL (>=2.0)
24 | 

--------------------------------------------------------------------------------
/inst/extdata/sallyPreprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import sys
3 | from optparse import OptionParser
4 | 
5 | usage = "usage: %prog in.sally out.fsally"
6 | parser = OptionParser(usage)
7 | 
8 | (options, args) = parser.parse_args()
9 | 
10 | if len(args) != 2:
11 |     parser.print_help()
12 |     sys.exit()
13 | 
14 | # use the positional arguments parsed above; open() keeps the script
15 | # runnable under Python 2 and 3 (the old file() built-in is Python 2 only)
16 | sallyIn = open(args[0])
17 | sallyOut = open(args[1], "w")
18 | # skip first line
19 | sallyIn.readline()
20 | allNgrams = {}
21 | count = 0
22 | for l in sallyIn:
23 |     count += 1
24 |     if count % 1000 == 0:
25 |         print(count)
26 |     info = l.split(" ")
27 |     if info[0] == "":
28 |         curNgrams = []
29 |     else:
30 |         curNgrams = [ngramInfo.split(":")[1] for ngramInfo in info[0].split(",")]
31 |     # the dict acts as a set collecting every n-gram seen so far
32 |     allNgrams.update(dict.fromkeys(curNgrams))
33 |     sallyOut.write("%s\n" % " ".join(curNgrams))
34 | sallyOut.write("%s\n" % " ".join(allNgrams.keys()))
35 | sallyOut.close()
36 | sallyIn.close()

--------------------------------------------------------------------------------
/man/getDuplicateData.Rd:
--------------------------------------------------------------------------------
1 | \name{getDuplicateData}
2 | \alias{getDuplicateData}
3 | \title{
4 | Restores Data with Duplicates
5 | }
6 | \description{
7 | The \code{\link{loadPrismaData}} function triggers feature selection and
8 | data combination steps which subsequently remove duplicate entries for
9 | an efficient representation of the data.
10 | \code{\link{getDuplicateData}} rebuilds the data matrix with an
11 | explicit representation of all duplicate entries.
12 | }
13 | \usage{
14 | getDuplicateData(prismaData)
15 | }
16 | %- maybe also 'usage' for other objects documented here.
17 | \arguments{
18 | \item{prismaData}{
19 | prisma data loaded via \code{\link{loadPrismaData}}
20 | }
21 | }
22 | \value{
23 | \item{dataWithDuplicates}{Data matrix containing explicit copies of all duplicates.}
24 | }
25 | \author{
26 | Tammo Krueger
27 | }
28 | 
29 | \examples{
30 | data(asap)
31 | dataWithDuplicates = getDuplicateData(asap)
32 | }

--------------------------------------------------------------------------------
/man/PRISMA-package.Rd:
--------------------------------------------------------------------------------
1 | \name{PRISMA-package}
2 | \alias{PRISMA-package}
3 | \alias{PRISMA}
4 | \docType{package}
5 | \title{
6 | \packageTitle{PRISMA}
7 | }
8 | \description{
9 | \packageDescription{PRISMA}
10 | }
11 | \details{
12 | \packageDESCRIPTION{PRISMA}
13 | \packageIndices{PRISMA}
14 | }
15 | \author{
16 | \packageAuthor{PRISMA}
17 | 
18 | Maintainer: \packageMaintainer{PRISMA}
19 | }
20 | \references{
21 | Krueger, T., Gascon, H., Kraemer, N., Rieck, K. (2012)
22 | Learning Stateful Models for Network Honeypots
23 | \emph{5th ACM Workshop on Artificial Intelligence and Security (AISEC 2012)}, accepted
24 | 
25 | Krueger, T., Kraemer, N., Rieck, K. (2011)
26 | ASAP: Automatic Semantics-Aware Analysis of Network Payloads
27 | \emph{Privacy and Security Issues in Data Mining and Machine Learning - International ECML/PKDD Workshop. Lecture Notes in Computer Science 6549}, Springer. 50 - 63
28 | }
29 | \keyword{ package }
30 | \examples{
31 | # please see the vignette for examples
32 | }

--------------------------------------------------------------------------------
/man/getMatrixFactorizationLabels.Rd:
--------------------------------------------------------------------------------
1 | \name{getMatrixFactorizationLabels}
2 | \alias{getMatrixFactorizationLabels}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Convert Coordinates of a Matrix Factorization to Labels
6 | }
7 | \description{
8 | Given a matrix factorization object \eqn{A = B C}, this function returns for each
9 | document the index of the inner dimension which has the maximal
10 | coordinate. Thus, it converts the fuzzy clustering found in the
11 | columns of the \eqn{C} matrix into a hard clustering by returning the
12 | position with the maximal coordinate value.
13 | }
14 | \usage{
15 | getMatrixFactorizationLabels(prismaMF)
16 | }
17 | %- maybe also 'usage' for other objects documented here.
18 | \arguments{
19 | \item{prismaMF}{
20 | a matrix factorization object.
21 | }
22 | }
23 | \value{
24 | \item{labels}{vector containing the label assignment for each document.}
25 | }
26 | \author{
27 | Tammo Krueger
28 | }
29 | \seealso{
30 | \code{\link{prismaNMF}}
31 | }
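\examples{
# a minimal sketch on the bundled asap data; the inner dimension 8 and the
# NMF call follow the vignette (runtime shortened here, hence \dontrun):
\dontrun{
data(asap)
asapNMF = prismaNMF(asap, 8, time = 10)
table(getMatrixFactorizationLabels(asapNMF))
}
}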
--------------------------------------------------------------------------------
/man/corpusToPrisma.Rd:
--------------------------------------------------------------------------------
1 | \name{corpusToPrisma}
2 | \alias{corpusToPrisma}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Convert a tm Corpus to PRISMA
6 | }
7 | \description{
8 | Converts a tm corpus object to a PRISMA object.
9 | }
10 | \usage{
11 | corpusToPrisma(corpus, alpha = 0.05, skipFeatureCorrelation = FALSE)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{corpus}{
16 | a tm corpus
17 | }
18 | \item{alpha}{
19 | significance level for the feature tests. If NULL, all features are kept.
20 | }
21 | \item{skipFeatureCorrelation}{
22 | whether the grouping of features based on correlation analysis should be skipped.
23 | }
24 | }
25 | \value{
26 | \item{prismaData}{data object representing the tokenized documents as
27 | features x samples matrix.}
28 | }
29 | \author{
30 | Tammo Krueger
31 | }
32 | 
33 | \examples{
34 | if (require("tm") && packageVersion("tm") >= '0.6') {
35 |     data(thesis)
36 |     thesis
37 |     thesis = corpusToPrisma(thesis, NULL, TRUE)
38 |     thesis
39 | }
40 | }

--------------------------------------------------------------------------------
/man/prismaHclust.Rd:
--------------------------------------------------------------------------------
1 | \name{prismaHclust}
2 | \alias{prismaHclust}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Matrix Factorization Based on Hierarchical Clustering
6 | }
7 | \description{
8 | A matrix factorization \eqn{A = B C} based on the results of hclust is constructed,
9 | which holds the mean feature values of each cluster in the matrix \eqn{B}
10 | and, for each data point, the indicator of its cluster in the matrix \eqn{C}
11 | (i.e. each data point is represented by its assigned cluster center).
12 | }
13 | \usage{
14 | prismaHclust(prismaData, ncomp, method = "single")
15 | }
16 | %- maybe also 'usage' for other objects documented here.
17 | \arguments{
18 | \item{prismaData}{
19 | PRISMA data for which a clustering should be calculated.
20 | }
21 | \item{ncomp}{
22 | the number of components that should be extracted.
23 | }
24 | \item{method}{
25 | the method used for clustering.
26 | }
27 | }
28 | \value{
29 | \item{prismaHclust}{Matrix factorization object containing \eqn{B} and \eqn{C}
30 | resulting from the hierarchical clustering of the data.}
31 | }
32 | \author{
33 | Tammo Krueger
34 | }
35 | 
36 | \seealso{
37 | \code{\link{hclust}}
38 | }
39 | \examples{
40 | # please see the vignette for examples
41 | }

--------------------------------------------------------------------------------
/man/estimateDimension.Rd:
--------------------------------------------------------------------------------
1 | \name{estimateDimension}
2 | \alias{estimateDimension}
3 | \title{
4 | Estimate Inner Dimension
5 | }
6 | \description{
7 | Matrix factorization methods compress the original data matrix \eqn{A \in
8 | R^{f,N}} with \eqn{f} features and \eqn{N} samples into two parts,
9 | namely \eqn{A = B C} with \eqn{B \in R^{f,k}, C \in R^{k,
10 | N}}. The function estimateDimension estimates \eqn{k} based on a noise
11 | model estimated from a scrambled version of the original data matrix.
12 | }
13 | \usage{
14 | estimateDimension(prismaData, alpha = 0.05, nScrambleSamples = NULL)
15 | }
16 | \arguments{
17 | \item{prismaData}{
18 | A prismaData object loaded via loadPrismaData
19 | }
20 | \item{alpha}{
21 | Error probability for the confidence intervals
22 | }
23 | \item{nScrambleSamples}{
24 | The number of scrambled samples that should be used to estimate the
25 | noise model. NULL means use the complete data set.
26 | }
27 | }
28 | \value{
29 | \item{estDim}{prismaDimension object that can be printed and plotted.}
30 | }
31 | \references{
32 | R. Schmidt. Multiple emitter location and signal parameter estimation.
33 | \emph{IEEE Transactions on Antennas and Propagation}, 34(3):276 -- 280, 1986.
34 | }
35 | \author{
36 | Tammo Krueger
37 | }
38 | 
39 | %% ~Make other sections like Warning with \section{Warning }{....} ~
40 | 
41 | \examples{
42 | # please see the vignette for examples
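# a short runnable sketch on the bundled asap data (as run in the vignette):
data(asap)
asapDim = estimateDimension(asap)
print(asapDim)
plot(asapDim)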
43 | }

--------------------------------------------------------------------------------
/man/generics_mf.Rd:
--------------------------------------------------------------------------------
1 | \name{plot.prismaMF}
2 | \alias{plot.prismaMF}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Generics For PRISMA Matrix Factorization Objects
6 | }
7 | \description{
8 | Plot generic for PRISMA matrix factorization objects.
9 | }
10 | \usage{
11 | \method{plot}{prismaMF}(x, nLines = NULL, baseIndex = NULL, sampleIndex = NULL,
12 | minValue = NULL, noRowClustering = FALSE, noColClustering = FALSE, type
13 | = c("base", "coordinates"), ...)
14 | }
15 | %- maybe also 'usage' for other objects documented here.
16 | \arguments{
17 | 
18 | \item{x}{
19 | PRISMA matrix factorization object
20 | }
21 | \item{nLines}{
22 | number of lines that should be plotted
23 | }
24 | \item{baseIndex}{
25 | which bases should be plotted
26 | }
27 | \item{sampleIndex}{
28 | which samples should be plotted
29 | }
30 | \item{minValue}{
31 | cut-off value, i.e., every value smaller than \code{minValue} won't be shown
32 | }
33 | \item{noRowClustering}{
34 | don't cluster the rows
35 | }
36 | \item{noColClustering}{
37 | don't cluster the columns
38 | }
39 | \item{type}{
40 | show the base (\code{type = "base"}, i.e. the \eqn{B} matrix) or
41 | the coordinates (\code{type = "coordinates"}, i.e. the \eqn{C} matrix).
42 | }
43 | \item{...}{
44 | not used
45 | }
46 | }
47 | \author{
48 | Tammo Krueger
49 | }
50 | \seealso{
51 | \code{\link{estimateDimension}}, \code{\link{prismaHclust}}, \code{\link{prismaDuplicatePCA}}, \code{\link{prismaNMF}}
52 | }
53 | \examples{
54 | # please see the vignette for examples
55 | }

--------------------------------------------------------------------------------
/vignettes/PRISMA.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{krueger12,
2 |   author = {Krueger, Tammo and Gascon, Hugo and Kr\"{a}mer, Nicole and Rieck, Konrad},
3 |   title = {Learning stateful models for network honeypots},
4 |   booktitle = {Proceedings of the 5th ACM workshop on Security and artificial intelligence},
5 |   series = {AISec '12},
6 |   year = {2012},
7 |   isbn = {978-1-4503-1664-4},
8 |   pages = {37--48},
9 |   numpages = {12},
10 |   note = {\url{http://doi.acm.org/10.1145/2381896.2381904}},
11 |   doi = {10.1145/2381896.2381904},
12 |   publisher = {ACM},
13 | }
14 | 
15 | @inproceedings{krueger10,
16 |   year={2011},
17 |   isbn={978-3-642-19895-3},
18 |   booktitle={Privacy and Security Issues in Data Mining and Machine Learning},
19 |   volume={6549},
20 |   series={Lecture Notes in Computer Science},
21 |   editor={Dimitrakakis, Christos and Gkoulalas-Divanis, Aris and Mitrokotsa, Aikaterini and Verykios, Vassilios S. and Saygin, Y\"{u}cel},
22 |   doi={10.1007/978-3-642-19896-0_5},
23 |   title={{ASAP}: Automatic Semantics-Aware Analysis of Network Payloads},
24 |   note = {\url{http://dx.doi.org/10.1007/978-3-642-19896-0_5}},
25 |   publisher={Springer Berlin Heidelberg},
26 |   author={Krueger, Tammo and Kr\"{a}mer, Nicole and Rieck, Konrad},
27 |   pages={50-63}
28 | }
29 | 
30 | @phdthesis{krueger2013,
31 |   title={Probabilistic Methods for Network Security. From Analysis to Response},
32 |   author={Krueger, Tammo},
33 |   year={2013},
34 |   school={TU Berlin},
35 |   note = {\url{http://opus.kobv.de/tuberlin/volltexte/2013/3881/}}
36 | }
37 | 
38 | @Article{feinerer08,
39 |   title = {Text Mining Infrastructure in {R}},
40 |   author = {Ingo Feinerer and Kurt Hornik and David Meyer},
41 |   year = 2008,
42 |   journal = {Journal of Statistical Software},
43 |   volume = 25,
44 |   number = 5,
45 |   pages = {1--54},
46 |   url = {http://www.jstatsoft.org/v25/i05/},
47 |   month = {March},
48 | }

--------------------------------------------------------------------------------
/man/loadPrismaData.Rd:
--------------------------------------------------------------------------------
1 | \name{loadPrismaData}
2 | \alias{loadPrismaData}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Load PRISMA Data Files
6 | }
7 | \description{
8 | Loads files generated by the sally tool (see
9 | \url{http://www.mlsec.org/sally/}) and represents the data as a binary
10 | token/ngrams x documents matrix. After loading, statistical tests are
11 | applied to find features which are neither volatile nor
12 | constant. Co-occurring features are grouped to further compactify the
13 | data. See \code{system.file("extdata","sallyPreprocessing.py",
14 | package="PRISMA")} for a Python script which generates the
15 | corresponding .fsally file from a .sally file and reduces the
16 | loading time of \code{\link{loadPrismaData}} considerably.
17 | }
18 | \usage{
19 | loadPrismaData(path, maxLines = -1, fastSally = TRUE,
20 | alpha = 0.05, skipFeatureCorrelation=FALSE)
21 | }
22 | %- maybe also 'usage' for other objects documented here.
23 | \arguments{
24 | \item{path}{
25 | path of the data file without the .sally extension. loadPrismaData loads
26 | path.sally or path.fsally depending on the fastSally switch.
27 | }
28 | \item{maxLines}{
29 | maximal number of lines to read from the data file. -1 means to read
30 | all lines.
31 | }
32 | \item{fastSally}{
33 | whether the fsally file should be used; this drastically decreases the loading time.
34 | }
35 | \item{alpha}{
36 | significance level for the feature tests. If NULL, all features are kept.
37 | }
38 | \item{skipFeatureCorrelation}{
39 | whether the grouping of features based on correlation analysis should be skipped.
40 | }
41 | }
42 | \value{
43 | \item{prismaData}{data object representing the tokenized documents as
44 | features x samples matrix.}
45 | }
46 | \references{
47 | See \url{http://www.mlsec.org/sally/} for the sally utility.
48 | }
49 | \author{
50 | Tammo Krueger
51 | }
52 | \examples{
53 | # please see the vignette for examples
54 | # please see system.file("extdata","asap.tar.gz", package="PRISMA") for
55 | # an example sally output
56 | }

--------------------------------------------------------------------------------
/man/prismaNMF.Rd:
--------------------------------------------------------------------------------
1 | \name{prismaNMF}
2 | \alias{prismaNMF}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Matrix Factorization Based on Replicate-Aware NMF
6 | }
7 | \description{
8 | Matrix factorization \eqn{A = B C} with strictly positive matrices \eqn{B, C}
9 | which minimize the reconstruction error \eqn{\|A - B C\|}. This
10 | replicate-aware version of the non-negative matrix factorization (NMF)
11 | is based on the alternating least squares
12 | approach and exploits the replicate information to speed up the calculation.
13 | }
14 | \usage{
15 | prismaNMF(prismaData, ncomp, time = 60, pca.init = TRUE, doNorm = TRUE, oldResult = NULL)
16 | }
17 | %- maybe also 'usage' for other objects documented here.
18 | \arguments{
19 | \item{prismaData}{
20 | PRISMA data for which an NMF should be calculated.
21 | }
22 | \item{ncomp}{
23 | either an \code{integer} or a \code{prismaDimension} object specifying
24 | the inner dimension of the matrix factorization.
25 | }
26 | \item{time}{
27 | seconds after which the calculation should end.
28 | }
29 | \item{pca.init}{
30 | should the \eqn{B} matrix be initialized by a PCA.
31 | }
32 | \item{doNorm}{
33 | should the \eqn{B} matrix be normalized (i.e. all columns have
34 | Euclidean length 1).
35 | }
36 | \item{oldResult}{
37 | re-use the results of a previous run, i.e. \eqn{B} and \eqn{C} are
38 | pre-initialized with the values of this previous matrix
39 | factorization object.
40 | }
41 | }
42 | \value{
43 | \item{prismaNMF}{Matrix factorization object containing the \eqn{B} and
44 | \eqn{C} matrix.}
45 | }
46 | \references{
47 | Krueger, T., Gascon, H., Kraemer, N., Rieck, K. (2012)
48 | Learning Stateful Models for Network Honeypots
49 | \emph{5th ACM Workshop on Artificial Intelligence and Security (AISEC 2012)}, accepted
50 | 
51 | R. Albright, J. Cox, D. Duling, A. Langville, and C. Meyer. (2006)
52 | Algorithms, initializations, and convergence for the nonnegative
53 | matrix factorization.
\emph{Technical Report 81706, North Carolina State University}
54 | }
55 | \author{
56 | Tammo Krueger
57 | }
58 | \examples{
59 | # please see the vignette for examples
60 | }

--------------------------------------------------------------------------------
/R/dimensionEstimation.R:
--------------------------------------------------------------------------------
1 | if (getRversion() >= "2.15.1") globalVariables(c("low", "up"))
2 | # public methods
3 | estimateDimension = function(prismaData, alpha=0.05, nScrambleSamples=NULL) {
4 |     N = length(prismaData$remapper)
5 |     pca = prismaDuplicatePCA(prismaData)
6 |     remapper = prismaData$remapper
7 |     if (!is.null(nScrambleSamples)) {
8 |         remapper = sample(remapper, nScrambleSamples)
9 |     }
10 |     spca = scramblePCA(scrambleFeature(prismaData$data[, remapper]))
11 |     nVal = min(c(length(pca$pca$sdev), length(spca$pca$sdev)))
12 |     # Bonferroni correction:
13 |     alpha = alpha / nVal
14 | 
15 |     calcConfidence = function(sdev) {
16 |         v = sdev^2
17 |         tau = sqrt(2/(N - 1))
18 |         z = qnorm(1-alpha/2)
19 |         d1 = sqrt(1 + tau * z)
20 |         d2 = sqrt(1 - tau * z)
21 |         conf = cbind(v, v / d1, v / d2)
22 |         return(conf)
23 |     }
24 |     cNorm = calcConfidence(pca$pca$sdev[1:nVal])
25 |     sNorm = calcConfidence(spca$pca$sdev[1:nVal])
26 |     data = data.frame(rbind(cbind(1:nVal, cNorm), cbind(1:nVal, sNorm)), rep(c("norm", "scramble"), c(nVal, nVal)), row.names=as.character(1:(2*nVal)))
27 |     colnames(data) = c("x", "var", "low", "up", "class")
28 | 
29 |     norm = data$low[data$class == "norm"]
30 |     scramble = data$up[data$class == "scramble"]
31 |     dim = 2 * (match(TRUE, norm <= scramble) - 1)
32 |     if (dim == 0) {
33 |         warning("Not enough data for reasonable dimension estimation. Please adjust $dim according to your fallback heuristic!")
34 |     }
35 |     ret = list(data=data, dim=dim, pca=pca)
36 |     class(ret) = "prismaDimension"
37 |     return(ret)
38 | }
39 | 
40 | print.prismaDimension = function(x, ...) {
41 |     cat("Estimated data dimension for positive matrix factorization via simulated noise level:", x$dim, "\n")
42 | }
43 | 
44 | plot.prismaDimension = function(x, ...) {
45 |     dimData=x
46 |     #require(ggplot2)
47 |     data = dimData$data
48 |     p = ggplot(data, aes(x=x, y=var, ymin=low, ymax=up, color=class))
49 |     p + geom_errorbar(width=2) + geom_line()
50 | }
51 | 
52 | # private methods
53 | 
54 | scramblePCA = function(mat) {
55 |     # old version without duplicate information!
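    # (a PCA of the feature-scrambled matrix yields the eigenvalue spectrum of
    #  pure noise, against which estimateDimension compares the real spectrum)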
56 |     pca = prcomp(t(mat), scale=FALSE, retx=FALSE)
57 |     B = pca$rotation
58 |     #C = t(pca$x)
59 |     ret = list(B=B, C=NULL, pca=pca)
60 |     return(ret)
61 | }
62 | 
63 | scrambleFeature = function(mat) {
64 |     #require(Matrix)
65 |     N = ncol(mat)
66 |     F = nrow(mat)
67 |     if (inherits(mat, "Matrix")) {
68 |         p = mat@p
69 |         newI = rep(0, length(mat@i))
70 |         # scramble the features of all data points
71 |         for (ind in 1:N) {
72 |             if (p[ind+1]-p[ind] > 0) {
73 |                 newI[(p[ind]+1):p[ind+1]] = sample(F, p[ind+1]-p[ind], replace=FALSE) - 1
74 |             }
75 |         }
76 |         ret = sparseMatrix(i=newI, p=p, x=mat@x, dims=c(F, N), dimnames=dimnames(mat), index1=FALSE)
77 |     }
78 |     else {
79 |         ret = mat
80 |         # scramble the features of all data points
81 |         for (ind in 1:N) {
82 |             ret[, ind] = ret[sample.int(F), ind]
83 |         }
84 |     }
85 |     return(ret)
86 | }
87 | 

--------------------------------------------------------------------------------
/vignettes/PRISMA.Rnw:
--------------------------------------------------------------------------------
1 | \documentclass[a4paper]{article}
2 | 
3 | \usepackage[margin=2.25cm]{geometry}
4 | \usepackage{xspace}
5 | %%\usepackage[round]{natbib}
6 | \usepackage[colorlinks=true,urlcolor=blue]{hyperref}
7 | 
8 | \newcommand{\code}[1]{\texttt{#1}}
9 | \newcommand{\pkg}[1]{{\it #1}}
10 | \newcommand{\Prisma}{\pkg{PRISMA}\xspace}
11 | \SweaveOpts{keep.source=TRUE, strip.white=all}
12 | %% \VignetteIndexEntry{Quick introduction}
13 | 
14 | <<echo=FALSE>>=
15 | if (!exists("PRISMA",.GlobalEnv)) library(PRISMA)
16 | @
17 | 
18 | \begin{document}
19 | \title{Introduction to the \Prisma package}
20 | \author{Tammo Krueger}
21 | \date{\today\\[1cm]
22 | \url{https://github.com/tammok/PRISMA}}
23 | \maketitle
24 | 
25 | \section*{Introduction}
26 | 
27 | This vignette gives you a first tour of the features of the \Prisma
28 | package. We give an overview of the application of the algorithms;
29 | the full story is available in the papers
30 | \cite{krueger12,krueger10}. If you use the \Prisma package in your
31 | research, please cite at least one of these references.
32 | 
33 | The \Prisma package essentially consists of three parts:
34 | \begin{enumerate}
35 | \item Efficiently reading the output of \code{sally}, an extremely fast n-gram
36 | processor available at \url{http://www.mlsec.org/sally/}
37 | \item Testing-based feature dimension reduction
38 | \item Optimized matrix factorization of the reduced data, exploiting
39 | the replicate structure of the data
40 | \end{enumerate}
41 | 
42 | For the theory behind these parts please consult
43 | \cite{krueger12,krueger10}. We start this walk-through with the
44 | reading of \code{sally} data, then show the inner structure of the
45 | resulting data object, on which the replicate-aware non-negative matrix
46 | factorization can be applied.
47 | 
48 | \section*{Loading the Data}
49 | This section serves as a reference for how to apply the processing
50 | chain to new data to get a usable \Prisma data set. The generated
51 | data set is already prepackaged inside the \Prisma package and can be
52 | loaded via \code{data(asap)}.
53 | 
54 | Before executing the examples, please extract asap.tar.gz located in
55 | the \code{extdata} path of the \Prisma package to find all data
56 | necessary to understand the processing chain from the raw data
57 | (asap.raw) to the sally file (asap.sally) and the optimized file
58 | (asap.fsally). The asap.sally file can be produced as follows:
59 | \begin{verbatim}
60 | sally -c asap.cfg asap.raw asap.sally
61 | \end{verbatim}
62 | 
63 | This call generates asap.sally from the raw data found in asap.raw. To
64 | speed up the loading of the data in R, one should apply the
65 | \code{sallyPreprocessing.py} Python script as follows:
66 | 
67 | \begin{verbatim}
68 | python sallyPreprocessing.py asap.sally asap.fsally
69 | \end{verbatim}
70 | 
71 | Now the data is ready to be efficiently loaded and processed in R via
72 | \code{loadPrismaData("asap")}, which also executes the feature
73 | dimension reduction step.
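The call sketched below spells out the defaults of this loading step
(see \code{?loadPrismaData} for the meaning of the individual parameters):
\begin{verbatim}
asap = loadPrismaData("asap", maxLines=-1, fastSally=TRUE,
                      alpha=0.05, skipFeatureCorrelation=FALSE)
\end{verbatim}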
74 | 
75 | \section*{The \Prisma Data Set}
76 | 
77 | As an example we use the prepackaged ASAP toy data set as described in \cite{krueger10}:
78 | <<>>=
79 | data(asap)
80 | asap
81 | @
82 | We see that the feature reduction step worked quite well. Let's have a
83 | look behind the scenes:
84 | <<>>=
85 | asap$data
86 | @
87 | This shows us the reduced form of the initial data matrix in a
88 | features $\times$ documents representation, i.e. this is a replicate-free
89 | version of it. We can see that the features partly consist of grouped
90 | tokens (for instance \code{admin.php par action} contains 3 tokens,
91 | which always co-occurred in the data) and how these tokens are present
92 | in the different documents. We can see the initial tokens before the
93 | grouping and their corresponding group assignment in the \code{group} variable:
94 | <<>>=
95 | asap$group
96 | @
97 | 
98 | The member variable \code{unprocessed} contains the initial data matrix
99 | before the feature selection and grouping step. If we want to
100 | reconstruct all replicates in the reduced feature space, we need the
101 | \code{getDuplicateData} function:
102 | <<>>=
103 | dim(getDuplicateData(asap))
104 | dim(asap$unprocessed)
105 | @
106 | This will blow up the reduced matrix to the full 10,000 initial data
107 | points in the reduced feature space. To see how often a specific
108 | entry in the reduced data matrix was present, we can have a look at
109 | the duplicate count:
110 | <<>>=
111 | asap$duplicatecount
112 | sum(asap$duplicatecount)
113 | @
114 | \section*{The Replicate-Aware Non-Negative Matrix Factorization (NMF)}
115 | 
116 | The replicate-aware NMF is a matrix factorization method which
117 | describes the data according to a new base vector system, i.e. each
118 | data point is described as a weighted sum of these base vectors. Thus,
119 | the base vectors can be seen as the parts of which a document is
120 | constructed. Furthermore, the new coordinates of a document (the base
121 | weights) can also be interpreted as a soft clustering. But before we
122 | can apply the NMF we need to specify the inner dimension of the
123 | factorization. This can either be supplied as a number (which should
124 | be even, if \code{pca.init} is \code{TRUE}), or as a
125 | \code{prismaDimension} object generated by the fully automated
126 | dimension estimation method:
127 | <<>>=
128 | asapDim = estimateDimension(asap)
129 | asapDim
130 | @
131 | Equipped with this object, we can now apply the NMF to the data:
132 | \begin{verbatim}
133 | > asapNMF = prismaNMF(asap, asapDim, time=60)
134 | Error: 3771.392
135 | Error: 3113.138
136 | Error: 2855.863
137 | Error: 2810.286
138 | Error: 2765.763
139 | Error: 2755.29
140 | Error: 2752.505
141 | > asapLabels = getMatrixFactorizationLabels(asapNMF)
142 | > table(asapLabels)
143 | asapLabels
144 |    1    2    3    4    5    6    7    8
145 |  623  607  602  660 1696 2473  817 2522
146 | \end{verbatim}
147 | We can look at the results via \code{plot(asapNMF)}, which is shown in
148 | Figure \ref{fig:asap}. We can see that the NMF extracts a
149 | \code{search} template, then the four \code{admin.php}-action
150 | templates, a Firefox template and two \code{static} templates, which
151 | reproduces the results in \cite{krueger10}, Section 3.1, with added
152 | user agents as ``noise''.
153 | 
154 | \begin{figure}[tb]
155 | \centering
156 | \includegraphics{asap}
157 | \caption{Result of the replicate-aware NMF on the \code{asap} data set.}
158 | \label{fig:asap}
159 | \end{figure}
160 | 
161 | \section*{Interface to the \pkg{tm} Package}
162 | 
163 | To allow the application of the replicate-aware NMF to corpora
164 | generated by the \pkg{tm} package \cite{feinerer08}, the \Prisma
165 | package contains a converter function which maps a \pkg{tm} corpus
166 | object to a \Prisma data object. We exemplify this procedure with an
167 | already stemmed and cleansed version of the 15 subsections of
168 | \cite{krueger2013}:
169 | 
170 | \begin{verbatim}
171 | > data(thesis)
172 | > thesis
173 | A corpus with 15 text documents
174 | > thesis = corpusToPrisma(thesis, NULL, TRUE)
175 | > thesis
176 | PRISMA data tm-Corpus
177 | Unprocessed data: # features: 2002 # entries: 15
178 | Processed data: # features: 2002 # entries: 15
179 | > thesisNMF = prismaNMF(thesis, 3, pca.init=FALSE)
180 | Error: 1329.73
181 | Error: 1310.481
182 | Error: 1295.959
183 | Error: 1295.509
184 | \end{verbatim}
185 | 
186 | Since we have just 15 documents, the feature reduction step and the
187 | correlation analysis suffer from too little data, which also holds
188 | true for the PCA-based initialization scheme. Thus, we skip all these
189 | processing steps and apply the NMF directly to the data with three
190 | components as an educated
191 | guess.
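Analogous to the asap analysis, a hard clustering of the sections can be
obtained from the fitted factorization (sketch, output omitted):
\begin{verbatim}
> thesisLabels = getMatrixFactorizationLabels(thesisNMF)
> table(thesisLabels)
\end{verbatim}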
To analyze the result we look at the top 20 words of the 192 | resulting base matrix: 193 | 194 | \begin{verbatim} 195 | > isQuantile = (t(thesisNMF$B) > apply(thesisNMF$B, 2, quantile, prob=.99)) 196 | > maxFeatures = apply(isQuantile, 1, function(r) which(r == 1)) 197 | > rownames(thesis$data)[maxFeatures[, 1]] 198 | [1] "add" "align" "associ" "cluster" "communic" "correct" 199 | [7] "extract" "fill" "format" "inner" "machin" "messag" 200 | [13] "obvious" "preserv" "reflect" "return" "simul" "templat" 201 | [19] "trace" "transit" "tri" 202 | > rownames(thesis$data)[maxFeatures[, 2]] 203 | [1] "behavior" "chang" "configur" "crossvalid" "drop" 204 | [6] "fast" "figur" "follow" "lead" "learn" 205 | [11] "lower" "observ" "optim" "overal" "procedur" 206 | [16] "process" "relat" "shown" "speed" "statist" 207 | [21] "use" 208 | > rownames(thesis$data)[maxFeatures[, 3]] 209 | [1] "addit" "applic" "approach" "attack" "base" "construct" 210 | [7] "content" "exploit" "method" "model" "network" "normal" 211 | [13] "protocol" "server" "similar" "simpl" "structur" "techniqu" 212 | [19] "token" "traffic" "use" 213 | \end{verbatim} 214 | 215 | These word stems accurately describe the contents of the three 216 | chapters of \cite{krueger2013} which concludes the analysis of this 217 | section. 218 | 219 | \bibliographystyle{plain} 220 | \bibliography{PRISMA} 221 | \end{document} 222 | -------------------------------------------------------------------------------- /R/matrixFactorization.R: -------------------------------------------------------------------------------- 1 | # public methods 2 | 3 | getMatrixFactorizationLabels = function(prismaMF) { 4 | labels = apply(prismaMF$C, 2, which.max) 5 | return(labels[prismaMF$remapper]) 6 | } 7 | 8 | prismaHclust = function(prismaData, ncomp, method="single") { 9 | mat = prismaData$data 10 | d = dist(t(mat), "binary") 11 | clust = hclust(d, method) 12 | labels = cutree(clust, k=ncomp) 13 | 14 | label2ind = split(1:length(labels), labels) 15 | B = sapply(label2ind, function(ind) apply(mat[, ind], 1, mean)) 16 | one = function(where) { 17 | ret = rep(0, ncomp) 18 | ret[where] = 1 19 | return(ret) 20 | } 21 | C = sapply(labels, function(l) one(l)) 22 | ret = list(B=B, C=C) 23 | rownames(ret$B) = rownames(mat) 24 | colnames(ret$B) = as.character(1:ncomp) 25 | colnames(ret$C) = colnames(mat) 26 | ret$type = "hclust" 27 | ret$remapper = prismaData$remapper 28 | class(ret) = "prismaMF" 29 | return(ret) 30 | } 31 | 32 | prismaDuplicatePCA = function(prismaData) { 33 | pca = sparsePCA(sparseCov(prismaData)) 34 | ret = list(B=pca$loadings, C=pca$scores, pca=pca) 35 | ret$type = "DuplicatePCA" 36 | ret$remapper = prismaData$remapper 37 | class(ret) = "prismaMF" 38 | return(ret) 39 | } 40 | 41 | prismaNMF = function(prismaData, ncomp, time=60, pca.init=TRUE, doNorm=TRUE, oldResult=NULL) { 42 | mat = prismaData$data 43 | B = NULL 44 | if (!(is.null(oldResult))) { 45 | B = oldResult$B 46 | k = ncol(B) 47 | } 48 | else if (pca.init) { 49 | # genBase duplicates the input, therefore we just take half of the components 50 | if (class(ncomp) == "prismaDimension") { 51 | k = ncomp$dim %/% 2 52 | B = genBase(ncomp$pca$B[, 1:k]) 53 | } 54 | else { 55 | pca = prismaDuplicatePCA(prismaData) 56 | k = ncomp %/% 2 57 | B = genBase(pca$B[, 1:k]) 58 | } 59 | k = 2*k 60 | } 61 | else { 62 | k = ncomp 63 | } 64 | weights = prismaData$duplicatecount 65 | ret = pmf(mat, k, calcTime=time, B=B, doNorm=doNorm, weights=weights) 66 | 67 | rownames(ret$B) = rownames(mat) 68 | 69 | ret$remapper = 
prismaData$remapper 70 | colnames(ret$C) = colnames(mat) 71 | class(ret) = "prismaMF" 72 | return(ret) 73 | } 74 | 75 | plot.prismaMF = function(x, nLines=NULL, baseIndex=NULL, sampleIndex=NULL, minValue=NULL, noRowClustering=FALSE, noColClustering=FALSE, type=c("base", "coordinates"), ...) { 76 | mf=x 77 | type = match.arg(type) 78 | if (type == "base") { 79 | B = mf$B 80 | if (!is.null(minValue)) { 81 | B[B < minValue] = 0 82 | } 83 | plotMatrixFactor(B, nLines, baseIndex, noRowClustering, noColClustering) 84 | } 85 | else if (type == "coordinates") { 86 | C = mf$C 87 | if (!is.null(sampleIndex)) { 88 | C = C[, sampleIndex] 89 | } 90 | if (!is.null(minValue)) { 91 | C[C < minValue] = 0 92 | } 93 | plotMatrixFactor(t(C), nLines, baseIndex) 94 | } 95 | else { 96 | stop("Unknown plot type!") 97 | } 98 | } 99 | 100 | # private methods 101 | 102 | sparseCov = function(prismaData) { 103 | N = length(prismaData$remapper) 104 | k = nrow(prismaData$data) 105 | x = rep(NA, k*k) 106 | # efficient mean calculation 107 | fmean = colSums(t(prismaData$data) * prismaData$duplicatecount) / N 108 | scount = sqrt(prismaData$duplicatecount) 109 | centeredData = t(prismaData$data - fmean) 110 | centered = centeredData * scount 111 | theCov = new("dsyMatrix", Dim = c(k, k), x=as.numeric(t(centered) %*% centered / (N - 1))) 112 | dimnames(theCov) = list(rownames(prismaData$data), rownames(prismaData$data)) 113 | return(list(cov=theCov, centeredData=centeredData, center=fmean)) 114 | } 115 | 116 | sparsePCA = function(sparsecov) { 117 | # here we emulate the princom method... some parts of it are "reused" here 118 | cl = match.call() 119 | cl[[1L]] = as.name("sparsePCA") 120 | cv = sparsecov$cov 121 | z = sparsecov$centeredData 122 | n.obs = nrow(z) 123 | cen = sparsecov$center 124 | edc = eigen(cv, symmetric = TRUE) 125 | ev = edc$values 126 | evec = edc$vectors 127 | if (any(neg <- ev < 0)) { 128 | # throw away negative eigenvalues 129 | pos = which(!neg) 130 | ev = ev[pos] 131 | evec = evec[, pos] 132 | } 133 | cn = paste0("Comp.", 1L:ncol(evec)) 134 | names(ev) = cn 135 | dimnames(evec) = list(dimnames(cv)[[2L]], cn) 136 | sdev = sqrt(ev) 137 | scr = t(z %*% evec) 138 | edc = list(sdev = sdev, loadings=evec, 139 | center = cen, n.obs = n.obs, scores = scr, call = cl) 140 | class(edc) = "princomp" 141 | return(edc) 142 | } 143 | 144 | reconstructSparsePCA = function(spca) { 145 | rec = spca$loadings %*% spca$scores + spca$center 146 | return(rec) 147 | } 148 | 149 | plotMatrixFactor = function(B, n.lines=NULL, base.index=NULL, noRowClustering=FALSE, noColClustering=FALSE) { 150 | #require(gplots) 151 | B = as.matrix(B) 152 | if (!is.null(base.index)) { 153 | B = B[, base.index] 154 | } 155 | if (!is.null(n.lines)) { 156 | B = calcLinesPerThreshold(B, n.lines) 157 | } 158 | if (noRowClustering) { 159 | row.clust = NULL 160 | dendrogram = "column" 161 | } 162 | else { 163 | row.clust = as.dendrogram(hclust(dist(B, method="euclidean"), method="complete")) 164 | dendrogram = "both" 165 | } 166 | if (noColClustering) { 167 | col.clust = NULL 168 | dendrogram = ifelse(dendrogram == "both", "row", "none") 169 | } 170 | else { 171 | col.clust = as.dendrogram(hclust(dist(t(B), method="binary"), method="complete")) 172 | } 173 | breaks = c(0, seq(min(B[B>0])-1e-9, max(B), length=15)) 174 | heatmap.2(B, Rowv=row.clust, Colv=col.clust, dendrogram=dendrogram, trace="none", breaks=breaks, col=function(n) gray(c(1, seq(0.8, 0, length=n-1)))) 175 | } 176 | 177 | genBase = function(B) { 178 | negB = -B 179 | negB[negB < 0] = 
0 180 | mat = cbind(negB, B + negB) 181 | colnames(mat) = c(paste(colnames(B), "neg", sep="."), paste(colnames(B), "pos", sep=".")) 182 | return(mat) 183 | } 184 | 185 | normBase = function(ret) { 186 | r = ncol(ret$B) 187 | nfeats = nrow(ret$B) 188 | # norm the basis 189 | norms = sqrt(apply(ret$B^2, 2, sum)) 190 | # look for all-zero base 191 | allZero = (norms == 0) 192 | ret$B = ret$B[, !allZero] 193 | ret$C = ret$C[!allZero, ] 194 | r = r - sum(allZero) 195 | ret$B = ret$B / rep(norms[!allZero], rep(nfeats, r)) 196 | ret$C = ret$C * norms[!allZero] 197 | return(ret) 198 | } 199 | 200 | calcDatacluster = function(ret) { 201 | labels = apply(ret$C, 2, which.max) 202 | return(labels) 203 | } 204 | 205 | calcLinesPerThreshold = function(B, n.lines) { 206 | allVals = unique(sort(B, decreasing=TRUE)) 207 | allMax = apply(B, 1, max) 208 | lines = sapply(allVals, function(v) sum(allMax > v)) 209 | min.value = allVals[which.min(lines <= n.lines)-1] 210 | B = B[apply(B, 1, function(r) any(r > min.value)), ] 211 | return(B) 212 | } 213 | 214 | # pimped speed by crossprod... 215 | # see Least Squares Calculations in R by D.M. Bates in R News, 4/1:17-20 216 | RRbyCV = function(Y, D, fold=5, lambdas=10^(-4:2), weights=NULL) { 217 | # Y is the data assumed to be [# samples X # vars] 218 | N = nrow(Y) 219 | F = ncol(D) 220 | Nfold = floor(N / fold) 221 | res = matrix(0, length(lambdas), fold, dimnames=list(as.character(lambdas), NULL)) 222 | if (!is.null(weights)) { 223 | sweights = sqrt(weights) 224 | } 225 | for (l in lambdas) { 226 | for (f in 1:fold) { 227 | index = ((f-1)*Nfold + 1):(f * Nfold) 228 | Sub = D[-index, ] 229 | # estimate the coefficients on the subsample 230 | if (is.null(weights)) { 231 | #Beta1 = solve(t(Sub) %*% Sub + diag(l, F)) %*% t(Sub) %*% Y[-index, ] 232 | Beta2 = solve(crossprod(Sub) + diag(l, F), crossprod(Sub, Y[-index, ])) 233 | #cat(" No weights", round(mean(abs(Beta1 - Beta2)), 6)) 234 | Beta = Beta2 235 | res[as.character(l), f] = sqrt(sum((Y[index, ] - (D[index, ] %*% Beta))^2)) 236 | } 237 | else { 238 | #W = Diagonal(x=weights[-index]) 239 | #Beta1 = solve(t(Sub) %*% W %*% Sub + diag(l, F)) %*% t(Sub) %*% W %*% Y[-index, ] 240 | W = Diagonal(x=sweights[-index]) 241 | WSub = W %*% Sub 242 | WY = W %*% Y[-index, ] 243 | Beta2 = solve(crossprod(WSub) + diag(l, F), crossprod(WSub, WY)) 244 | #cat(" Weights", round(mean(abs(Beta1 - Beta2)), 6)) 245 | Beta = Beta2 246 | res[as.character(l), f] = sqrt(sum((Y[index, ] - (D[index, ] %*% Beta))^2)) 247 | } 248 | } 249 | } 250 | return(lambdas[which.min(apply(res, 1, mean))]) 251 | } 252 | 253 | pmf = function(A, r, calcTime, B=NULL, doNorm=TRUE, weights=NULL) { 254 | #require(Matrix) 255 | # A should contain the samples in the cols! 256 | nsamples = ncol(A) 257 | nfeats = nrow(A) 258 | if (is.null(weights)) { 259 | W = Diagonal(nsamples) 260 | weights = rep(1, nsamples) 261 | # SW = W 262 | } 263 | else { 264 | W = Diagonal(nsamples, weights) 265 | # SW = Diagonal(nsamples, sqrt(weights)) 266 | } 267 | # the new basis 268 | if (is.null(B)) { 269 | B = abs(matrix(rnorm(nfeats * r), nfeats, r)) 270 | } 271 | olderror = Inf 272 | iter = 0 273 | startTime = proc.time()[3] 274 | while (TRUE) { 275 | lambda = RRbyCV(A, B, weights=NULL) 276 | #S = t(B) %*% B + diag(lambda, r, r) 277 | S = crossprod(B) + diag(lambda, r, r) 278 | 279 | # faster? 
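    # (solve(S, crossprod(B, A)) solves the ridge system
    #  (t(B) %*% B + lambda*I) %*% C = t(B) %*% A in one step without forming
    #  the transpose explicitly, cf. the Bates reference above)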
280 | C = solve(S, crossprod(B, A)) 281 | # Cold = solve(S, t(B) %*% A) 282 | # cat(" No weights", round(mean(abs(C - Cold)), 6)) 283 | # C = solve(S) %*% (t(B) %*% A) 284 | # set all negative coordinates to 0 285 | C[C < 0] = 0 286 | 287 | lambda = RRbyCV(t(A), t(C), weights=weights) 288 | #WtC = SW %*% t(C) 289 | #S = crossprod(WtC) + diag(lambda, r, r) 290 | S = C %*% W %*% t(C) + diag(lambda, r, r) 291 | 292 | # faster? 293 | #B = t(solve(S, crossprod(WtC, SW %*% t(A)))) 294 | B = t(solve(S, C %*% W %*% t(A))) 295 | #B = (A %*% W %*% t(C)) %*% solve(S) 296 | B[B < 0] = 0 297 | if (doNorm) { 298 | # norm the basis 299 | norms = sqrt(apply(B^2, 2, sum)) 300 | # look for all-zero base 301 | allZero = (norms == 0) 302 | B = B[, !allZero, drop=FALSE] 303 | C = C[!allZero, , drop=FALSE] 304 | r = r - sum(allZero) 305 | B = B / rep(norms[!allZero], rep(nfeats, r)) 306 | C = C * norms[!allZero] 307 | } 308 | iter = iter + 1 309 | timeElapsed = proc.time()[3] - startTime 310 | if (iter %% 10 == 0) { 311 | error = .5 * sum(colSums((A - B %*% C)^2) * weights) 312 | cat("Error:", error, "\n") 313 | if (abs(olderror - error) < 1e-9) { 314 | break 315 | } 316 | olderror = error 317 | } 318 | if (timeElapsed >= calcTime) { 319 | break 320 | } 321 | } 322 | ret = list(B=B, C=C) 323 | return(ret) 324 | } 325 | -------------------------------------------------------------------------------- /R/prisma.R: -------------------------------------------------------------------------------- 1 | # public functions: 2 | loadPrismaData = function(path, maxLines=-1, fastSally=TRUE, alpha=.05, skipFeatureCorrelation=FALSE) { 3 | data = readPrismaInput(path, maxLines, fastSally) 4 | data = preprocessPrismaData(data, alpha, skipFeatureCorrelation) 5 | data$path = path 6 | class(data) = "prisma" 7 | return(data) 8 | } 9 | 10 | getDuplicateData = function(prismaData) { 11 | return(prismaData$data[, prismaData$remapper]) 12 | } 13 | 14 | corpusToPrisma = function(corpus, alpha=.05, skipFeatureCorrelation=FALSE) { 15 | #require(Matrix) 16 | if (requireNamespace("tm", quietly = TRUE) && packageVersion("tm") >= '0.6') { 17 | #require(tm) 18 | tdm = tm::TermDocumentMatrix(corpus) 19 | data = list(data=Matrix(as.matrix(tdm))) 20 | data = preprocessPrismaData(data, alpha, skipFeatureCorrelation) 21 | data$path = "tm-Corpus" 22 | class(data) = "prisma" 23 | return(data) 24 | } 25 | else { 26 | stop("Need package tm (>='0.6')") 27 | } 28 | } 29 | 30 | 31 | print.prisma = function(x, ...) { 32 | prismaData=x 33 | cat("PRISMA data", prismaData$path, "\n") 34 | cat("Unprocessed data: # features:", nrow(prismaData$unprocessed), 35 | "# entries:", ncol(prismaData$unprocessed), "\n") 36 | cat("Processed data: # features:", nrow(prismaData$data), 37 | "# entries:", ncol(prismaData$data), "\n") 38 | } 39 | 40 | plot.prisma = function(x, ...) 
{
41 |     prismaData=x
42 |     image(prismaData$data)
43 | }
44 | 
45 | # private functions:
46 | readFSally = function(path, maxLines=-1) {
47 |     #require(Matrix)
48 |     f = file(path)
49 |     cat("Reading data...\n")
50 |     data = readLines(f)
51 |     cat("Splitting ngrams...\n")
52 |     ngrams = strsplit(data, " ", fixed=TRUE)
53 |     total = length(data)
54 |     allNgrams = ngrams[[total]]
55 |     close(f)
56 |     cat("Calc indices...\n")
57 |     indices = match(unlist(ngrams[-total]), allNgrams)
58 |     cat("Setup matrix...\n")
59 |     N = total-1
60 |     mat = sparseMatrix(indices, rep(1:N, sapply(ngrams[-total], length)),
61 |                        x=1,
62 |                        dims=c(length(allNgrams), N),
63 |                        dimnames=list(allNgrams, paste("line", 1:N, sep="")))
64 |     if (maxLines > 0) {
65 |         return(mat[, 1:maxLines])
66 |     }
67 |     else {
68 |         return(mat)
69 |     }
70 | }
71 | 
72 | readSally = function(path, maxLines=-1) {
73 |     #require(Matrix)
74 |     f = file(path)
75 |     data = scan(f, what="char", sep=" ", quote="", quiet=TRUE, comment.char="", skip=1, nlines=maxLines)
76 |     close(f)
77 |     rawngrams = data[c(TRUE, FALSE)]
78 |     origin = data[c(FALSE, TRUE)]
79 |     processNgram = function(cv) {
80 |         ret = cv[3]
81 |         names(ret) = cv[2]
82 |         return(ret)
83 |     }
84 |     ngrams = lapply(strsplit(rawngrams, ",", fixed=TRUE), function(obj) sapply(strsplit(obj, ":", fixed=TRUE), processNgram))
85 |     allNgrams = unique(unlist(lapply(ngrams, function(ngram) names(ngram)), use.names=FALSE))
86 |     indices = unlist(lapply(ngrams, function(ngram) match(names(ngram), allNgrams)), use.names=FALSE)
87 |     # generate a matrix in ml-style: rows are the features, cols are the samples
88 |     mat = sparseMatrix(indices, rep(1:length(ngrams), sapply(ngrams, length)), x= as.numeric(unlist(ngrams, use.names=FALSE)), dims=c(length(allNgrams), length(ngrams)), dimnames=list(allNgrams, origin))
89 |     return(mat)
90 | }
91 | 
92 | readHarry = function(path, maxLines=-1) {
93 |     harry = read.table(path, sep="\t", quote="", comment.char="",
94 |                        as.is=TRUE, header=TRUE, nrows=maxLines)
95 |     return(harry)
96 | }
97 | 
98 | readRaw = function(path, maxLines=-1) {
99 |     f = file(path)
100 |     raw = readLines(f, n=maxLines)
101 |     close(f)
102 |     #rawsplit = strsplit(raw, " ", fixed=TRUE)
103 |     return(raw)
104 | }
105 | 
106 | readPrismaInput = function(path, maxLines=-1, fastSally=TRUE) {
107 |     if (fastSally) {
108 |         sally = readFSally(sprintf("%s.fsally", path), maxLines)
109 |     }
110 |     else {
111 |         sally = readSally(sprintf("%s.sally", path), maxLines)
112 |     }
113 |     data = list(data=sally)
114 |     hfile = sprintf("%s.harry", path)
115 |     if (file.exists(hfile) && file.access(hfile, mode=4) == 0) { # file.access() returns 0 on success
116 |         data$annotation = readHarry(hfile, maxLines)
117 |     }
118 |     rfile = sprintf("%s.rawquoted", path)
119 |     if (file.exists(rfile) && file.access(rfile, mode=4) == 0) { # 0 == readable
120 |         data$raw = readRaw(rfile, maxLines)
121 |     }
122 |     return(data)
123 | }
124 | 
125 | duplicateRemover = function(data) {
126 |     if (inherits(data, "Matrix")) {
127 |         classes = calcClassForSparseMatrix(data)
128 |     }
129 |     else {
130 |         classes = sapply(1:ncol(data), function(colIndex) paste(which(data[, colIndex] == 1), collapse=" "))
131 |     }
132 |     classCount = table(classes)
133 |     uniqueClasses = names(classCount)
134 |     # just pick the first data point for each class:
135 |     classIndex = sapply(uniqueClasses, function(cl) match(cl, classes))
136 |     data = data[, classIndex]
137 |     remapper = sapply(classes, function(cl) match(cl, uniqueClasses))
138 |     return(list(data=data, remapper=remapper, count=classCount))
139 | }
140 | 
141 | calcClassForSparseMatrix = function(data) {
142 |     i
= data@i 143 | dp = c(0, diff(data@p)) 144 | csdp = cumsum(dp) 145 | oneClass = function(index) { 146 | from = csdp[index]+1 147 | to = csdp[index+1] 148 | if (from > to) { 149 | # zero entry 150 | return("") 151 | } 152 | else { 153 | return(paste(i[from:to], collapse=" ")) 154 | } 155 | } 156 | sapply(1:ncol(data), oneClass) 157 | } 158 | 159 | preprocessPrismaData =function(data, alpha=.05, skipFeatureCorrelation=FALSE) { 160 | data$unprocessed = data$data 161 | processed = filterDataByTestAndCor(data$data, alpha, skipFeatureCorrelation) 162 | duplicatesRemoved = duplicateRemover(processed$mat) 163 | data$data = duplicatesRemoved$data 164 | data$remapper = duplicatesRemoved$remapper 165 | data$duplicatecount = as.vector(duplicatesRemoved$count) 166 | 167 | data$group = processed$group 168 | data$occAlways = processed$always 169 | data$occNever = processed$never 170 | 171 | return(data) 172 | } 173 | 174 | count2freq = function(mat) { 175 | # use the samples x features view for simpler calculation 176 | mat = t(mat) 177 | return(t(mat / rowSums(mat))) 178 | } 179 | 180 | count2bin = function(mat) { 181 | #require(Matrix) 182 | if (inherits(mat, "TsparseMatrix")) { 183 | ret = mat 184 | } 185 | else if (inherits(mat, "CsparseMatrix")) { 186 | ret = sparseMatrix(mat@i+1, p=mat@p, x=1, dims=mat@Dim, dimnames=mat@Dimnames) 187 | } 188 | else { 189 | ret = as.matrix(mat) 190 | ret[ret > 0] = 1 191 | } 192 | return(ret) 193 | } 194 | 195 | groupCorrelatedNgrams = function(data) { 196 | nfeats = nrow(data) 197 | ndocs = ncol(data) 198 | toCheck = 1:nfeats 199 | groups = rep(-1, nfeats) 200 | groupCount = 1 201 | # is it possible to calculate correlations on sparse matrices? 202 | #mat = as.matrix(data) 203 | mat = data 204 | while (length(toCheck) > 0) { 205 | cat("to check:", length(toCheck), "\n") 206 | if (length(toCheck) == 1) { 207 | curCor = 1 208 | } 209 | else { 210 | curCor = sparse.cor(mat[toCheck, ]) 211 | } 212 | group = toCheck[curCor == 1] 213 | groups[group] = groupCount 214 | groupCount = groupCount + 1 215 | toCheck = toCheck[curCor != 1] 216 | #cat(data$str[group], "\n") 217 | } 218 | return(groups) 219 | } 220 | 221 | sparse.cor <- function(X){ 222 | docsWithFeature = (X[1, ] != 0) 223 | onDocs = sum(docsWithFeature) 224 | offDocs = ncol(X) - onDocs 225 | ret = rep(0, nrow(X)) 226 | ret[1] = 1 227 | if (onDocs >= 1) { 228 | onFeatureDocs = X[, docsWithFeature] 229 | offFeatureDocs = X[, !docsWithFeature] 230 | if (onDocs > 1) { 231 | # we have more than one document for this feature... 232 | # so calculate the number of documents for this feature 233 | onFeatureDocs = rowSums(onFeatureDocs) 234 | } 235 | if (offDocs > 1) { 236 | offFeatureDocs = rowSums(offFeatureDocs) 237 | } 238 | # just set the correlation to one, if the number of 239 | # documents, in which the feature is turned of, is zero 240 | # and the number of documents, in which the feature is on, is the same 241 | ret[(offFeatureDocs == 0) & (onFeatureDocs == onDocs)] = 1 242 | } 243 | return(ret) 244 | } 245 | 246 | compressByGroup = function(data) { 247 | features = rownames(data) 248 | groups = groupCorrelatedNgrams(data) 249 | indByG = split(1:length(groups), groups) 250 | names(groups) = features 251 | newDimNames = sapply(indByG, function(g) paste(features[g], collapse=" ")) 252 | # just keep the first feature of the group... 
253 | # since the rest contains the same information (cor=1) 254 | data = data[sapply(indByG, function(g) g[1]), ] 255 | rownames(data) = newDimNames 256 | return(list(data=data, group=groups)) 257 | } 258 | 259 | # data should be binary and unnormalized! 260 | # hmmm... the "normal" testing weirdness of thinking-negative: 261 | # never = ttestNgrams(data, 0, "greater") 262 | # we would keep these... 263 | # data$str[p.adjust(never, "bonf") < 0.05] 264 | ## [1] "\nAcc" "\nHos" " */*" " HTT" " cgi" " www" "*/*\n" ".1\nH" ".com" 265 | ## [10] ".foo" ".php" "/1.1" "/sea" "1\nHo" "1.1\n" ": */" ": ww" "Acce" 266 | ## [19] "ET c" "GET " "HTTP" "Host" "P/1." "T cg" "TP/1" "TTP/" "ar.c" 267 | ## [28] "arch" "bar." "ccep" "cept" "cgi/" "ch.p" "com\n" "earc" "ept:" 268 | ## [37] "foob" "gi/s" "h.ph" "hp?s" "i/se" "m\nAc" "obar" "om\nA" "ooba" 269 | ## [46] "ost:" "p?s=" "php?" "pt: " "r.co" "rch." "sear" "st: " "t: *" 270 | ## [55] "t: w" "w.fo" "ww.f" "www." "&par" "/adm" "=ren" "?act" "acti" 271 | ## [64] "admi" "ame&" "ctio" "dmin" "e&pa" "enam" "gi/a" "hp?a" "i/ad" 272 | ## [73] "in.p" "ion=" "me&p" "min." "n.ph" "n=re" "name" "on=r" "p?ac" 273 | ## [82] "par=" "rena" "tion" " sta" ".htm" "ET s" "T st" "atic" "html" 274 | ## [91] "l HT" "ml H" "stat" "tati" "tic/" "tml " "=mov" "move" "n=mo" 275 | ## [100] "on=m" "ove&" "ve&p" "=sho" "how&" "n=sh" "on=s" "ow&p" "show" 276 | ## [109] "w&pa" "=del" "dele" "elet" "ete&" "lete" "n=de" "on=d" "te&p" 277 | ## [118] "G HT" 278 | # always = ttestNgrams(data, 1, "less") 279 | # ...and drop these... 280 | # data$str[p.adjust(always, "bonf") > 0.05] 281 | ## [1] "\nAcc" "\nHos" " */*" " HTT" " www" "*/*\n" ".1\nH" ".com" ".foo" 282 | ## [10] "/1.1" "1\nHo" "1.1\n" ": */" ": ww" "Acce" "GET " "HTTP" "Host" 283 | ## [19] "P/1." "TP/1" "TTP/" "ar.c" "bar." "ccep" "cept" "com\n" "ept:" 284 | ## [28] "foob" "m\nAc" "obar" "om\nA" "ooba" "ost:" "pt: " "r.co" "st: " 285 | ## [37] "t: *" "t: w" "w.fo" "ww.f" "www." 286 | # So finally just keep these: 287 | # data$str[p.adjust(always, "bonf") < 0.05 & p.adjust(never, "bonf") < 0.05] 288 | ttestNgrams = function(data, mu, alternative=c("greater", "less")) { 289 | #require(Matrix) 290 | alternative <- match.arg(alternative) 291 | N = ncol(data) 292 | nfeats = nrow(data) 293 | muNgram = rowMeans(data) * N 294 | # some sources give 5, other 10 as a factor, of when the normal approx. works... 295 | # we just take the average here. 
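    # (i.e. we demand an expected count of about 7.5 hits before trusting the
    #  Gaussian approximation of the binomial distribution used below)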
296 | mu = ifelse(mu == 0, 7.5/N, 1 - (7.5/N)) 297 | theVar = sqrt(N * mu * (1 - mu)) 298 | M = mu * N 299 | if (alternative == "greater") { 300 | pValues = sapply(muNgram, function(m) pnorm((m - M) / theVar, lower.tail = FALSE)) 301 | } 302 | if (alternative == "less") { 303 | pValues = sapply(muNgram, function(m) pnorm((m - M) / theVar, lower.tail = TRUE)) 304 | } 305 | return(pValues) 306 | } 307 | 308 | filterDataByTestAndCor = function(data, alpha=0.05, skipFeatureCorrelation=FALSE) { 309 | data = count2bin(data) 310 | if (is.null(alpha)) { 311 | #keep = (alwaysP != 1) 312 | keep = rep(TRUE, nrow(data)) 313 | } 314 | else { 315 | never = ttestNgrams(data, 0, "greater") 316 | always = ttestNgrams(data, 1, "less") 317 | 318 | alwaysP = p.adjust(always, "holm") 319 | neverP = p.adjust(never, "holm") 320 | keep = (alwaysP < alpha & neverP < alpha) 321 | } 322 | allStr = rownames(data) 323 | fdata = data[keep, ] 324 | if (skipFeatureCorrelation) { 325 | features = rownames(fdata) 326 | groups = 1:length(features) 327 | names(groups) = features 328 | dataAndGroup =list(data=fdata, group=groups) 329 | } 330 | else { 331 | dataAndGroup = compressByGroup(fdata) 332 | } 333 | if (is.null(alpha)) { 334 | #always = allStr[(alwaysP == 1)] 335 | always = c() 336 | never = c() 337 | } 338 | else { 339 | always = allStr[(alwaysP >= alpha)] 340 | never = allStr[(neverP >= alpha)] 341 | } 342 | return(list(mat=dataAndGroup$data, group=dataAndGroup$group, always=always, never=never)) 343 | } 344 | --------------------------------------------------------------------------------