├── data
│   ├── asap.rda
│   └── thesis.rda
├── vignettes
│   ├── asap.pdf
│   ├── PRISMA.pdf
│   ├── PRISMA.bib
│   └── PRISMA.Rnw
├── inst
│   └── extdata
│       ├── asap.tar.gz
│       ├── README
│       └── sallyPreprocessing.py
├── .gitignore
├── README.md
├── man
│   ├── thesis.Rd
│   ├── asap.Rd
│   ├── prismaDuplicatePCA.Rd
│   ├── generics.Rd
│   ├── generics_dimension.Rd
│   ├── getDuplicateData.Rd
│   ├── PRISMA-package.Rd
│   ├── getMatrixFactorizationLabels.Rd
│   ├── corpusToPrisma.Rd
│   ├── prismaHclust.Rd
│   ├── estimateDimension.Rd
│   ├── generics_mf.Rd
│   ├── loadPrismaData.Rd
│   └── prismaNMF.Rd
├── NAMESPACE
├── DESCRIPTION
└── R
    ├── dimensionEstimation.R
    ├── matrixFactorization.R
    └── prisma.R

--------------------------------------------------------------------------------
/data/asap.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/data/asap.rda

--------------------------------------------------------------------------------
/data/thesis.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/data/thesis.rda

--------------------------------------------------------------------------------
/vignettes/asap.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/vignettes/asap.pdf

--------------------------------------------------------------------------------
/vignettes/PRISMA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/vignettes/PRISMA.pdf

--------------------------------------------------------------------------------
/inst/extdata/asap.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tammok/PRISMA/HEAD/inst/extdata/asap.tar.gz

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | 
4 | # Example code in package build process
5 | *-Ex.R
6 | *~

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PRISMA
2 | ======
3 | 
4 | Protocol Inspection and State Machine Analysis
5 | 
6 | The package PRISMA is hosted on CRAN, so
7 | 
8 |     install.packages("PRISMA")
9 |     library(PRISMA)
10 |     example(PRISMA)
11 |     vignette("PRISMA")
12 | 
13 | will give you a first impression.
14 | 
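To try it on your own sally output, where `mydata` is a hypothetical file
stem (i.e. a `mydata.sally`/`mydata.fsally` pair produced as described in
the package vignette):

    d = loadPrismaData("mydata")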
--------------------------------------------------------------------------------
/man/thesis.Rd:
--------------------------------------------------------------------------------
1 | \name{thesis}
2 | \docType{data}
3 | \alias{thesis}
4 | \title{The Thesis Data Set}
5 | \description{
6 | The 15 sections of a thesis (see references) as a tm-corpus.
7 | }
8 | \usage{thesis}
9 | \format{A tm-corpus.}
10 | \references{
11 | Tammo Krueger. \emph{Probabilistic Methods for Network Security. From Analysis to Response.} PhD thesis,
12 | TU Berlin, 2013. \url{http://opus.kobv.de/tuberlin/volltexte/2013/3881/}
13 | }
14 | \author{
15 | Tammo Krueger
16 | }
17 | \keyword{datasets}

--------------------------------------------------------------------------------
/man/asap.Rd:
--------------------------------------------------------------------------------
1 | \name{asap}
2 | \docType{data}
3 | \alias{asap}
4 | \title{The ASAP Data Set}
5 | \description{
6 | Toy data set to show the capabilities of the PRISMA package.
7 | }
8 | \usage{asap}
9 | \format{A prisma object.}
10 | \references{
11 | Krueger, T., Kraemer, N., Rieck, K. (2011)
12 | ASAP: Automatic Semantics-Aware Analysis of Network Payloads
13 | \emph{Privacy and Security Issues in Data Mining and Machine Learning - International ECML/PKDD Workshop. Lecture Notes in Computer Science 6549}, Springer. 50 - 63
14 | }
15 | \author{
16 | Tammo Krueger
17 | }
18 | \keyword{datasets}

--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | S3method(plot, prisma)
2 | S3method(print, prisma)
3 | S3method(plot, prismaDimension)
4 | S3method(print, prismaDimension)
5 | S3method(plot, prismaMF)
6 | export(prismaHclust, prismaDuplicatePCA, prismaNMF, loadPrismaData, getDuplicateData, corpusToPrisma, estimateDimension, plot.prisma, print.prisma, plot.prismaDimension, print.prismaDimension, plot.prismaMF, getMatrixFactorizationLabels)
7 | import(Matrix, gplots, ggplot2)
8 | importFrom("grDevices", "gray")
9 | importFrom("methods", "new")
10 | importFrom("stats", "as.dendrogram", "cutree", "dist", "hclust", "p.adjust", "pnorm", "prcomp", "qnorm", "rnorm", "var")
11 | importFrom("utils", "packageVersion", "read.table")

--------------------------------------------------------------------------------
/inst/extdata/README:
--------------------------------------------------------------------------------
1 | This folder contains an example file to show the preprocessing step
2 | with the sally toolkit (see http://www.mlsec.org/sally/). Before
3 | executing the examples, please extract asap.tar.gz to find all data
4 | necessary to understand the processing chain from the raw data
5 | (asap.raw) to the sally file (asap.sally) and the optimized file
6 | (asap.fsally). The asap.sally file can be produced as follows:
7 | 
8 | sally -c asap.cfg asap.raw asap.sally
9 | 
10 | This call generates asap.sally from the raw data found in asap.raw. To
11 | speed up the loading of the data in R, one should apply the
12 | sallyPreprocessing.py Python script as follows:
13 | 
14 | python sallyPreprocessing.py asap.sally asap.fsally
15 | 
16 | Now the data is ready to be efficiently loaded and processed in R.

--------------------------------------------------------------------------------
/man/prismaDuplicatePCA.Rd:
--------------------------------------------------------------------------------
1 | \name{prismaDuplicatePCA}
2 | \alias{prismaDuplicatePCA}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Matrix Factorization Based on Replicate-Aware PCA
6 | }
7 | \description{
8 | Efficient implementation of a replicate-aware principal component
9 | analysis (PCA).
10 | }
11 | \usage{
12 | prismaDuplicatePCA(prismaData)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 | \item{prismaData}{
17 | PRISMA data for which a PCA should be calculated
18 | }
19 | }
20 | \value{
21 | \item{prismaPCA}{Matrix factorization object \eqn{A = B C}, in which the
22 | factors are calculated by a replicate-aware PCA}
23 | }
24 | \author{
25 | Tammo Krueger
26 | }
27 | \examples{
28 | # please see the vignette for examples
29 | }
30 | 
--------------------------------------------------------------------------------
/man/generics.Rd:
--------------------------------------------------------------------------------
1 | \name{plot.prisma}
2 | \alias{plot.prisma}
3 | \alias{print.prisma}
4 | %- Also NEED an '\alias' for EACH other topic documented here.
5 | \title{
6 | Generics For PRISMA Objects
7 | }
8 | \description{
9 | Print and plot generics for PRISMA objects.
10 | }
11 | \usage{
12 | \method{print}{prisma}(x, ...)
13 | \method{plot}{prisma}(x, ...)
14 | }
15 | %- maybe also 'usage' for other objects documented here.
16 | \arguments{
17 | \item{x}{
18 | PRISMA data loaded via \code{\link{loadPrismaData}}
19 | }
20 | 
21 | \item{...}{
22 | not used
23 | }
24 | }
25 | \author{
26 | Tammo Krueger
27 | }
28 | \seealso{
29 | \code{\link{estimateDimension}}, \code{\link{prismaHclust}}, \code{\link{prismaDuplicatePCA}}, \code{\link{prismaNMF}}
30 | }
31 | \examples{
32 | data(asap)
33 | print(asap)
34 | plot(asap)
35 | 
36 | }

--------------------------------------------------------------------------------
/man/generics_dimension.Rd:
--------------------------------------------------------------------------------
1 | \name{plot.prismaDimension}
2 | \alias{plot.prismaDimension}
3 | \alias{print.prismaDimension}
4 | %- Also NEED an '\alias' for EACH other topic documented here.
5 | \title{
6 | Generics For PRISMA Dimension Objects
7 | }
8 | \description{
9 | Print and plot generics for PRISMA dimension objects.
10 | }
11 | \usage{
12 | \method{print}{prismaDimension}(x, ...)
13 | \method{plot}{prismaDimension}(x, ...)
14 | }
15 | %- maybe also 'usage' for other objects documented here.
16 | \arguments{
17 | \item{x}{
18 | PRISMA dimension object generated via \code{\link{estimateDimension}}
19 | }
20 | \item{...}{
21 | not used
22 | }
23 | }
24 | \author{
25 | Tammo Krueger
26 | }
27 | \seealso{
28 | \code{\link{estimateDimension}}, \code{\link{prismaHclust}}, \code{\link{prismaDuplicatePCA}}, \code{\link{prismaNMF}}
29 | }
30 | \examples{
31 | # please see the vignette for examples
32 | }

--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: PRISMA
2 | Type: Package
3 | Title: Protocol Inspection and State Machine Analysis
4 | Version: 0.2-7
5 | Date: 2018-05-26
6 | Depends:
7 |     Matrix,
8 |     gplots,
9 |     methods,
10 |     ggplot2
11 | Suggests:
12 |     tm (>= 0.6)
13 | Author: Tammo Krueger, Nicole Kraemer
14 | Maintainer: Tammo Krueger
15 | Description: Loads and processes huge text
16 |     corpora processed with the sally toolbox (<http://www.mlsec.org/sally/>).
17 |     sally acts as a very fast preprocessor which splits the text files into
18 |     tokens or n-grams. These output files can then be read with the PRISMA
19 |     package, which applies testing-based token selection and offers
20 |     replicate-aware, highly tuned non-negative matrix factorization and
21 |     principal component analysis implementations which allow the processing of
22 |     very big data sets even on desktop machines.
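URL: https://github.com/tammok/PRISMA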
23 | License: GPL (>=2.0)
24 | 

--------------------------------------------------------------------------------
/inst/extdata/sallyPreprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import sys
3 | from optparse import OptionParser
4 | 
5 | usage = "usage: %prog in.sally out.fsally"
6 | parser = OptionParser(usage)
7 | 
8 | (options, args) = parser.parse_args()
9 | 
10 | if len(args) != 2:
11 |     parser.print_help()
12 |     sys.exit()
13 | 
14 | # use the positional arguments parsed above; open() keeps the script
15 | # runnable under Python 2 and 3 (the old file() built-in is Python 2 only)
16 | sallyIn = open(args[0])
17 | sallyOut = open(args[1], "w")
18 | # skip first line
19 | sallyIn.readline()
20 | allNgrams = {}
21 | count = 0
22 | for l in sallyIn:
23 |     count += 1
24 |     if count % 1000 == 0:
25 |         print(count)
26 |     info = l.split(" ")
27 |     if info[0] == "":
28 |         curNgrams = []
29 |     else:
30 |         curNgrams = [ngramInfo.split(":")[1] for ngramInfo in info[0].split(",")]
31 |     # the dict acts as a set collecting every n-gram seen so far
32 |     allNgrams.update(dict.fromkeys(curNgrams))
33 |     sallyOut.write("%s\n" % " ".join(curNgrams))
34 | sallyOut.write("%s\n" % " ".join(allNgrams.keys()))
35 | sallyOut.close()
36 | sallyIn.close()

--------------------------------------------------------------------------------
/man/getDuplicateData.Rd:
--------------------------------------------------------------------------------
1 | \name{getDuplicateData}
2 | \alias{getDuplicateData}
3 | \title{
4 | Restores Data with Duplicates
5 | }
6 | \description{
7 | The \code{\link{loadPrismaData}} function triggers feature selection and
8 | data combination steps which subsequently remove duplicate entries for
9 | an efficient representation of the data.
10 | \code{\link{getDuplicateData}} rebuilds the data matrix with an
11 | explicit representation of all duplicate entries.
12 | }
13 | \usage{
14 | getDuplicateData(prismaData)
15 | }
16 | %- maybe also 'usage' for other objects documented here.
17 | \arguments{
18 | \item{prismaData}{
19 | prisma data loaded via \code{\link{loadPrismaData}}
20 | }
21 | }
22 | \value{
23 | \item{dataWithDuplicates}{Data matrix containing explicit copies of all duplicates.}
24 | }
25 | \author{
26 | Tammo Krueger
27 | }
28 | 
29 | \examples{
30 | data(asap)
31 | dataWithDuplicates = getDuplicateData(asap)
32 | }

--------------------------------------------------------------------------------
/man/PRISMA-package.Rd:
--------------------------------------------------------------------------------
1 | \name{PRISMA-package}
2 | \alias{PRISMA-package}
3 | \alias{PRISMA}
4 | \docType{package}
5 | \title{
6 | \packageTitle{PRISMA}
7 | }
8 | \description{
9 | \packageDescription{PRISMA}
10 | }
11 | \details{
12 | \packageDESCRIPTION{PRISMA}
13 | \packageIndices{PRISMA}
14 | }
15 | \author{
16 | \packageAuthor{PRISMA}
17 | 
18 | Maintainer: \packageMaintainer{PRISMA}
19 | }
20 | \references{
21 | Krueger, T., Gascon, H., Kraemer, N., Rieck, K. (2012)
22 | Learning Stateful Models for Network Honeypots
23 | \emph{5th ACM Workshop on Artificial Intelligence and Security (AISEC 2012)}, accepted
24 | 
25 | Krueger, T., Kraemer, N., Rieck, K. (2011)
26 | ASAP: Automatic Semantics-Aware Analysis of Network Payloads
27 | \emph{Privacy and Security Issues in Data Mining and Machine Learning - International ECML/PKDD Workshop. Lecture Notes in Computer Science 6549}, Springer. 50 - 63
28 | }
29 | \keyword{ package }
30 | \examples{
31 | # please see the vignette for examples
32 | }

--------------------------------------------------------------------------------
/man/getMatrixFactorizationLabels.Rd:
--------------------------------------------------------------------------------
1 | \name{getMatrixFactorizationLabels}
2 | \alias{getMatrixFactorizationLabels}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Convert Coordinates of a Matrix Factorization to Labels
6 | }
7 | \description{
8 | Given a matrix factorization object \eqn{A = B C}, this function returns for each
9 | document the index of the inner dimension which has the maximal
10 | coordinate. Thus, it converts the fuzzy clustering found in the
11 | columns of the \eqn{C} matrix into a hard clustering by returning the
12 | position with the maximal coordinate value.
13 | }
14 | \usage{
15 | getMatrixFactorizationLabels(prismaMF)
16 | }
17 | %- maybe also 'usage' for other objects documented here.
18 | \arguments{
19 | \item{prismaMF}{
20 | a matrix factorization object.
21 | }
22 | }
23 | \value{
24 | \item{labels}{vector containing the label assignment for each document.}
25 | }
26 | \author{
27 | Tammo Krueger
28 | }
29 | \seealso{
30 | \code{\link{prismaNMF}}
31 | }
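\examples{
# a minimal sketch on the bundled asap data; the inner dimension 8 and the
# NMF call follow the vignette (runtime shortened here, hence \dontrun):
\dontrun{
data(asap)
asapNMF = prismaNMF(asap, 8, time = 10)
table(getMatrixFactorizationLabels(asapNMF))
}
}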
--------------------------------------------------------------------------------
/man/corpusToPrisma.Rd:
--------------------------------------------------------------------------------
1 | \name{corpusToPrisma}
2 | \alias{corpusToPrisma}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Convert a tm Corpus to PRISMA
6 | }
7 | \description{
8 | Converts a tm corpus object to a PRISMA object.
9 | }
10 | \usage{
11 | corpusToPrisma(corpus, alpha = 0.05, skipFeatureCorrelation = FALSE)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{corpus}{
16 | a tm corpus
17 | }
18 | \item{alpha}{
19 | significance level for the feature tests. If NULL, all features are kept.
20 | }
21 | \item{skipFeatureCorrelation}{
22 | whether the grouping of features based on correlation analysis should be skipped.
23 | }
24 | }
25 | \value{
26 | \item{prismaData}{data object representing the tokenized documents as
27 | features x samples matrix.}
28 | }
29 | \author{
30 | Tammo Krueger
31 | }
32 | 
33 | \examples{
34 | if (require("tm") && packageVersion("tm") >= '0.6') {
35 |     data(thesis)
36 |     thesis
37 |     thesis = corpusToPrisma(thesis, NULL, TRUE)
38 |     thesis
39 | }
40 | }

--------------------------------------------------------------------------------
/man/prismaHclust.Rd:
--------------------------------------------------------------------------------
1 | \name{prismaHclust}
2 | \alias{prismaHclust}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Matrix Factorization Based on Hierarchical Clustering
6 | }
7 | \description{
8 | A matrix factorization \eqn{A = B C} based on the results of hclust is constructed,
9 | which holds the mean feature values of each cluster in the matrix \eqn{B}
10 | and, for each data point, the indicator of its cluster in the matrix \eqn{C}
11 | (i.e. each data point is represented by its assigned cluster center).
12 | }
13 | \usage{
14 | prismaHclust(prismaData, ncomp, method = "single")
15 | }
16 | %- maybe also 'usage' for other objects documented here.
17 | \arguments{
18 | \item{prismaData}{
19 | PRISMA data for which a clustering should be calculated.
20 | }
21 | \item{ncomp}{
22 | the number of components that should be extracted.
23 | }
24 | \item{method}{
25 | the method used for clustering.
26 | }
27 | }
28 | \value{
29 | \item{prismaHclust}{Matrix factorization object containing \eqn{B} and \eqn{C}
30 | resulting from the hierarchical clustering of the data.}
31 | }
32 | \author{
33 | Tammo Krueger
34 | }
35 | 
36 | \seealso{
37 | \code{\link{hclust}}
38 | }
39 | \examples{
40 | # please see the vignette for examples
41 | }

--------------------------------------------------------------------------------
/man/estimateDimension.Rd:
--------------------------------------------------------------------------------
1 | \name{estimateDimension}
2 | \alias{estimateDimension}
3 | \title{
4 | Estimate Inner Dimension
5 | }
6 | \description{
7 | Matrix factorization methods compress the original data matrix \eqn{A \in
8 | R^{f,N}} with \eqn{f} features and \eqn{N} samples into two parts,
9 | namely \eqn{A = B C} with \eqn{B \in R^{f,k}, C \in R^{k,
10 | N}}. The function estimateDimension estimates \eqn{k} based on a noise
11 | model estimated from a scrambled version of the original data matrix.
12 | }
13 | \usage{
14 | estimateDimension(prismaData, alpha = 0.05, nScrambleSamples = NULL)
15 | }
16 | \arguments{
17 | \item{prismaData}{
18 | A prismaData object loaded via loadPrismaData
19 | }
20 | \item{alpha}{
21 | Error probability for the confidence intervals
22 | }
23 | \item{nScrambleSamples}{
24 | The number of scrambled samples that should be used to estimate the
25 | noise model. NULL means use the complete data set.
26 | }
27 | }
28 | \value{
29 | \item{estDim}{prismaDimension object that can be printed and plotted.}
30 | }
31 | \references{
32 | R. Schmidt. Multiple emitter location and signal parameter estimation.
33 | \emph{IEEE Transactions on Antennas and Propagation}, 34(3):276 -- 280, 1986.
34 | }
35 | \author{
36 | Tammo Krueger
37 | }
38 | 
39 | %% ~Make other sections like Warning with \section{Warning }{....} ~
40 | 
41 | \examples{
42 | # please see the vignette for examples
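# a short runnable sketch on the bundled asap data (as run in the vignette):
data(asap)
asapDim = estimateDimension(asap)
print(asapDim)
plot(asapDim)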
43 | }

--------------------------------------------------------------------------------
/man/generics_mf.Rd:
--------------------------------------------------------------------------------
1 | \name{plot.prismaMF}
2 | \alias{plot.prismaMF}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Generics For PRISMA Matrix Factorization Objects
6 | }
7 | \description{
8 | Plot generic for PRISMA matrix factorization objects.
9 | }
10 | \usage{
11 | \method{plot}{prismaMF}(x, nLines = NULL, baseIndex = NULL, sampleIndex = NULL,
12 | minValue = NULL, noRowClustering = FALSE, noColClustering = FALSE, type
13 | = c("base", "coordinates"), ...)
14 | }
15 | %- maybe also 'usage' for other objects documented here.
16 | \arguments{
17 | 
18 | \item{x}{
19 | PRISMA matrix factorization object
20 | }
21 | \item{nLines}{
22 | number of lines that should be plotted
23 | }
24 | \item{baseIndex}{
25 | which bases should be plotted
26 | }
27 | \item{sampleIndex}{
28 | which samples should be plotted
29 | }
30 | \item{minValue}{
31 | cut-off value, i.e., every value smaller than \code{minValue} won't be shown
32 | }
33 | \item{noRowClustering}{
34 | don't cluster the rows
35 | }
36 | \item{noColClustering}{
37 | don't cluster the columns
38 | }
39 | \item{type}{
40 | show the base (\code{type = "base"}, i.e. the \eqn{B} matrix) or
41 | the coordinates (\code{type = "coordinates"}, i.e. the \eqn{C} matrix).
42 | }
43 | \item{...}{
44 | not used
45 | }
46 | }
47 | \author{
48 | Tammo Krueger
49 | }
50 | \seealso{
51 | \code{\link{estimateDimension}}, \code{\link{prismaHclust}}, \code{\link{prismaDuplicatePCA}}, \code{\link{prismaNMF}}
52 | }
53 | \examples{
54 | # please see the vignette for examples
55 | }

--------------------------------------------------------------------------------
/vignettes/PRISMA.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{krueger12,
2 |   author = {Krueger, Tammo and Gascon, Hugo and Kr\"{a}mer, Nicole and Rieck, Konrad},
3 |   title = {Learning stateful models for network honeypots},
4 |   booktitle = {Proceedings of the 5th ACM workshop on Security and artificial intelligence},
5 |   series = {AISec '12},
6 |   year = {2012},
7 |   isbn = {978-1-4503-1664-4},
8 |   pages = {37--48},
9 |   numpages = {12},
10 |   note = {\url{http://doi.acm.org/10.1145/2381896.2381904}},
11 |   doi = {10.1145/2381896.2381904},
12 |   publisher = {ACM},
13 | }
14 | 
15 | @inproceedings{krueger10,
16 |   year={2011},
17 |   isbn={978-3-642-19895-3},
18 |   booktitle={Privacy and Security Issues in Data Mining and Machine Learning},
19 |   volume={6549},
20 |   series={Lecture Notes in Computer Science},
21 |   editor={Dimitrakakis, Christos and Gkoulalas-Divanis, Aris and Mitrokotsa, Aikaterini and Verykios, Vassilios S. and Saygin, Y\"{u}cel},
22 |   doi={10.1007/978-3-642-19896-0_5},
23 |   title={{ASAP}: Automatic Semantics-Aware Analysis of Network Payloads},
24 |   note = {\url{http://dx.doi.org/10.1007/978-3-642-19896-0_5}},
25 |   publisher={Springer Berlin Heidelberg},
26 |   author={Krueger, Tammo and Kr\"{a}mer, Nicole and Rieck, Konrad},
27 |   pages={50-63}
28 | }
29 | 
30 | @phdthesis{krueger2013,
31 |   title={Probabilistic Methods for Network Security. From Analysis to Response},
32 |   author={Krueger, Tammo},
33 |   year={2013},
34 |   school={TU Berlin},
35 |   note = {\url{http://opus.kobv.de/tuberlin/volltexte/2013/3881/}}
36 | }
37 | 
38 | @Article{feinerer08,
39 |   title = {Text Mining Infrastructure in {R}},
40 |   author = {Ingo Feinerer and Kurt Hornik and David Meyer},
41 |   year = 2008,
42 |   journal = {Journal of Statistical Software},
43 |   volume = 25,
44 |   number = 5,
45 |   pages = {1--54},
46 |   url = {http://www.jstatsoft.org/v25/i05/},
47 |   month = {March},
48 | }

--------------------------------------------------------------------------------
/man/loadPrismaData.Rd:
--------------------------------------------------------------------------------
1 | \name{loadPrismaData}
2 | \alias{loadPrismaData}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Load PRISMA Data Files
6 | }
7 | \description{
8 | Loads files generated by the sally tool (see
9 | \url{http://www.mlsec.org/sally/}) and represents the data as a binary
10 | token/ngrams x documents matrix. After loading, statistical tests are
11 | applied to find features which are neither volatile nor
12 | constant. Co-occurring features are grouped to further compactify the
13 | data. See \code{system.file("extdata","sallyPreprocessing.py",
14 | package="PRISMA")} for a Python script which generates the
15 | corresponding .fsally file from a .sally file and reduces the
16 | loading time of \code{\link{loadPrismaData}} considerably.
17 | }
18 | \usage{
19 | loadPrismaData(path, maxLines = -1, fastSally = TRUE,
20 | alpha = 0.05, skipFeatureCorrelation=FALSE)
21 | }
22 | %- maybe also 'usage' for other objects documented here.
23 | \arguments{
24 | \item{path}{
25 | path of the data file without the .sally extension. loadPrismaData loads
26 | path.sally or path.fsally depending on the fastSally switch.
27 | }
28 | \item{maxLines}{
29 | maximal number of lines to read from the data file. -1 means to read
30 | all lines.
31 | }
32 | \item{fastSally}{
33 | whether the fsally file should be used; this drastically decreases the loading time.
34 | }
35 | \item{alpha}{
36 | significance level for the feature tests. If NULL, all features are kept.
37 | }
38 | \item{skipFeatureCorrelation}{
39 | whether the grouping of features based on correlation analysis should be skipped.
40 | }
41 | }
42 | \value{
43 | \item{prismaData}{data object representing the tokenized documents as
44 | features x samples matrix.}
45 | }
46 | \references{
47 | See \url{http://www.mlsec.org/sally/} for the sally utility.
48 | }
49 | \author{
50 | Tammo Krueger
51 | }
52 | \examples{
53 | # please see the vignette for examples
54 | # please see system.file("extdata","asap.tar.gz", package="PRISMA") for
55 | # an example sally output
56 | }

--------------------------------------------------------------------------------
/man/prismaNMF.Rd:
--------------------------------------------------------------------------------
1 | \name{prismaNMF}
2 | \alias{prismaNMF}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Matrix Factorization Based on Replicate-Aware NMF
6 | }
7 | \description{
8 | Matrix factorization \eqn{A = B C} with strictly positive matrices \eqn{B, C}
9 | which minimize the reconstruction error \eqn{\|A - B C\|}. This
10 | replicate-aware version of the non-negative matrix factorization (NMF)
11 | is based on the alternating least squares
12 | approach and exploits the replicate information to speed up the calculation.
13 | }
14 | \usage{
15 | prismaNMF(prismaData, ncomp, time = 60, pca.init = TRUE, doNorm = TRUE, oldResult = NULL)
16 | }
17 | %- maybe also 'usage' for other objects documented here.
18 | \arguments{
19 | \item{prismaData}{
20 | PRISMA data for which an NMF should be calculated.
21 | }
22 | \item{ncomp}{
23 | either an \code{integer} or a \code{prismaDimension} object specifying
24 | the inner dimension of the matrix factorization.
25 | }
26 | \item{time}{
27 | seconds after which the calculation should end.
28 | }
29 | \item{pca.init}{
30 | should the \eqn{B} matrix be initialized by a PCA.
31 | }
32 | \item{doNorm}{
33 | should the \eqn{B} matrix be normalized (i.e. all columns have
34 | Euclidean length 1).
35 | }
36 | \item{oldResult}{
37 | re-use the results of a previous run, i.e. \eqn{B} and \eqn{C} are
38 | pre-initialized with the values of this previous matrix
39 | factorization object.
40 | }
41 | }
42 | \value{
43 | \item{prismaNMF}{Matrix factorization object containing the \eqn{B} and
44 | \eqn{C} matrix.}
45 | }
46 | \references{
47 | Krueger, T., Gascon, H., Kraemer, N., Rieck, K. (2012)
48 | Learning Stateful Models for Network Honeypots
49 | \emph{5th ACM Workshop on Artificial Intelligence and Security (AISEC 2012)}, accepted
50 | 
51 | R. Albright, J. Cox, D. Duling, A. Langville, and C. Meyer. (2006)
52 | Algorithms, initializations, and convergence for the nonnegative
53 | matrix factorization.
\emph{Technical Report 81706, North Carolina State University}
54 | }
55 | \author{
56 | Tammo Krueger
57 | }
58 | \examples{
59 | # please see the vignette for examples
60 | }

--------------------------------------------------------------------------------
/R/dimensionEstimation.R:
--------------------------------------------------------------------------------
1 | if (getRversion() >= "2.15.1") globalVariables(c("low", "up"))
2 | # public methods
3 | estimateDimension = function(prismaData, alpha=0.05, nScrambleSamples=NULL) {
4 |     N = length(prismaData$remapper)
5 |     pca = prismaDuplicatePCA(prismaData)
6 |     remapper = prismaData$remapper
7 |     if (!is.null(nScrambleSamples)) {
8 |         remapper = sample(remapper, nScrambleSamples)
9 |     }
10 |     spca = scramblePCA(scrambleFeature(prismaData$data[, remapper]))
11 |     nVal = min(c(length(pca$pca$sdev), length(spca$pca$sdev)))
12 |     # Bonferroni correction:
13 |     alpha = alpha / nVal
14 | 
15 |     calcConfidence = function(sdev) {
16 |         v = sdev^2
17 |         tau = sqrt(2/(N - 1))
18 |         z = qnorm(1-alpha/2)
19 |         d1 = sqrt(1 + tau * z)
20 |         d2 = sqrt(1 - tau * z)
21 |         conf = cbind(v, v / d1, v / d2)
22 |         return(conf)
23 |     }
24 |     cNorm = calcConfidence(pca$pca$sdev[1:nVal])
25 |     sNorm = calcConfidence(spca$pca$sdev[1:nVal])
26 |     data = data.frame(rbind(cbind(1:nVal, cNorm), cbind(1:nVal, sNorm)), rep(c("norm", "scramble"), c(nVal, nVal)), row.names=as.character(1:(2*nVal)))
27 |     colnames(data) = c("x", "var", "low", "up", "class")
28 | 
29 |     norm = data$low[data$class == "norm"]
30 |     scramble = data$up[data$class == "scramble"]
31 |     dim = 2 * (match(TRUE, norm <= scramble) - 1)
32 |     if (dim == 0) {
33 |         warning("Not enough data for reasonable dimension estimation. Please adjust $dim according to your fallback heuristic!")
34 |     }
35 |     ret = list(data=data, dim=dim, pca=pca)
36 |     class(ret) = "prismaDimension"
37 |     return(ret)
38 | }
39 | 
40 | print.prismaDimension = function(x, ...) {
41 |     cat("Estimated data dimension for positive matrix factorization via simulated noise level:", x$dim, "\n")
42 | }
43 | 
44 | plot.prismaDimension = function(x, ...) {
45 |     dimData=x
46 |     #require(ggplot2)
47 |     data = dimData$data
48 |     p = ggplot(data, aes(x=x, y=var, ymin=low, ymax=up, color=class))
49 |     p + geom_errorbar(width=2) + geom_line()
50 | }
51 | 
52 | # private methods
53 | 
54 | scramblePCA = function(mat) {
55 |     # old version without duplicate information!
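    # (a PCA of the feature-scrambled matrix yields the eigenvalue spectrum of
    #  pure noise, against which estimateDimension compares the real spectrum)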
56 |     pca = prcomp(t(mat), scale=FALSE, retx=FALSE)
57 |     B = pca$rotation
58 |     #C = t(pca$x)
59 |     ret = list(B=B, C=NULL, pca=pca)
60 |     return(ret)
61 | }
62 | 
63 | scrambleFeature = function(mat) {
64 |     #require(Matrix)
65 |     N = ncol(mat)
66 |     F = nrow(mat)
67 |     if (inherits(mat, "Matrix")) {
68 |         p = mat@p
69 |         newI = rep(0, length(mat@i))
70 |         # scramble the features of all data points
71 |         for (ind in 1:N) {
72 |             if (p[ind+1]-p[ind] > 0) {
73 |                 newI[(p[ind]+1):p[ind+1]] = sample(F, p[ind+1]-p[ind], replace=FALSE) - 1
74 |             }
75 |         }
76 |         ret = sparseMatrix(i=newI, p=p, x=mat@x, dims=c(F, N), dimnames=dimnames(mat), index1=FALSE)
77 |     }
78 |     else {
79 |         ret = mat
80 |         # scramble the features of all data points
81 |         for (ind in 1:N) {
82 |             ret[, ind] = ret[sample.int(F), ind]
83 |         }
84 |     }
85 |     return(ret)
86 | }
87 | 

--------------------------------------------------------------------------------
/vignettes/PRISMA.Rnw:
--------------------------------------------------------------------------------
1 | \documentclass[a4paper]{article}
2 | 
3 | \usepackage[margin=2.25cm]{geometry}
4 | \usepackage{xspace}
5 | %%\usepackage[round]{natbib}
6 | \usepackage[colorlinks=true,urlcolor=blue]{hyperref}
7 | 
8 | \newcommand{\code}[1]{\texttt{#1}}
9 | \newcommand{\pkg}[1]{{\it #1}}
10 | \newcommand{\Prisma}{\pkg{PRISMA}\xspace}
11 | \SweaveOpts{keep.source=TRUE, strip.white=all}
12 | %% \VignetteIndexEntry{Quick introduction}
13 | 
14 | <<echo=FALSE>>=
15 | if (!exists("PRISMA",.GlobalEnv)) library(PRISMA)
16 | @
17 | 
18 | \begin{document}
19 | \title{Introduction to the \Prisma package}
20 | \author{Tammo Krueger}
21 | \date{\today\\[1cm]
22 | \url{https://github.com/tammok/PRISMA}}
23 | \maketitle
24 | 
25 | \section*{Introduction}
26 | 
27 | This vignette gives you a first tour of the features of the \Prisma
28 | package. We give an overview of the application of the algorithms;
29 | the full story is available in the papers
30 | \cite{krueger12,krueger10}. If you use the \Prisma package in your
31 | research, please cite at least one of these references.
32 | 
33 | The \Prisma package essentially consists of three parts:
34 | \begin{enumerate}
35 | \item Efficiently reading the output of \code{sally}, an extremely fast n-gram
36 | processor available at \url{http://www.mlsec.org/sally/}
37 | \item Testing-based feature dimension reduction
38 | \item Optimized matrix factorization of the reduced data, exploiting
39 | the replicate structure of the data
40 | \end{enumerate}
41 | 
42 | For the theory behind these parts please consult
43 | \cite{krueger12,krueger10}. We start this walk-through with the
44 | reading of \code{sally} data, then show the inner structure of the
45 | resulting data object, on which the replicate-aware non-negative matrix
46 | factorization can be applied.
47 | 
48 | \section*{Loading the Data}
49 | This section serves as a reference for how to apply the processing
50 | chain to new data to get a usable \Prisma data set. The generated
51 | data set is already prepackaged inside the \Prisma package and can be
52 | loaded via \code{data(asap)}.
53 | 
54 | Before executing the examples, please extract asap.tar.gz located in
55 | the \code{extdata} path of the \Prisma package to find all data
56 | necessary to understand the processing chain from the raw data
57 | (asap.raw) to the sally file (asap.sally) and the optimized file
58 | (asap.fsally). The asap.sally file can be produced as follows:
59 | \begin{verbatim}
60 | sally -c asap.cfg asap.raw asap.sally
61 | \end{verbatim}
62 | 
63 | This call generates asap.sally from the raw data found in asap.raw. To
64 | speed up the loading of the data in R, one should apply the
65 | \code{sallyPreprocessing.py} Python script as follows:
66 | 
67 | \begin{verbatim}
68 | python sallyPreprocessing.py asap.sally asap.fsally
69 | \end{verbatim}
70 | 
71 | Now the data is ready to be efficiently loaded and processed in R via
72 | \code{loadPrismaData("asap")}, which also executes the feature
73 | dimension reduction step.
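The call sketched below spells out the defaults of this loading step
(see \code{?loadPrismaData} for the meaning of the individual parameters):
\begin{verbatim}
asap = loadPrismaData("asap", maxLines=-1, fastSally=TRUE,
                      alpha=0.05, skipFeatureCorrelation=FALSE)
\end{verbatim}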
74 | 
75 | \section*{The \Prisma Data Set}
76 | 
77 | As an example we use the prepackaged ASAP toy data set as described in \cite{krueger10}:
78 | <<>>=
79 | data(asap)
80 | asap
81 | @
82 | We see that the feature reduction step worked quite well. Let's have a
83 | look behind the scenes:
84 | <<>>=
85 | asap$data
86 | @
87 | This shows us the reduced form of the initial data matrix in a
88 | features $\times$ documents representation, i.e. this is a replicate-free
89 | version of it. We can see that the features partly consist of grouped
90 | tokens (for instance \code{admin.php par action} contains 3 tokens,
91 | which always co-occurred in the data) and how these tokens are present
92 | in the different documents. We can see the initial tokens before the
93 | grouping and their corresponding group assignment in the \code{group} variable:
94 | <<>>=
95 | asap$group
96 | @
97 | 
98 | The member variable \code{unprocessed} contains the initial data matrix
99 | before the feature selection and grouping step. If we want to
100 | reconstruct all replicates in the reduced feature space, we need the
101 | \code{getDuplicateData} function:
102 | <<>>=
103 | dim(getDuplicateData(asap))
104 | dim(asap$unprocessed)
105 | @
106 | This will blow up the reduced matrix to the full 10,000 initial data
107 | points in the reduced feature space. To see how often a specific
108 | entry in the reduced data matrix was present, we can have a look at
109 | the duplicate count:
110 | <<>>=
111 | asap$duplicatecount
112 | sum(asap$duplicatecount)
113 | @
114 | \section*{The Replicate-Aware Non-Negative Matrix Factorization (NMF)}
115 | 
116 | The replicate-aware NMF is a matrix factorization method which
117 | describes the data according to a new base vector system, i.e. each
118 | data point is described as a weighted sum of these base vectors. Thus,
119 | the base vectors can be seen as the parts of which a document is
120 | constructed. Furthermore, the new coordinates of a document (the base
121 | weights) can also be interpreted as a soft clustering. But before we
122 | can apply the NMF we need to specify the inner dimension of the
123 | factorization. This can either be supplied as a number (which should
124 | be even, if \code{pca.init} is \code{TRUE}), or as a
125 | \code{prismaDimension} object generated by the fully automated
126 | dimension estimation method:
127 | <<>>=
128 | asapDim = estimateDimension(asap)
129 | asapDim
130 | @
131 | Equipped with this object, we can now apply the NMF to the data:
132 | \begin{verbatim}
133 | > asapNMF = prismaNMF(asap, asapDim, time=60)
134 | Error: 3771.392
135 | Error: 3113.138
136 | Error: 2855.863
137 | Error: 2810.286
138 | Error: 2765.763
139 | Error: 2755.29
140 | Error: 2752.505
141 | > asapLabels = getMatrixFactorizationLabels(asapNMF)
142 | > table(asapLabels)
143 | asapLabels
144 |    1    2    3    4    5    6    7    8
145 |  623  607  602  660 1696 2473  817 2522
146 | \end{verbatim}
147 | We can look at the results via \code{plot(asapNMF)}, which is shown in
148 | Figure \ref{fig:asap}. We can see that the NMF extracts a
149 | \code{search} template, then the four \code{admin.php}-action
150 | templates, a Firefox template and two \code{static} templates, which
151 | reproduces the results in \cite{krueger10}, Section 3.1, with added
152 | user agents as ``noise''.
153 | 
154 | \begin{figure}[tb]
155 | \centering
156 | \includegraphics{asap}
157 | \caption{Result of the replicate-aware NMF on the \code{asap} data set.}
158 | \label{fig:asap}
159 | \end{figure}
160 | 
161 | \section*{Interface to the \pkg{tm} Package}
162 | 
163 | To allow the application of the replicate-aware NMF to corpora
164 | generated by the \pkg{tm} package \cite{feinerer08}, the \Prisma
165 | package contains a converter function which maps a \pkg{tm} corpus
166 | object to a \Prisma data object. We exemplify this procedure with an
167 | already stemmed and cleansed version of the 15 subsections of
168 | \cite{krueger2013}:
169 | 
170 | \begin{verbatim}
171 | > data(thesis)
172 | > thesis
173 | A corpus with 15 text documents
174 | > thesis = corpusToPrisma(thesis, NULL, TRUE)
175 | > thesis
176 | PRISMA data tm-Corpus
177 | Unprocessed data: # features: 2002 # entries: 15
178 | Processed data: # features: 2002 # entries: 15
179 | > thesisNMF = prismaNMF(thesis, 3, pca.init=FALSE)
180 | Error: 1329.73
181 | Error: 1310.481
182 | Error: 1295.959
183 | Error: 1295.509
184 | \end{verbatim}
185 | 
186 | Since we have just 15 documents, the feature reduction step and the
187 | correlation analysis suffer from too little data, which also holds
188 | true for the PCA-based initialization scheme. Thus, we skip all these
189 | processing steps and apply the NMF directly to the data with three
190 | components as an educated
191 | guess.
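Analogous to the asap analysis, a hard clustering of the sections can be
obtained from the fitted factorization (sketch, output omitted):
\begin{verbatim}
> thesisLabels = getMatrixFactorizationLabels(thesisNMF)
> table(thesisLabels)
\end{verbatim}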
To analyze the result we look at the top 20 words of the 192 | resulting base matrix: 193 | 194 | \begin{verbatim} 195 | > isQuantile = (t(thesisNMF$B) > apply(thesisNMF$B, 2, quantile, prob=.99)) 196 | > maxFeatures = apply(isQuantile, 1, function(r) which(r == 1)) 197 | > rownames(thesis$data)[maxFeatures[, 1]] 198 | [1] "add" "align" "associ" "cluster" "communic" "correct" 199 | [7] "extract" "fill" "format" "inner" "machin" "messag" 200 | [13] "obvious" "preserv" "reflect" "return" "simul" "templat" 201 | [19] "trace" "transit" "tri" 202 | > rownames(thesis$data)[maxFeatures[, 2]] 203 | [1] "behavior" "chang" "configur" "crossvalid" "drop" 204 | [6] "fast" "figur" "follow" "lead" "learn" 205 | [11] "lower" "observ" "optim" "overal" "procedur" 206 | [16] "process" "relat" "shown" "speed" "statist" 207 | [21] "use" 208 | > rownames(thesis$data)[maxFeatures[, 3]] 209 | [1] "addit" "applic" "approach" "attack" "base" "construct" 210 | [7] "content" "exploit" "method" "model" "network" "normal" 211 | [13] "protocol" "server" "similar" "simpl" "structur" "techniqu" 212 | [19] "token" "traffic" "use" 213 | \end{verbatim} 214 | 215 | These word stems accurately describe the contents of the three 216 | chapters of \cite{krueger2013} which concludes the analysis of this 217 | section. 218 | 219 | \bibliographystyle{plain} 220 | \bibliography{PRISMA} 221 | \end{document} 222 | -------------------------------------------------------------------------------- /R/matrixFactorization.R: -------------------------------------------------------------------------------- 1 | # public methods 2 | 3 | getMatrixFactorizationLabels = function(prismaMF) { 4 | labels = apply(prismaMF$C, 2, which.max) 5 | return(labels[prismaMF$remapper]) 6 | } 7 | 8 | prismaHclust = function(prismaData, ncomp, method="single") { 9 | mat = prismaData$data 10 | d = dist(t(mat), "binary") 11 | clust = hclust(d, method) 12 | labels = cutree(clust, k=ncomp) 13 | 14 | label2ind = split(1:length(labels), labels) 15 | B = sapply(label2ind, function(ind) apply(mat[, ind], 1, mean)) 16 | one = function(where) { 17 | ret = rep(0, ncomp) 18 | ret[where] = 1 19 | return(ret) 20 | } 21 | C = sapply(labels, function(l) one(l)) 22 | ret = list(B=B, C=C) 23 | rownames(ret$B) = rownames(mat) 24 | colnames(ret$B) = as.character(1:ncomp) 25 | colnames(ret$C) = colnames(mat) 26 | ret$type = "hclust" 27 | ret$remapper = prismaData$remapper 28 | class(ret) = "prismaMF" 29 | return(ret) 30 | } 31 | 32 | prismaDuplicatePCA = function(prismaData) { 33 | pca = sparsePCA(sparseCov(prismaData)) 34 | ret = list(B=pca$loadings, C=pca$scores, pca=pca) 35 | ret$type = "DuplicatePCA" 36 | ret$remapper = prismaData$remapper 37 | class(ret) = "prismaMF" 38 | return(ret) 39 | } 40 | 41 | prismaNMF = function(prismaData, ncomp, time=60, pca.init=TRUE, doNorm=TRUE, oldResult=NULL) { 42 | mat = prismaData$data 43 | B = NULL 44 | if (!(is.null(oldResult))) { 45 | B = oldResult$B 46 | k = ncol(B) 47 | } 48 | else if (pca.init) { 49 | # genBase duplicates the input, therefore we just take half of the components 50 | if (class(ncomp) == "prismaDimension") { 51 | k = ncomp$dim %/% 2 52 | B = genBase(ncomp$pca$B[, 1:k]) 53 | } 54 | else { 55 | pca = prismaDuplicatePCA(prismaData) 56 | k = ncomp %/% 2 57 | B = genBase(pca$B[, 1:k]) 58 | } 59 | k = 2*k 60 | } 61 | else { 62 | k = ncomp 63 | } 64 | weights = prismaData$duplicatecount 65 | ret = pmf(mat, k, calcTime=time, B=B, doNorm=doNorm, weights=weights) 66 | 67 | rownames(ret$B) = rownames(mat) 68 | 69 | ret$remapper = 
prismaData$remapper 70 | colnames(ret$C) = colnames(mat) 71 | class(ret) = "prismaMF" 72 | return(ret) 73 | } 74 | 75 | plot.prismaMF = function(x, nLines=NULL, baseIndex=NULL, sampleIndex=NULL, minValue=NULL, noRowClustering=FALSE, noColClustering=FALSE, type=c("base", "coordinates"), ...) { 76 | mf=x 77 | type = match.arg(type) 78 | if (type == "base") { 79 | B = mf$B 80 | if (!is.null(minValue)) { 81 | B[B < minValue] = 0 82 | } 83 | plotMatrixFactor(B, nLines, baseIndex, noRowClustering, noColClustering) 84 | } 85 | else if (type == "coordinates") { 86 | C = mf$C 87 | if (!is.null(sampleIndex)) { 88 | C = C[, sampleIndex] 89 | } 90 | if (!is.null(minValue)) { 91 | C[C < minValue] = 0 92 | } 93 | plotMatrixFactor(t(C), nLines, baseIndex) 94 | } 95 | else { 96 | stop("Unknown plot type!") 97 | } 98 | } 99 | 100 | # private methods 101 | 102 | sparseCov = function(prismaData) { 103 | N = length(prismaData$remapper) 104 | k = nrow(prismaData$data) 105 | x = rep(NA, k*k) 106 | # efficient mean calculation 107 | fmean = colSums(t(prismaData$data) * prismaData$duplicatecount) / N 108 | scount = sqrt(prismaData$duplicatecount) 109 | centeredData = t(prismaData$data - fmean) 110 | centered = centeredData * scount 111 | theCov = new("dsyMatrix", Dim = c(k, k), x=as.numeric(t(centered) %*% centered / (N - 1))) 112 | dimnames(theCov) = list(rownames(prismaData$data), rownames(prismaData$data)) 113 | return(list(cov=theCov, centeredData=centeredData, center=fmean)) 114 | } 115 | 116 | sparsePCA = function(sparsecov) { 117 | # here we emulate the princom method... some parts of it are "reused" here 118 | cl = match.call() 119 | cl[[1L]] = as.name("sparsePCA") 120 | cv = sparsecov$cov 121 | z = sparsecov$centeredData 122 | n.obs = nrow(z) 123 | cen = sparsecov$center 124 | edc = eigen(cv, symmetric = TRUE) 125 | ev = edc$values 126 | evec = edc$vectors 127 | if (any(neg <- ev < 0)) { 128 | # throw away negative eigenvalues 129 | pos = which(!neg) 130 | ev = ev[pos] 131 | evec = evec[, pos] 132 | } 133 | cn = paste0("Comp.", 1L:ncol(evec)) 134 | names(ev) = cn 135 | dimnames(evec) = list(dimnames(cv)[[2L]], cn) 136 | sdev = sqrt(ev) 137 | scr = t(z %*% evec) 138 | edc = list(sdev = sdev, loadings=evec, 139 | center = cen, n.obs = n.obs, scores = scr, call = cl) 140 | class(edc) = "princomp" 141 | return(edc) 142 | } 143 | 144 | reconstructSparsePCA = function(spca) { 145 | rec = spca$loadings %*% spca$scores + spca$center 146 | return(rec) 147 | } 148 | 149 | plotMatrixFactor = function(B, n.lines=NULL, base.index=NULL, noRowClustering=FALSE, noColClustering=FALSE) { 150 | #require(gplots) 151 | B = as.matrix(B) 152 | if (!is.null(base.index)) { 153 | B = B[, base.index] 154 | } 155 | if (!is.null(n.lines)) { 156 | B = calcLinesPerThreshold(B, n.lines) 157 | } 158 | if (noRowClustering) { 159 | row.clust = NULL 160 | dendrogram = "column" 161 | } 162 | else { 163 | row.clust = as.dendrogram(hclust(dist(B, method="euclidean"), method="complete")) 164 | dendrogram = "both" 165 | } 166 | if (noColClustering) { 167 | col.clust = NULL 168 | dendrogram = ifelse(dendrogram == "both", "row", "none") 169 | } 170 | else { 171 | col.clust = as.dendrogram(hclust(dist(t(B), method="binary"), method="complete")) 172 | } 173 | breaks = c(0, seq(min(B[B>0])-1e-9, max(B), length=15)) 174 | heatmap.2(B, Rowv=row.clust, Colv=col.clust, dendrogram=dendrogram, trace="none", breaks=breaks, col=function(n) gray(c(1, seq(0.8, 0, length=n-1)))) 175 | } 176 | 177 | genBase = function(B) { 178 | negB = -B 179 | negB[negB < 0] = 
0 180 | mat = cbind(negB, B + negB) 181 | colnames(mat) = c(paste(colnames(B), "neg", sep="."), paste(colnames(B), "pos", sep=".")) 182 | return(mat) 183 | } 184 | 185 | normBase = function(ret) { 186 | r = ncol(ret$B) 187 | nfeats = nrow(ret$B) 188 | # norm the basis 189 | norms = sqrt(apply(ret$B^2, 2, sum)) 190 | # look for all-zero base 191 | allZero = (norms == 0) 192 | ret$B = ret$B[, !allZero] 193 | ret$C = ret$C[!allZero, ] 194 | r = r - sum(allZero) 195 | ret$B = ret$B / rep(norms[!allZero], rep(nfeats, r)) 196 | ret$C = ret$C * norms[!allZero] 197 | return(ret) 198 | } 199 | 200 | calcDatacluster = function(ret) { 201 | labels = apply(ret$C, 2, which.max) 202 | return(labels) 203 | } 204 | 205 | calcLinesPerThreshold = function(B, n.lines) { 206 | allVals = unique(sort(B, decreasing=TRUE)) 207 | allMax = apply(B, 1, max) 208 | lines = sapply(allVals, function(v) sum(allMax > v)) 209 | min.value = allVals[which.min(lines <= n.lines)-1] 210 | B = B[apply(B, 1, function(r) any(r > min.value)), ] 211 | return(B) 212 | } 213 | 214 | # pimped speed by crossprod... 215 | # see Least Squares Calculations in R by D.M. Bates in R News, 4/1:17-20 216 | RRbyCV = function(Y, D, fold=5, lambdas=10^(-4:2), weights=NULL) { 217 | # Y is the data assumed to be [# samples X # vars] 218 | N = nrow(Y) 219 | F = ncol(D) 220 | Nfold = floor(N / fold) 221 | res = matrix(0, length(lambdas), fold, dimnames=list(as.character(lambdas), NULL)) 222 | if (!is.null(weights)) { 223 | sweights = sqrt(weights) 224 | } 225 | for (l in lambdas) { 226 | for (f in 1:fold) { 227 | index = ((f-1)*Nfold + 1):(f * Nfold) 228 | Sub = D[-index, ] 229 | # estimate the coefficients on the subsample 230 | if (is.null(weights)) { 231 | #Beta1 = solve(t(Sub) %*% Sub + diag(l, F)) %*% t(Sub) %*% Y[-index, ] 232 | Beta2 = solve(crossprod(Sub) + diag(l, F), crossprod(Sub, Y[-index, ])) 233 | #cat(" No weights", round(mean(abs(Beta1 - Beta2)), 6)) 234 | Beta = Beta2 235 | res[as.character(l), f] = sqrt(sum((Y[index, ] - (D[index, ] %*% Beta))^2)) 236 | } 237 | else { 238 | #W = Diagonal(x=weights[-index]) 239 | #Beta1 = solve(t(Sub) %*% W %*% Sub + diag(l, F)) %*% t(Sub) %*% W %*% Y[-index, ] 240 | W = Diagonal(x=sweights[-index]) 241 | WSub = W %*% Sub 242 | WY = W %*% Y[-index, ] 243 | Beta2 = solve(crossprod(WSub) + diag(l, F), crossprod(WSub, WY)) 244 | #cat(" Weights", round(mean(abs(Beta1 - Beta2)), 6)) 245 | Beta = Beta2 246 | res[as.character(l), f] = sqrt(sum((Y[index, ] - (D[index, ] %*% Beta))^2)) 247 | } 248 | } 249 | } 250 | return(lambdas[which.min(apply(res, 1, mean))]) 251 | } 252 | 253 | pmf = function(A, r, calcTime, B=NULL, doNorm=TRUE, weights=NULL) { 254 | #require(Matrix) 255 | # A should contain the samples in the cols! 256 | nsamples = ncol(A) 257 | nfeats = nrow(A) 258 | if (is.null(weights)) { 259 | W = Diagonal(nsamples) 260 | weights = rep(1, nsamples) 261 | # SW = W 262 | } 263 | else { 264 | W = Diagonal(nsamples, weights) 265 | # SW = Diagonal(nsamples, sqrt(weights)) 266 | } 267 | # the new basis 268 | if (is.null(B)) { 269 | B = abs(matrix(rnorm(nfeats * r), nfeats, r)) 270 | } 271 | olderror = Inf 272 | iter = 0 273 | startTime = proc.time()[3] 274 | while (TRUE) { 275 | lambda = RRbyCV(A, B, weights=NULL) 276 | #S = t(B) %*% B + diag(lambda, r, r) 277 | S = crossprod(B) + diag(lambda, r, r) 278 | 279 | # faster? 
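    # (solve(S, crossprod(B, A)) solves the ridge system
    #  (t(B) %*% B + lambda*I) %*% C = t(B) %*% A in one step without forming
    #  the transpose explicitly, cf. the Bates reference above)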
280 | C = solve(S, crossprod(B, A)) 281 | # Cold = solve(S, t(B) %*% A) 282 | # cat(" No weights", round(mean(abs(C - Cold)), 6)) 283 | # C = solve(S) %*% (t(B) %*% A) 284 | # set all negative coordinates to 0 285 | C[C < 0] = 0 286 | 287 | lambda = RRbyCV(t(A), t(C), weights=weights) 288 | #WtC = SW %*% t(C) 289 | #S = crossprod(WtC) + diag(lambda, r, r) 290 | S = C %*% W %*% t(C) + diag(lambda, r, r) 291 | 292 | # faster? 293 | #B = t(solve(S, crossprod(WtC, SW %*% t(A)))) 294 | B = t(solve(S, C %*% W %*% t(A))) 295 | #B = (A %*% W %*% t(C)) %*% solve(S) 296 | B[B < 0] = 0 297 | if (doNorm) { 298 | # norm the basis 299 | norms = sqrt(apply(B^2, 2, sum)) 300 | # look for all-zero base 301 | allZero = (norms == 0) 302 | B = B[, !allZero, drop=FALSE] 303 | C = C[!allZero, , drop=FALSE] 304 | r = r - sum(allZero) 305 | B = B / rep(norms[!allZero], rep(nfeats, r)) 306 | C = C * norms[!allZero] 307 | } 308 | iter = iter + 1 309 | timeElapsed = proc.time()[3] - startTime 310 | if (iter %% 10 == 0) { 311 | error = .5 * sum(colSums((A - B %*% C)^2) * weights) 312 | cat("Error:", error, "\n") 313 | if (abs(olderror - error) < 1e-9) { 314 | break 315 | } 316 | olderror = error 317 | } 318 | if (timeElapsed >= calcTime) { 319 | break 320 | } 321 | } 322 | ret = list(B=B, C=C) 323 | return(ret) 324 | } 325 | -------------------------------------------------------------------------------- /R/prisma.R: -------------------------------------------------------------------------------- 1 | # public functions: 2 | loadPrismaData = function(path, maxLines=-1, fastSally=TRUE, alpha=.05, skipFeatureCorrelation=FALSE) { 3 | data = readPrismaInput(path, maxLines, fastSally) 4 | data = preprocessPrismaData(data, alpha, skipFeatureCorrelation) 5 | data$path = path 6 | class(data) = "prisma" 7 | return(data) 8 | } 9 | 10 | getDuplicateData = function(prismaData) { 11 | return(prismaData$data[, prismaData$remapper]) 12 | } 13 | 14 | corpusToPrisma = function(corpus, alpha=.05, skipFeatureCorrelation=FALSE) { 15 | #require(Matrix) 16 | if (requireNamespace("tm", quietly = TRUE) && packageVersion("tm") >= '0.6') { 17 | #require(tm) 18 | tdm = tm::TermDocumentMatrix(corpus) 19 | data = list(data=Matrix(as.matrix(tdm))) 20 | data = preprocessPrismaData(data, alpha, skipFeatureCorrelation) 21 | data$path = "tm-Corpus" 22 | class(data) = "prisma" 23 | return(data) 24 | } 25 | else { 26 | stop("Need package tm (>='0.6')") 27 | } 28 | } 29 | 30 | 31 | print.prisma = function(x, ...) { 32 | prismaData=x 33 | cat("PRISMA data", prismaData$path, "\n") 34 | cat("Unprocessed data: # features:", nrow(prismaData$unprocessed), 35 | "# entries:", ncol(prismaData$unprocessed), "\n") 36 | cat("Processed data: # features:", nrow(prismaData$data), 37 | "# entries:", ncol(prismaData$data), "\n") 38 | } 39 | 40 | plot.prisma = function(x, ...) 
{
41 |     prismaData=x
42 |     image(prismaData$data)
43 | }
44 | 
45 | # private functions:
46 | readFSally = function(path, maxLines=-1) {
47 |     #require(Matrix)
48 |     f = file(path)
49 |     cat("Reading data...\n")
50 |     data = readLines(f)
51 |     cat("Splitting ngrams...\n")
52 |     ngrams = strsplit(data, " ", fixed=TRUE)
53 |     total = length(data)
54 |     allNgrams = ngrams[[total]]
55 |     close(f)
56 |     cat("Calc indices...\n")
57 |     indices = match(unlist(ngrams[-total]), allNgrams)
58 |     cat("Setup matrix...\n")
59 |     N = total-1
60 |     mat = sparseMatrix(indices, rep(1:N, sapply(ngrams[-total], length)),
61 |                        x=1,
62 |                        dims=c(length(allNgrams), N),
63 |                        dimnames=list(allNgrams, paste("line", 1:N, sep="")))
64 |     if (maxLines > 0) {
65 |         return(mat[, 1:maxLines])
66 |     }
67 |     else {
68 |         return(mat)
69 |     }
70 | }
71 | 
72 | readSally = function(path, maxLines=-1) {
73 |     #require(Matrix)
74 |     f = file(path)
75 |     data = scan(f, what="char", sep=" ", quote="", quiet=TRUE, comment.char="", skip=1, nlines=maxLines)
76 |     close(f)
77 |     rawngrams = data[c(TRUE, FALSE)]
78 |     origin = data[c(FALSE, TRUE)]
79 |     processNgram = function(cv) {
80 |         ret = cv[3]
81 |         names(ret) = cv[2]
82 |         return(ret)
83 |     }
84 |     ngrams = lapply(strsplit(rawngrams, ",", fixed=TRUE), function(obj) sapply(strsplit(obj, ":", fixed=TRUE), processNgram))
85 |     allNgrams = unique(unlist(lapply(ngrams, function(ngram) names(ngram)), use.names=FALSE))
86 |     indices = unlist(lapply(ngrams, function(ngram) match(names(ngram), allNgrams)), use.names=FALSE)
87 |     # generate a matrix in ml-style: rows are the features, cols are the samples
88 |     mat = sparseMatrix(indices, rep(1:length(ngrams), sapply(ngrams, length)), x= as.numeric(unlist(ngrams, use.names=FALSE)), dims=c(length(allNgrams), length(ngrams)), dimnames=list(allNgrams, origin))
89 |     return(mat)
90 | }
91 | 
92 | readHarry = function(path, maxLines=-1) {
93 |     harry = read.table(path, sep="\t", quote="", comment.char="",
94 |                        as.is=TRUE, header=TRUE, nrows=maxLines)
95 |     return(harry)
96 | }
97 | 
98 | readRaw = function(path, maxLines=-1) {
99 |     f = file(path)
100 |     raw = readLines(f, n=maxLines)
101 |     close(f)
102 |     #rawsplit = strsplit(raw, " ", fixed=TRUE)
103 |     return(raw)
104 | }
105 | 
106 | readPrismaInput = function(path, maxLines=-1, fastSally=TRUE) {
107 |     if (fastSally) {
108 |         sally = readFSally(sprintf("%s.fsally", path), maxLines)
109 |     }
110 |     else {
111 |         sally = readSally(sprintf("%s.sally", path), maxLines)
112 |     }
113 |     data = list(data=sally)
114 |     hfile = sprintf("%s.harry", path)
115 |     if (file.exists(hfile) && file.access(hfile, mode=4) == 0) { # file.access() returns 0 on success
116 |         data$annotation = readHarry(hfile, maxLines)
117 |     }
118 |     rfile = sprintf("%s.rawquoted", path)
119 |     if (file.exists(rfile) && file.access(rfile, mode=4) == 0) { # 0 == readable
120 |         data$raw = readRaw(rfile, maxLines)
121 |     }
122 |     return(data)
123 | }
124 | 
125 | duplicateRemover = function(data) {
126 |     if (inherits(data, "Matrix")) {
127 |         classes = calcClassForSparseMatrix(data)
128 |     }
129 |     else {
130 |         classes = sapply(1:ncol(data), function(colIndex) paste(which(data[, colIndex] == 1), collapse=" "))
131 |     }
132 |     classCount = table(classes)
133 |     uniqueClasses = names(classCount)
134 |     # just pick the first data point for each class:
135 |     classIndex = sapply(uniqueClasses, function(cl) match(cl, classes))
136 |     data = data[, classIndex]
137 |     remapper = sapply(classes, function(cl) match(cl, uniqueClasses))
138 |     return(list(data=data, remapper=remapper, count=classCount))
139 | }
140 | 
141 | calcClassForSparseMatrix = function(data) {
142 |     i
= data@i 143 | dp = c(0, diff(data@p)) 144 | csdp = cumsum(dp) 145 | oneClass = function(index) { 146 | from = csdp[index]+1 147 | to = csdp[index+1] 148 | if (from > to) { 149 | # zero entry 150 | return("") 151 | } 152 | else { 153 | return(paste(i[from:to], collapse=" ")) 154 | } 155 | } 156 | sapply(1:ncol(data), oneClass) 157 | } 158 | 159 | preprocessPrismaData =function(data, alpha=.05, skipFeatureCorrelation=FALSE) { 160 | data$unprocessed = data$data 161 | processed = filterDataByTestAndCor(data$data, alpha, skipFeatureCorrelation) 162 | duplicatesRemoved = duplicateRemover(processed$mat) 163 | data$data = duplicatesRemoved$data 164 | data$remapper = duplicatesRemoved$remapper 165 | data$duplicatecount = as.vector(duplicatesRemoved$count) 166 | 167 | data$group = processed$group 168 | data$occAlways = processed$always 169 | data$occNever = processed$never 170 | 171 | return(data) 172 | } 173 | 174 | count2freq = function(mat) { 175 | # use the samples x features view for simpler calculation 176 | mat = t(mat) 177 | return(t(mat / rowSums(mat))) 178 | } 179 | 180 | count2bin = function(mat) { 181 | #require(Matrix) 182 | if (inherits(mat, "TsparseMatrix")) { 183 | ret = mat 184 | } 185 | else if (inherits(mat, "CsparseMatrix")) { 186 | ret = sparseMatrix(mat@i+1, p=mat@p, x=1, dims=mat@Dim, dimnames=mat@Dimnames) 187 | } 188 | else { 189 | ret = as.matrix(mat) 190 | ret[ret > 0] = 1 191 | } 192 | return(ret) 193 | } 194 | 195 | groupCorrelatedNgrams = function(data) { 196 | nfeats = nrow(data) 197 | ndocs = ncol(data) 198 | toCheck = 1:nfeats 199 | groups = rep(-1, nfeats) 200 | groupCount = 1 201 | # is it possible to calculate correlations on sparse matrices? 202 | #mat = as.matrix(data) 203 | mat = data 204 | while (length(toCheck) > 0) { 205 | cat("to check:", length(toCheck), "\n") 206 | if (length(toCheck) == 1) { 207 | curCor = 1 208 | } 209 | else { 210 | curCor = sparse.cor(mat[toCheck, ]) 211 | } 212 | group = toCheck[curCor == 1] 213 | groups[group] = groupCount 214 | groupCount = groupCount + 1 215 | toCheck = toCheck[curCor != 1] 216 | #cat(data$str[group], "\n") 217 | } 218 | return(groups) 219 | } 220 | 221 | sparse.cor <- function(X){ 222 | docsWithFeature = (X[1, ] != 0) 223 | onDocs = sum(docsWithFeature) 224 | offDocs = ncol(X) - onDocs 225 | ret = rep(0, nrow(X)) 226 | ret[1] = 1 227 | if (onDocs >= 1) { 228 | onFeatureDocs = X[, docsWithFeature] 229 | offFeatureDocs = X[, !docsWithFeature] 230 | if (onDocs > 1) { 231 | # we have more than one document for this feature... 232 | # so calculate the number of documents for this feature 233 | onFeatureDocs = rowSums(onFeatureDocs) 234 | } 235 | if (offDocs > 1) { 236 | offFeatureDocs = rowSums(offFeatureDocs) 237 | } 238 | # just set the correlation to one, if the number of 239 | # documents, in which the feature is turned of, is zero 240 | # and the number of documents, in which the feature is on, is the same 241 | ret[(offFeatureDocs == 0) & (onFeatureDocs == onDocs)] = 1 242 | } 243 | return(ret) 244 | } 245 | 246 | compressByGroup = function(data) { 247 | features = rownames(data) 248 | groups = groupCorrelatedNgrams(data) 249 | indByG = split(1:length(groups), groups) 250 | names(groups) = features 251 | newDimNames = sapply(indByG, function(g) paste(features[g], collapse=" ")) 252 | # just keep the first feature of the group... 
253 | # since the rest contains the same information (cor=1) 254 | data = data[sapply(indByG, function(g) g[1]), ] 255 | rownames(data) = newDimNames 256 | return(list(data=data, group=groups)) 257 | } 258 | 259 | # data should be binary and unnormalized! 260 | # hmmm... the "normal" testing weirdness of thinking-negative: 261 | # never = ttestNgrams(data, 0, "greater") 262 | # we would keep these... 263 | # data$str[p.adjust(never, "bonf") < 0.05] 264 | ## [1] "\nAcc" "\nHos" " */*" " HTT" " cgi" " www" "*/*\n" ".1\nH" ".com" 265 | ## [10] ".foo" ".php" "/1.1" "/sea" "1\nHo" "1.1\n" ": */" ": ww" "Acce" 266 | ## [19] "ET c" "GET " "HTTP" "Host" "P/1." "T cg" "TP/1" "TTP/" "ar.c" 267 | ## [28] "arch" "bar." "ccep" "cept" "cgi/" "ch.p" "com\n" "earc" "ept:" 268 | ## [37] "foob" "gi/s" "h.ph" "hp?s" "i/se" "m\nAc" "obar" "om\nA" "ooba" 269 | ## [46] "ost:" "p?s=" "php?" "pt: " "r.co" "rch." "sear" "st: " "t: *" 270 | ## [55] "t: w" "w.fo" "ww.f" "www." "&par" "/adm" "=ren" "?act" "acti" 271 | ## [64] "admi" "ame&" "ctio" "dmin" "e&pa" "enam" "gi/a" "hp?a" "i/ad" 272 | ## [73] "in.p" "ion=" "me&p" "min." "n.ph" "n=re" "name" "on=r" "p?ac" 273 | ## [82] "par=" "rena" "tion" " sta" ".htm" "ET s" "T st" "atic" "html" 274 | ## [91] "l HT" "ml H" "stat" "tati" "tic/" "tml " "=mov" "move" "n=mo" 275 | ## [100] "on=m" "ove&" "ve&p" "=sho" "how&" "n=sh" "on=s" "ow&p" "show" 276 | ## [109] "w&pa" "=del" "dele" "elet" "ete&" "lete" "n=de" "on=d" "te&p" 277 | ## [118] "G HT" 278 | # always = ttestNgrams(data, 1, "less") 279 | # ...and drop these... 280 | # data$str[p.adjust(always, "bonf") > 0.05] 281 | ## [1] "\nAcc" "\nHos" " */*" " HTT" " www" "*/*\n" ".1\nH" ".com" ".foo" 282 | ## [10] "/1.1" "1\nHo" "1.1\n" ": */" ": ww" "Acce" "GET " "HTTP" "Host" 283 | ## [19] "P/1." "TP/1" "TTP/" "ar.c" "bar." "ccep" "cept" "com\n" "ept:" 284 | ## [28] "foob" "m\nAc" "obar" "om\nA" "ooba" "ost:" "pt: " "r.co" "st: " 285 | ## [37] "t: *" "t: w" "w.fo" "ww.f" "www." 286 | # So finally just keep these: 287 | # data$str[p.adjust(always, "bonf") < 0.05 & p.adjust(never, "bonf") < 0.05] 288 | ttestNgrams = function(data, mu, alternative=c("greater", "less")) { 289 | #require(Matrix) 290 | alternative <- match.arg(alternative) 291 | N = ncol(data) 292 | nfeats = nrow(data) 293 | muNgram = rowMeans(data) * N 294 | # some sources give 5, other 10 as a factor, of when the normal approx. works... 295 | # we just take the average here. 
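    # (i.e. we demand an expected count of about 7.5 hits before trusting the
    #  Gaussian approximation of the binomial distribution used below)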
296 | mu = ifelse(mu == 0, 7.5/N, 1 - (7.5/N)) 297 | theVar = sqrt(N * mu * (1 - mu)) 298 | M = mu * N 299 | if (alternative == "greater") { 300 | pValues = sapply(muNgram, function(m) pnorm((m - M) / theVar, lower.tail = FALSE)) 301 | } 302 | if (alternative == "less") { 303 | pValues = sapply(muNgram, function(m) pnorm((m - M) / theVar, lower.tail = TRUE)) 304 | } 305 | return(pValues) 306 | } 307 | 308 | filterDataByTestAndCor = function(data, alpha=0.05, skipFeatureCorrelation=FALSE) { 309 | data = count2bin(data) 310 | if (is.null(alpha)) { 311 | #keep = (alwaysP != 1) 312 | keep = rep(TRUE, nrow(data)) 313 | } 314 | else { 315 | never = ttestNgrams(data, 0, "greater") 316 | always = ttestNgrams(data, 1, "less") 317 | 318 | alwaysP = p.adjust(always, "holm") 319 | neverP = p.adjust(never, "holm") 320 | keep = (alwaysP < alpha & neverP < alpha) 321 | } 322 | allStr = rownames(data) 323 | fdata = data[keep, ] 324 | if (skipFeatureCorrelation) { 325 | features = rownames(fdata) 326 | groups = 1:length(features) 327 | names(groups) = features 328 | dataAndGroup =list(data=fdata, group=groups) 329 | } 330 | else { 331 | dataAndGroup = compressByGroup(fdata) 332 | } 333 | if (is.null(alpha)) { 334 | #always = allStr[(alwaysP == 1)] 335 | always = c() 336 | never = c() 337 | } 338 | else { 339 | always = allStr[(alwaysP >= alpha)] 340 | never = allStr[(neverP >= alpha)] 341 | } 342 | return(list(mat=dataAndGroup$data, group=dataAndGroup$group, always=always, never=never)) 343 | } 344 | --------------------------------------------------------------------------------