.text. Timestamps should be formatted according to this format: YYYY-MM-DD HH:mm:ss (e.g. 2009-11-01 00:01:24)
42 |
43 | Time-slices are expected to be numbered starting from 0 and files are expected to be named with 8 digits (e.g. 00000000.text, 00000000.time, 00000001.text, 00000001.time)
44 |
45 | Parameter Setting
46 | -----------------
47 |
48 | All the parameters are set in the parameters.txt file:
49 |
50 | 1. prepareCorpus (boolean): if you are running MABED for the first time, or if the content of the input directory has been modified, this parameter should be set to 'true', otherwise 'false'.
51 | 2. timeSliceLength (int): length of each time-slice, expressed in minutes (e.g. 30);
52 | 3. numberOfThreads (int): the number of threads used by MABED (if > 1, then the parallelized implementation of MABED is executed)
53 | 4. k (int): desired number of events (e.g. 40);
54 | 5. p (int): maximum number of related words describing each event (e.g. 10);
55 | 6. theta (double): minimum weight of each related word (e.g. 0.7);
56 | 7. sigma (double): merging threshold (e.g. 0.5);
57 | 8. stopwords (String): name of the file that lists the stopwords, one word per line (e.g. stopwords.txt);
58 | 9. minSupport (double): minimum support of words in the vocabulary (e.g. 0)
59 | 10. maxSupport (double): maximum support of words in the vocabulary (e.g. 1)
60 |
61 | Running the program
62 | -------------------
63 |
64 | - Requirements: JAVA (7+)
65 | - Execute the program MABED.jar with the following command: "java -jar MABED.jar -run". It should process the input and save the output in the "output/" directory.
66 |
--------------------------------------------------------------------------------
/lib/Indexer.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/lib/Indexer.jar
--------------------------------------------------------------------------------
/lib/commons-io-2.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/lib/commons-io-2.4.jar
--------------------------------------------------------------------------------
/lib/commons-math3-3.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/lib/commons-math3-3.2.jar
--------------------------------------------------------------------------------
/lib/gs-algo-1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/lib/gs-algo-1.2.jar
--------------------------------------------------------------------------------
/lib/gs-core-1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/lib/gs-core-1.2.jar
--------------------------------------------------------------------------------
/lib/lucene-analyzers-common-4.10.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/lib/lucene-analyzers-common-4.10.2.jar
--------------------------------------------------------------------------------
/lib/lucene-core-4.10.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/lib/lucene-core-4.10.2.jar
--------------------------------------------------------------------------------
/parameters.txt:
--------------------------------------------------------------------------------
1 | # If you are running MABED for the first time, or if the content of the input directory has been modified, this parameter should be set to 'true', otherwise 'false'
2 | prepareCorpus = true
3 | # The length of each time-slice, expressed in minutes, e.g. 30
4 | timeSliceLength = 30
5 | # Number of threads to use
6 | numberOfThreads = 12
7 | # MABED parameters
8 | k = 40
9 | p = 10
10 | theta = 0.7
11 | sigma = 0.5
12 | # List of stopwords that are removed from the vocabularies
13 | stopwords = stopwords.txt
14 | # Adjust support to speed up the first phase
15 | minSupport = 0.001
16 | maxSupport = 0.01
17 |
--------------------------------------------------------------------------------
/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdrienGuille/MABED/b237855fb4f6ddebfb1ebf50fcbb43e1140b00e0/screenshot.png
--------------------------------------------------------------------------------
/src/cc/mallet/util/Util.java:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
2 | This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
3 | http://www.cs.umass.edu/~mccallum/mallet
4 | This software is provided under the terms of the Common Public License,
5 | version 1.0, as published by http://www.opensource.org. For further
6 | information, see the file `LICENSE' included with this distribution. */
7 |
8 | package cc.mallet.util;
9 |
10 | /**
11 | *
12 | *
13 | * @author Charles Sutton
14 | * @version $Id: ArrayUtils.java,v 1.1 2007/10/22 21:37:40 mccallum Exp $
15 | */
public class Util {

    /** Natural logarithm of 2; dividing a natural-log quantity by it yields base-2 units. */
    public static final double log2 = Math.log(2);

    /**
     * Returns the Jensen-Shannon divergence of two distributions: the mean of
     * the KL divergences of each input from their element-wise average.
     *
     * @param p1 first distribution
     * @param p2 second distribution; asserted to have the same length as {@code p1}
     * @return the divergence, in base-2 units
     */
    public static double jensenShannonDivergence(double[] p1, double[] p2) {
        assert (p1.length == p2.length);
        final int size = p1.length;
        final double[] midpoint = new double[size];
        for (int idx = 0; idx < size; idx++) {
            midpoint[idx] = (p1[idx] + p2[idx]) / 2;
        }
        final double fromFirst = klDivergence(p1, midpoint);
        final double fromSecond = klDivergence(p2, midpoint);
        return (fromFirst + fromSecond) / 2;
    }

    /**
     * Returns the KL divergence K(p1 || p2), with the logarithm taken in base 2.
     *
     * Positions where either {@code p1} or {@code p2} is zero contribute
     * nothing to the sum, so a zero in {@code p2} does not make the result
     * infinite.
     *
     * @param p1 reference distribution
     * @param p2 distribution compared against {@code p1}
     * @return the accumulated divergence, in base-2 units
     */
    public static double klDivergence(double[] p1, double[] p2) {
        double total = 0.0;
        for (int idx = 0; idx < p1.length; idx++) {
            // p2 is only read when p1 is non-zero at this position
            if (p1[idx] != 0 && p2[idx] != 0.0) {
                total += p1[idx] * Math.log(p1[idx] / p2[idx]);
            }
        }
        // convert from natural log to base 2 once, outside the loop
        return total / log2;
    }
}
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/algo/Component1.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.algo;
19 |
20 | import static fr.ericlab.mabed.algo.MABED._SMOOTH_;
21 | import fr.ericlab.mabed.structure.Corpus;
22 | import fr.ericlab.mabed.structure.Event;
23 | import fr.ericlab.mabed.structure.EventList;
24 | import fr.ericlab.mabed.structure.TimeInterval;
25 | import fr.ericlab.util.Util;
26 | import java.util.ArrayList;
27 | import java.util.LinkedList;
28 |
29 | /**
30 | *
31 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
32 | * @email adrien.guille@univ-lyon2.fr
33 | */
34 | public class Component1 extends Thread {
35 | public EventList events = new EventList();
36 | Corpus corpus;
37 | int from;
38 | int to;
39 | int minTermOccur;
40 | int maxTermOccur;
41 | int threadId;
42 |
/**
 * Creates a phase-1 worker over a slice of the mention vocabulary.
 *
 * @param id  identifier stored as {@code threadId} and printed when the worker finishes
 * @param c   corpus whose vocabulary and frequencies are read by {@link #run()}
 * @param a   first vocabulary index processed by {@link #run()} (inclusive)
 * @param b   last vocabulary index processed by {@link #run()} (inclusive)
 * @param min stored as {@code minTermOccur}; {@link #run()} compares a term's total global frequency against it
 * @param max stored as {@code maxTermOccur}; presumably the matching upper bound (confirm in {@link #run()})
 */
public Component1(int id, Corpus c, int a, int b, int min, int max){
    this.threadId = id;
    this.corpus = c;
    this.from = a;
    this.to = b;
    this.minTermOccur = min;
    this.maxTermOccur = max;
}
51 |
52 | float expectation(int timeSlice, float tmf){
53 | return corpus.distribution[timeSlice]*(tmf/corpus.messageCount);
54 | }
55 |
56 | float anomaly(float expectation, float real){
57 | return real - expectation;
58 | }
59 |
60 | @Override
61 | public void run() {
62 | int m = corpus.nbTimeSlices;
63 | for(int t = from; t <= to; t++){
64 | String term = corpus.mentionVocabulary.get(t);
65 | float[] gf, mf;
66 | gf = Util.toFloatArray(corpus.getGlobalFrequency(term));
67 | mf = Util.toFloatArray(corpus.getMentionFrequency(t));
68 | int tmf = (int)Util.sum(mf,0,m-1);
69 | int tgf = (int)Util.sum(gf,0,m-1);
70 | if(tgf>minTermOccur && tgf 0){
73 | mf = Util.smoothArray(mf, _SMOOTH_);
74 | }
75 | float scoreSequence[] = new float[m];
76 | for(int i = 0; i < m; i++){
77 | expectation = expectation(i,tmf);
78 | scoreSequence[i] = anomaly(expectation, mf[i]);
79 | }
80 | LinkedList I = new LinkedList<>();
81 | LinkedList L = new LinkedList<>();
82 | LinkedList R = new LinkedList<>();
83 | ArrayList anomaly = new ArrayList<>();
84 | for(int i = 0; i < m; i++){
85 | anomaly.add(scoreSequence[i]>0?scoreSequence[i]:0);
86 | if(scoreSequence[i]>0){
87 | int k = I.size();
88 | float Lk = 0, Rk = Util.sum(scoreSequence,0,i);
89 | if(i>0){
90 | Lk = Util.sum(scoreSequence,0,i-1);
91 | }
92 | int j = 0;
93 | boolean foundJ = false;
94 | for(int l=k-1; l>=0 && !foundJ; l--){
95 | if(L.get(l)0){
119 | TimeInterval maxI = I.get(0);
120 | for(TimeInterval Ii : I){
121 | if(Util.sum(scoreSequence,Ii.timeSliceA,Ii.timeSliceB)>Util.sum(scoreSequence,maxI.timeSliceA,maxI.timeSliceB)){
122 | maxI.timeSliceA = Ii.timeSliceA;
123 | maxI.timeSliceB = Ii.timeSliceB;
124 | }
125 | }
126 | double score = Util.sum(scoreSequence,I.get(0).timeSliceA,I.get(0).timeSliceB);
127 | events.add(new Event(term,maxI,score,anomaly));
128 | }
129 | }
130 | }
131 | System.out.println(" - number of detected events (thread "+threadId+"): "+events.size());
132 | }
133 | }
134 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/algo/Component2.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.algo;
19 |
20 | import fr.ericlab.mabed.structure.Corpus;
21 | import fr.ericlab.mabed.structure.Event;
22 | import fr.ericlab.mabed.structure.WeightedTerm;
23 | import indexer.Indexer;
24 | import java.util.ArrayList;
25 |
26 | /**
27 | *
28 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
29 | * @email adrien.guille@univ-lyon2.fr
30 | */
31 | public class Component2 extends Thread {
32 | Corpus corpus;
33 | Event basicEvent;
34 | int candidateWordSetSize;
35 | double theta;
36 | Event refinedEvent;
37 | int threadId;
38 |
/**
 * Creates a phase-2 worker that refines a single basic event.
 *
 * @param id identifier stored as {@code threadId}
 * @param c  corpus queried while refining the event
 * @param be basic event to refine
 * @param p  stored as {@code candidateWordSetSize}, the number of candidate words requested
 * @param t  stored as {@code theta}, the minimum weight a candidate word needs to be kept
 */
public Component2(int id, Corpus c, Event be, int p, double t){
    this.threadId = id;
    this.corpus = c;
    this.basicEvent = be;
    this.candidateWordSetSize = p;
    this.theta = t;
}
46 |
47 | double getErdemCoefficient(short[] ref, short[] comp, int a, int b){
48 | double scores1[] = new double[b-a+1], scores2[] = new double[b-a+1];
49 | for(int i = a; i <= b; i++){
50 | scores1[i-a] = ref[i];
51 | scores2[i-a] = comp[i];
52 | }
53 | double result;
54 | double A12 = 0, A1 = 0, A2 = 0;
55 | for(int i=2;i candidateWords = indexer.getMostFrequentWords(corpus.getMessages(basicEvent),basicEvent.mainTerm,candidateWordSetSize);
71 | short ref[] = corpus.getGlobalFrequency(basicEvent.mainTerm);
72 | short comp[];
73 | refinedEvent = new Event(basicEvent.mainTerm, basicEvent.I, basicEvent.score, basicEvent.anomaly);
74 | for(String word : candidateWords){
75 | comp = corpus.getGlobalFrequency(word);
76 | double w = getErdemCoefficient(ref, comp, basicEvent.I.timeSliceA, basicEvent.I.timeSliceB);
77 | if(w >= theta){
78 | refinedEvent.relatedTerms.add(new WeightedTerm(word,w));
79 | }
80 | }
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/algo/MABED.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.algo;
19 |
20 | import fr.ericlab.mabed.app.Configuration;
21 | import fr.ericlab.mabed.structure.EventList;
22 | import fr.ericlab.mabed.structure.Corpus;
23 | import fr.ericlab.mabed.structure.WeightedTerm;
24 | import fr.ericlab.mabed.structure.Event;
25 | import fr.ericlab.mabed.structure.TimeInterval;
26 | import fr.ericlab.util.Util;
27 | import fr.ericlab.mabed.structure.EventGraph;
28 | import indexer.Indexer;
29 | import java.text.DecimalFormat;
30 | import java.util.ArrayList;
31 | import java.util.LinkedList;
32 |
33 | /**
34 | *
35 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
36 | * @email adrien.guille@univ-lyon2.fr
37 | */
38 | final public class MABED {
39 | // stopwords
40 | LinkedList stopWords = new LinkedList<>();
41 |
42 | // dataset
43 | public Corpus corpus;
44 |
45 | // algo
46 | double maximumScore;
47 | static int _SMOOTH_ = 4;
48 | static int _MIN_RELATED_WORDS_ = 2;
49 | public String info;
50 |
51 | // results
52 | public EventList events;
53 | public EventGraph eventGraph;
54 |
55 | public String applyCentralized(Corpus c, Configuration configuration){
56 | corpus = c;
57 | info = " - minimum support for main terms: "+configuration.minSupport+"
- maximum support for main terms: "+configuration.maxSupport+"
- maximum number of related terms: "+configuration.p+"
- minimum weight for related terms: "+configuration.theta;
58 | String output = " - min suppport = "+configuration.minSupport+", max support = "+configuration.maxSupport+", p = "+configuration.p+", theta = "+configuration.theta+", sigma = "+configuration.sigma+"\n";
59 |
60 | stopWords = Util.readStopWords(configuration.stopwords);
61 | System.out.println(Util.getDate()+" Loaded stopwords:\n - filename: "+configuration.stopwords+"\n - number of words: "+stopWords.size());
62 |
63 | // Get basic events
64 | long startP1 = Util.getTime();
65 | EventList basicEvents = getSimpleEvents((int)(configuration.minSupport*corpus.messageCount), (int)(configuration.maxSupport*corpus.messageCount));
66 | basicEvents.sort();
67 | long endP1 = Util.getTime();
68 |
69 | // Get final events
70 | System.out.println(Util.getDate()+" Selecting related terms ("+configuration.k+" events with at most "+configuration.p+" related terms)");
71 | int nbFinalEvents = 0;
72 | int i = 0;
73 | long startP2 = Util.getTime();
74 | if(basicEvents.size() > 0){
75 | eventGraph = new EventGraph(corpus, basicEvents.get(0).score, configuration.sigma);
76 | System.out.print(" - k: ");
77 | while(nbFinalEvents < configuration.k && i < basicEvents.size()){
78 | Event event = getRefinedEvent(corpus, basicEvents.get(i), configuration.p, configuration.theta);
79 | if(event.relatedTerms.size() >= _MIN_RELATED_WORDS_){
80 | int previousNb = nbFinalEvents;
81 | nbFinalEvents += eventGraph.addEvent(event);
82 | if(nbFinalEvents > previousNb){
83 | System.out.print(" "+nbFinalEvents);
84 | }
85 | }
86 | i++;
87 | }
88 | long endP2 = Util.getTime();
89 | System.out.println();
90 | long startP3 = Util.getTime();
91 | mergeRedundantEvents(eventGraph);
92 | events = eventGraph.toEventList();
93 | long endP3 = Util.getTime();
94 | double p1 = (double)(endP1-startP1)/(double)1000, p2 = (double)(endP2-startP2)/(double)1000, p3 = (double)(endP3-startP3)/(double)1000;
95 | DecimalFormat df = new DecimalFormat("#.00");
96 | System.out.println(Util.getDate()+" Computation time: "+df.format(p1)+"s + "+df.format(p2)+"s + "+df.format(p3)+"s = "+df.format(p1+p2+p3)+"s");
97 | output += " - computation time: "+df.format(p1)+"s + "+df.format(p2)+"s + "+df.format(p3)+"s = "+df.format(p1+p2+p3)+"s\n";
98 | }
99 | return output;
100 | }
101 |
102 | public String applyParallelized(Corpus d, Configuration configuration) throws InterruptedException{
103 | corpus = d;
104 | info = " - minimum support for main terms: "+configuration.minSupport+"
- maximum support for main terms: "+configuration.maxSupport+"
- maximum number of related terms: "+configuration.p+"
- minimum weight for related terms: "+configuration.theta;
105 | String output = " - min suppport = "+configuration.minSupport+", max support = "+configuration.maxSupport+", p = "+configuration.p+", theta = "+configuration.theta+", sigma = "+configuration.sigma+"\n";
106 |
107 | stopWords = Util.readStopWords(configuration.stopwords);
108 | System.out.println(Util.getDate()+" Loaded stopwords:\n - filename: "+configuration.stopwords+"\n - number of words: "+stopWords.size());
109 |
110 | // Phase 1
111 | long startP1 = Util.getTime();
112 | System.out.println(Util.getDate()+" Detecting events based on mention anomaly...");
113 | LinkedList c1Threads = new LinkedList<>();
114 | int numberOfWordsPerThread = corpus.mentionVocabulary.size()/configuration.numberOfThreads;
115 | for(int i = 0; i < configuration.numberOfThreads; i++){
116 | int upperBound = (i==configuration.numberOfThreads-1)?corpus.mentionVocabulary.size()-1:numberOfWordsPerThread*(i+1);
117 | c1Threads.add(new Component1(i,corpus,numberOfWordsPerThread*i+1,upperBound,(int)(configuration.minSupport*corpus.messageCount),(int)(configuration.maxSupport*corpus.messageCount)));
118 | c1Threads.get(i).start();
119 | }
120 | for(Component1 c1 : c1Threads){
121 | c1.join();
122 | }
123 | EventList basicEvents = new EventList();
124 | for(Component1 c1 : c1Threads){
125 | basicEvents.addAll(c1.events);
126 | }
127 | basicEvents.sort();
128 | c1Threads.clear();
129 | System.out.println(" - number of detected events (total): "+basicEvents.size());
130 | long endP1 = Util.getTime();
131 |
132 | // Phase 2
133 | System.out.println(Util.getDate()+" Selecting related terms ("+configuration.k+" events with at most "+configuration.p+" related terms)");
134 | int nbFinalEvents = 0;
135 | int i = 0;
136 | long startP2 = Util.getTime();
137 | if(basicEvents.size() > 0){
138 | eventGraph = new EventGraph(corpus, basicEvents.get(0).score, configuration.sigma);
139 | System.out.print(" - k: ");
140 | while(nbFinalEvents < configuration.k && i < basicEvents.size()-configuration.numberOfThreads){
141 | int numberOfC2Threads = ((configuration.k - nbFinalEvents)<=configuration.numberOfThreads)?(configuration.k-nbFinalEvents):configuration.numberOfThreads;
142 | Event[] refinedEvents = new Event[numberOfC2Threads];
143 | LinkedList c2Threads = new LinkedList<>();
144 | for(int j = 0; j < numberOfC2Threads; j++){
145 | c2Threads.add(new Component2(j,corpus,basicEvents.get(i+j),configuration.p,configuration.theta));
146 | c2Threads.get(j).start();
147 | }
148 | for(Component2 c2 : c2Threads){
149 | c2.join();
150 | }
151 | for(Component2 c2 : c2Threads){
152 | refinedEvents[c2.threadId] = c2.refinedEvent;
153 | }
154 | for(Event refinedEvent : refinedEvents){
155 | if(refinedEvent.relatedTerms.size() >= _MIN_RELATED_WORDS_){
156 | int previousNb = nbFinalEvents;
157 | nbFinalEvents += eventGraph.addEvent(refinedEvent);
158 | if(nbFinalEvents > previousNb){
159 | System.out.print(" "+nbFinalEvents);
160 | }
161 | }
162 | i++;
163 | }
164 | }
165 | long endP2 = Util.getTime();
166 | System.out.println();
167 | long startP3 = Util.getTime();
168 | mergeRedundantEvents(eventGraph);
169 | events = eventGraph.toEventList();
170 | long endP3 = Util.getTime();
171 | double p1 = (double)(endP1-startP1)/(double)1000, p2 = (double)(endP2-startP2)/(double)1000, p3 = (double)(endP3-startP3)/(double)1000;
172 | DecimalFormat df = new DecimalFormat("#.00");
173 | System.out.println(Util.getDate()+" Computation time: "+df.format(p1)+"s + "+df.format(p2)+"s + "+df.format(p3)+"s = "+df.format(p1+p2+p3)+"s");
174 | output += " - computation time: "+df.format(p1)+"s + "+df.format(p2)+"s + "+df.format(p3)+"s = "+df.format(p1+p2+p3)+"s\n";
175 | }
176 | return output;
177 | }
178 |
179 | float expectation(int timeSlice, float tmf){
180 | return corpus.distribution[timeSlice]*(tmf/corpus.messageCount);
181 | }
182 |
183 | float anomaly(float expectation, float real){
184 | return real - expectation;
185 | }
186 |
187 | double getErdemCoefficient(short[] ref, short[] comp, int a, int b){
188 | double scores1[] = new double[b-a+1], scores2[] = new double[b-a+1];
189 | for(int i = a; i <= b; i++){
190 | scores1[i-a] = ref[i];
191 | scores2[i-a] = comp[i];
192 | }
193 | double result;
194 | double A12 = 0, A1 = 0, A2 = 0;
195 | for(int i=2;i candidateWords = indexer.getMostFrequentWords(corpus.getMessages(basicEvent),basicEvent.mainTerm,p);
210 | short ref[] = corpus.getGlobalFrequency(basicEvent.mainTerm);
211 | short comp[];
212 | refinedEvent = new Event(basicEvent.mainTerm, basicEvent.I, basicEvent.score, basicEvent.anomaly);
213 | for(String word : candidateWords){
214 | comp = corpus.getGlobalFrequency(word);
215 | double w = getErdemCoefficient(ref, comp, basicEvent.I.timeSliceA, basicEvent.I.timeSliceB);
216 | if(w >= theta){
217 | refinedEvent.relatedTerms.add(new WeightedTerm(word,w));
218 | }
219 | }
220 | return refinedEvent;
221 | }
222 |
223 | EventList getSimpleEvents(int minTermOccur, int maxTermOccur){
224 | System.out.println(Util.getDate()+" Detecting events based on mention anomaly...");
225 | EventList simpleEvents = new EventList();
226 | int m = corpus.nbTimeSlices;
227 | for(int t = 0; t < corpus.mentionVocabulary.size(); t++){
228 | String term = corpus.mentionVocabulary.get(t);
229 | float[] gf, mf;
230 | gf = Util.toFloatArray(corpus.getGlobalFrequency(term));
231 | mf = Util.toFloatArray(corpus.getMentionFrequency(t));
232 | int tmf = (int)Util.sum(mf,0,m-1);
233 | int tgf = (int)Util.sum(gf,0,m-1);
234 | if(tgf>minTermOccur && tgf 0){
237 | mf = Util.smoothArray(mf, _SMOOTH_);
238 | }
239 | float scoreSequence[] = new float[m];
240 | for(int i = 0; i < m; i++){
241 | expectation = expectation(i,tmf);
242 | scoreSequence[i] = anomaly(expectation, mf[i]);
243 | }
244 | LinkedList I = new LinkedList<>();
245 | LinkedList L = new LinkedList<>();
246 | LinkedList R = new LinkedList<>();
247 | ArrayList anomaly = new ArrayList<>();
248 | for(int i = 0; i < m; i++){
249 | anomaly.add(scoreSequence[i]>0?scoreSequence[i]:0);
250 | if(scoreSequence[i]>0){
251 | int k = I.size();
252 | float Lk = 0, Rk = Util.sum(scoreSequence,0,i);
253 | if(i>0){
254 | Lk = Util.sum(scoreSequence,0,i-1);
255 | }
256 | int j = 0;
257 | boolean foundJ = false;
258 | for(int l=k-1; l>=0 && !foundJ; l--){
259 | if(L.get(l)0){
283 | TimeInterval maxI = I.get(0);
284 | for(TimeInterval Ii : I){
285 | if(Util.sum(scoreSequence,Ii.timeSliceA,Ii.timeSliceB)>Util.sum(scoreSequence,maxI.timeSliceA,maxI.timeSliceB)){
286 | maxI.timeSliceA = Ii.timeSliceA;
287 | maxI.timeSliceB = Ii.timeSliceB;
288 | }
289 | }
290 | double score = Util.sum(scoreSequence,I.get(0).timeSliceA,I.get(0).timeSliceB);
291 | simpleEvents.add(new Event(term,maxI,score,anomaly));
292 | }
293 | }
294 | }
295 | System.out.println(" - number of detected events: "+simpleEvents.size());
296 | simpleEvents.sort();
297 | return simpleEvents;
298 | }
299 |
300 | void mergeRedundantEvents(EventGraph eventGraph){
301 | System.out.println(Util.getDate()+" Merging duplicated events...");
302 | eventGraph.identifyConnectedComponents();
303 | }
304 | }
305 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/app/Configuration.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.app;
19 |
20 | import java.io.File;
21 | import java.io.FileInputStream;
22 | import java.io.IOException;
23 | import java.util.Properties;
24 |
25 | /**
26 | *
27 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
28 | * @email adrien.guille@univ-lyon2.fr
29 | */
/**
 * Run-time parameters of MABED, read from the {@code parameters.txt} file
 * located in the working directory.
 */
public class Configuration {

    // Parallel
    /** Number of threads requested by the user. */
    public int numberOfThreads;

    // Corpus
    /** Whether the corpus has to be (re)prepared; {@code false} when the key is absent. */
    public boolean prepareCorpus;
    /** Length of each time-slice. */
    public int timeSliceLength;
    /** Name of the stopwords file; {@code null} when the key is absent. */
    public String stopwords;

    // MABED
    public int k;
    public int p;
    public double theta;
    public double sigma;
    public double minSupport;
    public double maxSupport;

    /**
     * Loads every parameter from {@code parameters.txt}.
     *
     * If the file does not exist, a message is printed and the JVM exits
     * with status -1. Values are trimmed before parsing because
     * {@link Properties#load(java.io.InputStream)} keeps trailing whitespace,
     * which would otherwise break numeric parsing and silently turn
     * {@code "true "} into {@code false}.
     *
     * @throws IOException              if the file cannot be read
     * @throws IllegalArgumentException if a numeric parameter is missing, empty or malformed
     */
    public Configuration() throws IOException{
        File inputFile = new File("parameters.txt");
        if(!inputFile.exists()){
            System.out.println("Configuration file not found! See README.txt");
            System.exit(-1);
        }
        Properties prop = new Properties();
        try (FileInputStream inputStream = new FileInputStream(inputFile)) {
            prop.load(inputStream);
        }
        prepareCorpus = Boolean.parseBoolean(trimmedValue(prop, "prepareCorpus"));
        timeSliceLength = intValue(prop, "timeSliceLength");
        k = intValue(prop, "k");
        p = intValue(prop, "p");
        theta = doubleValue(prop, "theta");
        sigma = doubleValue(prop, "sigma");
        minSupport = doubleValue(prop, "minSupport");
        maxSupport = doubleValue(prop, "maxSupport");
        stopwords = trimmedValue(prop, "stopwords");
        numberOfThreads = intValue(prop, "numberOfThreads");
    }

    /** Returns the value of {@code key} without surrounding whitespace, or {@code null} if absent. */
    private static String trimmedValue(Properties prop, String key){
        String value = prop.getProperty(key);
        return value == null ? null : value.trim();
    }

    /** Returns the trimmed value of {@code key}, rejecting a missing or empty entry. */
    private static String requiredValue(Properties prop, String key){
        String value = trimmedValue(prop, key);
        if(value == null || value.isEmpty()){
            throw new IllegalArgumentException("Missing parameter '"+key+"' in parameters.txt");
        }
        return value;
    }

    /** Parses the mandatory integer parameter {@code key}, naming the key on failure. */
    private static int intValue(Properties prop, String key){
        String value = requiredValue(prop, key);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Invalid integer for parameter '"+key+"': '"+value+"'", e);
        }
    }

    /** Parses the mandatory floating-point parameter {@code key}, naming the key on failure. */
    private static double doubleValue(Properties prop, String key){
        String value = requiredValue(prop, key);
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Invalid number for parameter '"+key+"': '"+value+"'", e);
        }
    }
}
71 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/app/Main.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.app;
19 |
20 | import fr.ericlab.mabed.structure.Corpus;
21 | import fr.ericlab.util.Util;
22 | import fr.ericlab.mabed.algo.MABED;
23 | import java.io.File;
24 | import java.io.IOException;
25 | import java.util.Locale;
26 | import java.util.logging.Level;
27 | import java.util.logging.Logger;
28 | import org.apache.commons.io.FileUtils;
29 |
30 | /**
31 | *
32 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
33 | * @email adrien.guille@univ-lyon2.fr
34 | */
35 | public class Main {
36 |
37 |
38 | public static void main(String[] args) throws IOException {
39 | Locale.setDefault(Locale.US);
40 | Configuration configuration = new Configuration();
41 | Corpus corpus = new Corpus(configuration);
42 | System.out.println("MABED: Mention-Anomaly-Based Event Detection");
43 | if(args.length == 0 || args[0].equals("-help")){
44 | System.out.println("For more information on how to run MABED, see the README.txt file");
45 | }else{
46 | if(args[0].equals("-run") ){
47 | try {
48 | if(configuration.numberOfThreads>1){
49 | System.out.println("Running the parallelized implementation with "+configuration.numberOfThreads+" threads (this computer has "+ Runtime.getRuntime().availableProcessors()+" available threads)");
50 | }else{
51 | System.out.println("Running the centralized implementation");
52 | }
53 | corpus.loadCorpus(configuration.numberOfThreads>1);
54 | String output = "MABED: Mention-Anomaly-Based Event Detection\n"+corpus.output+"\n";
55 | System.out.println("-------------------------\n"+Util.getDate()+" MABED is running\n-------------------------");
56 | output += "-------------------------\n"+Util.getDate()+" MABED is running\n-------------------------\n";
57 | System.out.println(Util.getDate()+" Reading parameters:\n - k = "+configuration.k+", p = "+configuration.p+", theta = "+configuration.theta+", sigma = "+configuration.sigma);
58 | MABED mabed = new MABED();
59 | if(configuration.numberOfThreads>1){
60 | output += mabed.applyParallelized(corpus,configuration);
61 | }else{
62 | output += mabed.applyCentralized(corpus,configuration);
63 | }
64 | System.out.println("--------------------\n"+Util.getDate()+" MABED ended\n--------------------");
65 | output += "--------------------\n"+Util.getDate()+" MABED ended\n--------------------\n";
66 | File outputDir = new File("output");
67 | if(!outputDir.isDirectory()){
68 | outputDir.mkdir();
69 | }
70 | File textFile = new File("output/MABED.tex");
71 | FileUtils.writeStringToFile(textFile,mabed.events.toLatex(corpus),false);
72 | textFile = new File("output/MABED.log");
73 | FileUtils.writeStringToFile(textFile,output,false);
74 | mabed.events.printLatex(corpus);
75 | } catch (InterruptedException ex) {
76 | Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
77 | }
78 | }else{
79 | System.out.println("Unknown option '"+args[0]+"'\nType 'java -jar MABED.jar -help' for more information on how to run MABED");
80 | }
81 | }
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/Corpus.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
20 | import fr.ericlab.mabed.app.Configuration;
21 | import fr.ericlab.util.Util;
22 | import indexer.GlobalIndexer;
23 | import java.io.File;
24 | import java.io.FileInputStream;
25 | import java.io.FileNotFoundException;
26 | import java.io.IOException;
27 | import java.io.ObjectInputStream;
28 | import java.sql.Timestamp;
29 | import java.text.DecimalFormat;
30 | import java.text.NumberFormat;
31 | import java.text.ParseException;
32 | import java.text.SimpleDateFormat;
33 | import java.util.ArrayList;
34 | import java.util.Collections;
35 | import java.util.Date;
36 | import java.util.List;
37 | import java.util.logging.Level;
38 | import java.util.logging.Logger;
39 | import org.apache.commons.io.FileUtils;
40 | import org.apache.commons.io.LineIterator;
41 |
42 | /**
43 | *
44 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
45 | * @email adrien.guille@univ-lyon2.fr
46 | */
47 | public class Corpus {
48 | public Configuration configuration;
49 |
50 | public String info;
51 | public int messageCount;
52 | public int nbTimeSlices;
53 | public boolean loaded = false;
54 | public Timestamp startTimestamp;
55 | public Timestamp endTimestamp;
56 | public int[] distribution;
57 | public String output;
58 |
59 | // Indexes
60 | short[][] frequencyMatrix;
61 | public ArrayList vocabulary;
62 | short[][] mentionFrequencyMatrix;
63 | public ArrayList mentionVocabulary;
64 |
65 | public Corpus(Configuration conf){
66 | configuration = conf;
67 | }
68 |
69 | public void prepareCorpus(){
70 | System.out.println(Util.getDate()+" Preparing corpus...");
71 | String[] fileArray = new File("input/").list();
72 | nbTimeSlices = 0;
73 | NumberFormat formatter = new DecimalFormat("00000000");
74 | ArrayList list = new ArrayList<>();
75 | for(String filename : fileArray){
76 | if(filename.endsWith(".text")){
77 | try {
78 | list.add(formatter.parse(filename.substring(0, 8)).intValue());
79 | } catch (ParseException ex) {
80 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
81 | }
82 | nbTimeSlices++;
83 | }
84 | }
85 | int a = Collections.min(list), b = Collections.max(list);
86 | LineIterator it = null;
87 | try {
88 | SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
89 | it = FileUtils.lineIterator(new File("input/"+formatter.format(a)+".time"), "UTF-8");
90 | if(it.hasNext()) {
91 | Date parsedDate = dateFormat.parse(it.nextLine());
92 | startTimestamp = new java.sql.Timestamp(parsedDate.getTime());
93 | }
94 | it = FileUtils.lineIterator(new File("input/"+formatter.format(b)+".time"), "UTF-8");
95 | String lastLine = "";
96 | while(it.hasNext()) {
97 | lastLine = it.nextLine();
98 | }
99 | Date parsedDate = dateFormat.parse(lastLine);
100 | endTimestamp = new java.sql.Timestamp(parsedDate.getTime());
101 | } catch (IOException | ParseException ex) {
102 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
103 | } finally {
104 | LineIterator.closeQuietly(it);
105 | }
106 | System.out.print(" - Computing word frequencies");
107 | GlobalIndexer indexer = new GlobalIndexer(configuration.numberOfThreads,false);
108 | try {
109 | indexer.index("input/", configuration.stopwords);
110 | } catch ( InterruptedException | IOException ex) {
111 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
112 | }
113 | indexer = new GlobalIndexer(configuration.numberOfThreads,true);
114 | try {
115 | indexer.index("input/", configuration.stopwords);
116 | } catch ( InterruptedException | IOException ex) {
117 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
118 | }
119 | System.out.println(", 100% done.");
120 | }
121 |
122 | public void loadCorpus(boolean parallelized){
123 | output = "";
124 | if(configuration.prepareCorpus){
125 | prepareCorpus();
126 | }
127 | String[] fileArray = new File("input/").list();
128 | nbTimeSlices = 0;
129 | NumberFormat formatter = new DecimalFormat("00000000");
130 | ArrayList list = new ArrayList<>();
131 | for(String filename : fileArray){
132 | if(filename.endsWith(".text")){
133 | try {
134 | list.add(formatter.parse(filename.substring(0, 8)).intValue());
135 | } catch (ParseException ex) {
136 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
137 | }
138 | nbTimeSlices++;
139 | }
140 | }
141 | int a = Collections.min(list), b = Collections.max(list);
142 | distribution = new int[nbTimeSlices];
143 | messageCount = 0;
144 | LineIterator it = null;
145 | try {
146 | it = FileUtils.lineIterator(new File("input/"+formatter.format(a)+".time"), "UTF-8");
147 | if(it.hasNext()) {
148 | SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
149 | Date parsedDate = dateFormat.parse(it.nextLine());
150 | startTimestamp = new java.sql.Timestamp(parsedDate.getTime());
151 | }
152 | it = FileUtils.lineIterator(new File("input/"+formatter.format(b)+".time"), "UTF-8");
153 | String timestamp = "";
154 | while(it.hasNext()) {
155 | timestamp = it.nextLine();
156 | }
157 | SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
158 | Date parsedDate = dateFormat.parse(timestamp);
159 | endTimestamp = new java.sql.Timestamp(parsedDate.getTime());
160 | } catch (IOException | ParseException ex) {
161 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
162 | } finally {
163 | LineIterator.closeQuietly(it);
164 | }
165 | try {
166 | // Global index
167 | FileInputStream fisMatrix = new FileInputStream("input/indexes/frequencyMatrix.dat");
168 | ObjectInputStream oisMatrix = new ObjectInputStream(fisMatrix);
169 | frequencyMatrix = (short[][]) oisMatrix.readObject();
170 | FileInputStream fisVocabulary = new FileInputStream("input/indexes/vocabulary.dat");
171 | ObjectInputStream oisVocabulary = new ObjectInputStream(fisVocabulary);
172 | vocabulary = (ArrayList) oisVocabulary.readObject();
173 | // Mention index
174 | FileInputStream fisMentionMatrix = new FileInputStream("input/indexes/mentionFrequencyMatrix.dat");
175 | ObjectInputStream oisMentionMatrix = new ObjectInputStream(fisMentionMatrix);
176 | mentionFrequencyMatrix = (short[][]) oisMentionMatrix.readObject();
177 | FileInputStream fisMentionVocabulary = new FileInputStream("input/indexes/mentionVocabulary.dat");
178 | ObjectInputStream oisMentionVocabulary = new ObjectInputStream(fisMentionVocabulary);
179 | mentionVocabulary = (ArrayList) oisMentionVocabulary.readObject();
180 | // Message count
181 | String messageCountStr = FileUtils.readFileToString(new File("input/indexes/messageCount.txt"));
182 | messageCount = Integer.parseInt(messageCountStr);
183 | // Message count distribution
184 | FileInputStream fisDistribution = new FileInputStream("input/indexes/messageCountDistribution.dat");
185 | ObjectInputStream oisDistribution = new ObjectInputStream(fisDistribution);
186 | distribution = (int[]) oisDistribution.readObject();
187 | } catch (FileNotFoundException ex) {
188 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
189 | } catch (IOException | ClassNotFoundException ex) {
190 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
191 | }
192 | DecimalFormat df = new DecimalFormat("#,###");
193 | System.out.println(Util.getDate()+" Loaded corpus:");
194 | output += Util.getDate()+" Loaded corpus:\n";
195 | info =" - time-slices: "+df.format(nbTimeSlices)+" time-slices of "+configuration.timeSliceLength+" minutes each\n";
196 | info +=" - first message: "+startTimestamp+"\n";
197 | double datasetLength = (nbTimeSlices*configuration.timeSliceLength)/60/24;
198 | info +=" - last message: "+endTimestamp+" ("+datasetLength+" days)\n";
199 | info +=" - number of messages: "+df.format(messageCount);
200 | output += info;
201 | System.out.println(info);
202 | }
203 |
204 | public short[] getMentionFrequency(int i){
205 | return mentionFrequencyMatrix[i];
206 | }
207 |
208 | public short[] getGlobalFrequency(String term){
209 | int i = vocabulary.indexOf(term);
210 | if(i == -1){
211 | return new short[nbTimeSlices];
212 | }else{
213 | return frequencyMatrix[i];
214 | }
215 | }
216 |
217 | public String getMessages(Event event){
218 | String messages = "";
219 | NumberFormat formatter = new DecimalFormat("00000000");
220 | String mainTerm = event.mainTerm;
221 | int count = 0;
222 | for(int i = event.I.timeSliceA; i <= event.I.timeSliceB; i++){
223 | try {
224 | String filename = "input/"+formatter.format(i)+".text";
225 | List lines = FileUtils.readLines(new File(filename));
226 | for(String line : lines){
227 | if(line.contains(" "+mainTerm+" ")){
228 | messages += line+"\n";
229 | count++;
230 | }
231 | }
232 | } catch (IOException ex) {
233 | Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex);
234 | }
235 | }
236 | return messages;
237 | }
238 |
239 | public Timestamp toDate(int timeSlice){
240 | Timestamp date = startTimestamp;
241 | long dateLong = date.getTime() + timeSlice*configuration.timeSliceLength*60*1000L;
242 | return new Timestamp(dateLong);
243 | }
244 | }
245 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/Event.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
20 | import java.util.ArrayList;
21 | import java.util.HashMap;
22 | import java.util.Map;
23 |
24 | /**
25 | *
26 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
27 | * @email adrien.guille@univ-lyon2.fr
28 | */
29 | public class Event implements Comparable {
30 | public String mainTerm;
31 | public WeightedTermList relatedTerms;
32 | public TimeInterval I;
33 | public double score;
34 | public ArrayList anomaly;
35 |
36 | public Event(){
37 | mainTerm = "noMainTerm";
38 | score = 0;
39 | relatedTerms = new WeightedTermList();
40 | anomaly = new ArrayList<>();
41 | }
42 |
43 | public Event(String t, TimeInterval tI, double s){
44 | mainTerm = t;
45 | I = tI;
46 | score = s;
47 | relatedTerms = new WeightedTermList();
48 | anomaly = new ArrayList<>();
49 | }
50 |
51 | public Event(String t, TimeInterval tI, double s, ArrayList a){
52 | mainTerm = t;
53 | I = tI;
54 | score = s;
55 | relatedTerms = new WeightedTermList();
56 | anomaly = a;
57 | }
58 |
59 | public void setMainTerm(String t){
60 | mainTerm = t;
61 | }
62 |
63 | public Map getMainTermAttributes(){
64 | HashMap map = new HashMap<>();
65 | map.put("ui.class","mainTerm");
66 | map.put("ui.color",1);
67 | map.put("ui.color",1);
68 | map.put("I", I.timeSliceA+":"+I.timeSliceB);
69 | map.put("score",score);
70 | return map;
71 | }
72 |
73 | public Event merge(Event t){
74 | Event t1 = new Event(this.mainTerm+", "+t.mainTerm, this.I, this.score);
75 | for(WeightedTerm wt : this.relatedTerms.list){
76 | if(!t1.contains(wt.term)){
77 | t1.relatedTerms.add(wt);
78 | }
79 | }
80 | for(WeightedTerm wt : t.relatedTerms.list){
81 | if(!t1.contains(wt.term)){
82 | t1.relatedTerms.add(wt);
83 | }
84 | }
85 | return t1;
86 | }
87 |
88 | public Event merge(EventList tl){
89 | Event t1 = new Event("",this.I,this.score,this.anomaly);
90 | String mT = this.mainTerm;
91 | for(Event t : tl.list){
92 | mT += ", "+t.mainTerm;
93 | }
94 | t1.setMainTerm(mT);
95 | for(Event t : tl.list){
96 | for(WeightedTerm wt : t.relatedTerms.list){
97 | if(!t1.contains(wt.term)){
98 | t1.relatedTerms.add(wt);
99 | }
100 | }
101 | }
102 | for(WeightedTerm wt : this.relatedTerms.list){
103 | if(!t1.contains(wt.term)){
104 | t1.relatedTerms.add(wt);
105 | }
106 | }
107 | return t1;
108 | }
109 |
110 | public String toString(boolean printI){
111 | String str = "";
112 | if(printI){
113 | str = "["+I.toString()+"] ";
114 | }
115 | str += mainTerm+"("+score+"): ";
116 | for(WeightedTerm wt : relatedTerms.list){
117 | str += wt.term+"("+wt.weight+") ";
118 | }
119 | return str;
120 | }
121 |
122 | public String intervalAsString(String lang){
123 |
124 | return "";
125 | }
126 |
127 | public boolean contains(String term){
128 | return this.mainTerm.contains(term) || containsrelatedTerm(term);
129 | }
130 |
131 | public boolean containsrelatedTerm(String term){
132 | for(WeightedTerm wt : relatedTerms.list){
133 | if(wt.term.equals(term)){
134 | return true;
135 | }
136 | }
137 | return false;
138 | }
139 |
140 | public String relatedTermAsList(){
141 | String st = "";
142 | for(WeightedTerm wt : relatedTerms.list){
143 | st += wt.term+" ";
144 | }
145 | return st;
146 | }
147 |
148 | public String anomalyToString(){
149 | String string = "[";
150 | for(double d : anomaly){
151 | string += d+",";
152 | }
153 | string = string.substring(0,string.length());
154 | return string+"]";
155 | }
156 |
157 | @Override
158 | public int compareTo(Event o) {
159 | if((o.score - this.score) == 0){
160 | return 0;
161 | }else{
162 | if(this.score > o.score){
163 | return -1;
164 | }else{
165 | return 1;
166 | }
167 | }
168 | }
169 | }
170 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/EventGraph.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
20 | import java.util.ArrayList;
21 | import java.util.Collection;
22 | import java.util.HashMap;
23 | import org.graphstream.algorithm.ConnectedComponents;
24 | import org.graphstream.algorithm.ConnectedComponents.ConnectedComponent;
25 | import org.graphstream.graph.Edge;
26 | import org.graphstream.graph.Graph;
27 | import org.graphstream.graph.Node;
28 | import org.graphstream.graph.implementations.SingleGraph;
29 |
30 | /**
31 | *
32 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
33 | * @email adrien.guille@univ-lyon2.fr
34 | */
35 | public class EventGraph {
36 | public Graph graph;
37 | public Graph redundancyGraph;
38 | public HashMap duplicatedEvents;
39 | double maximumScore;
40 | double sigma;
41 | Corpus corpus;
42 |
43 | public EventGraph(Corpus d, double ms, Double sig){
44 | sigma = sig;
45 | graph = new SingleGraph("");
46 | redundancyGraph = new SingleGraph("");
47 | maximumScore = ms;
48 | corpus = d;
49 | duplicatedEvents = new HashMap<>();
50 | }
51 |
52 | public int addEvent(Event event){
53 | int added = 0;
54 | boolean redundant = false;
55 | if(graph.getNode(event.mainTerm) != null){
56 | for(WeightedTerm wt : event.relatedTerms.list){
57 | Node wtNode = graph.getNode(wt.term);
58 | if(wtNode != null){
59 | if(wtNode.getAttribute("ui.class").equals("mainTerm") && wtNode.hasEdgeFrom((Node)graph.getNode(event.mainTerm))){
60 | Event event1 = getEvent(wtNode);
61 | double intersection = Math.max(event.I.intersectionProportion(event1.I), event1.I.intersectionProportion(event.I));
62 | if(intersection > sigma){
63 | redundant = true;
64 | duplicatedEvents.put(event.mainTerm,event);
65 | duplicatedEvents.put(event1.mainTerm,event1);
66 | // new way of managing redundancy
67 | if(redundancyGraph.getNode(event1.mainTerm) == null){
68 | redundancyGraph.addNode(event1.mainTerm);
69 | }
70 | if(redundancyGraph.getNode(event.mainTerm) == null){
71 | redundancyGraph.addNode(event.mainTerm);
72 | }
73 | if(redundancyGraph.getEdge(event.mainTerm+"-"+event1.mainTerm) == null){
74 | redundancyGraph.addEdge(event.mainTerm+"-"+event1.mainTerm, event.mainTerm, event1.mainTerm, false);
75 | }
76 | }
77 | }
78 | }
79 | }
80 | }
81 | if(!redundant){
82 | if(event.mainTerm != null){
83 | if(graph.getNode(event.mainTerm) == null){
84 | graph.addNode(event.mainTerm);
85 | }
86 | graph.getNode(event.mainTerm).addAttributes(event.getMainTermAttributes());
87 | graph.getNode(event.mainTerm).setAttribute("ui.size",20+(event.score/maximumScore)*10);
88 | graph.getNode(event.mainTerm).addAttribute("ui.label", "["+corpus.toDate(event.I.timeSliceA)+"::"+corpus.toDate(event.I.timeSliceB)+"]:"+event.mainTerm);
89 | graph.getNode(event.mainTerm).addAttribute("anomaly",event.anomaly);
90 | for(WeightedTerm wt : event.relatedTerms.list){
91 | if(wt.term != null){
92 | if(graph.getNode(wt.term)==null){
93 | graph.addNode(wt.term);
94 | graph.getNode(wt.term).addAttribute("ui.label", wt.term);
95 | graph.getNode(wt.term).setAttribute("ui.class","relatedTerm");
96 | }
97 | graph.addEdge("["+event.I.timeSliceA+":"+event.I.timeSliceB+"]"+event.mainTerm+"-"+wt.term,wt.term,event.mainTerm,true);
98 | graph.getEdge("["+event.I.timeSliceA+":"+event.I.timeSliceB+"]"+event.mainTerm+"-"+wt.term).addAttribute("weight", wt.weight);
99 | }
100 | }
101 | }
102 | added = 1;
103 | }
104 | return added;
105 | }
106 |
107 | public EventList identifyConnectedComponents(){
108 | ConnectedComponents ccs = new ConnectedComponents(redundancyGraph);
109 | ccs.setCountAttribute("component");
110 | System.out.println(" - number of connected component(s): "+ccs.getConnectedComponentsCount());
111 | int i = 0;
112 | EventList globalEvents = new EventList();
113 | for (ConnectedComponent cc : ccs) {
114 | EventList ccEvents = new EventList();
115 | for(Node node : cc.getEachNode()){
116 | Event event = duplicatedEvents.get(node.getId());
117 | ccEvents.add(event);
118 | removeEvent(event);
119 | }
120 | ccEvents.sort();
121 | Event mainEvent = ccEvents.list.pop();
122 | Event globalEvent = mainEvent.merge(ccEvents);
123 | globalEvents.add(globalEvent);
124 | addEvent(globalEvent);
125 | i++;
126 | }
127 | return globalEvents;
128 | }
129 |
130 | public void removeEvent(Event event){
131 | Node mainNode = graph.getNode(event.mainTerm);
132 | if(mainNode != null){
133 | if(mainNode.getAttribute("ui.class").equals("mainTerm")){
134 | Collection edges = mainNode.getEnteringEdgeSet();
135 | if(edges != null){
136 | for(Edge edge : edges){
137 | if(edge != null){
138 | // remove obsolete edge
139 | graph.removeEdge(edge);
140 | if(edge.getSourceNode().getDegree() == 0){
141 | graph.removeNode((Node)edge.getSourceNode());
142 | }
143 | }
144 | }
145 | }
146 | if(mainNode.getOutDegree() == 0){
147 | graph.removeNode(mainNode);
148 | }else{
149 | mainNode.setAttribute("ui.class", "relatedTerm");
150 | }
151 | }
152 | }
153 | }
154 |
155 | public void replaceEvent(Node nodeT0, Event t1){
156 | nodeT0.setAttribute("ui.class","relatedTerm");
157 | Collection edges = nodeT0.getEnteringEdgeSet();
158 | graph.addNode(t1.mainTerm);
159 | graph.getNode(t1.mainTerm).addAttributes(t1.getMainTermAttributes());
160 | graph.getNode(t1.mainTerm).setAttribute("ui.size",20+(t1.score/maximumScore)*10);
161 | graph.getNode(t1.mainTerm).addAttribute("ui.label", "["+corpus.toDate(t1.I.timeSliceA)+"::"+corpus.toDate(t1.I.timeSliceB)+"]:"+t1.mainTerm);
162 | graph.getNode(t1.mainTerm).setAttribute("anomaly",t1.anomaly);
163 | if(edges != null){
164 | for(Edge edge : edges){
165 | if(edge != null){
166 | // remove obsolete edge
167 | graph.removeEdge(edge);
168 | // add edge toward the new main term
169 | if(!t1.mainTerm.contains(edge.getSourceNode().getId())){
170 | String edgeId = "["+t1.I.timeSliceA+":"+t1.I.timeSliceB+"]"+t1.mainTerm+"-"+edge.getSourceNode().getId();
171 | graph.addEdge(edgeId,edge.getSourceNode().getId(),t1.mainTerm,true);
172 | graph.getEdge(edgeId).addAttribute("weight", edge.getAttribute("weight"));
173 | }
174 | if(edge.getSourceNode().getDegree() == 0){
175 | graph.removeNode((Node)edge.getSourceNode());
176 | }
177 | }
178 | }
179 | }
180 | graph.removeNode(nodeT0);
181 | for(WeightedTerm wt : t1.relatedTerms.list){
182 | if(wt.term != null){
183 | if(graph.getNode(wt.term)==null){
184 | graph.addNode(wt.term);
185 | graph.getNode(wt.term).addAttribute("ui.label", wt.term);
186 | graph.getNode(wt.term).setAttribute("ui.class","relatedTerm");
187 | }
188 | if(graph.getEdge("["+t1.I.timeSliceA+":"+t1.I.timeSliceB+"]"+t1.mainTerm+"-"+wt.term) == null){
189 | graph.addEdge("["+t1.I.timeSliceA+":"+t1.I.timeSliceB+"]"+t1.mainTerm+"-"+wt.term,wt.term,t1.mainTerm,true);
190 | }
191 | graph.getEdge("["+t1.I.timeSliceA+":"+t1.I.timeSliceB+"]"+t1.mainTerm+"-"+wt.term).addAttribute("weight", wt.weight);
192 | }
193 | }
194 | // if(nodeT0.getDegree() == 0){
195 | // System.out.println(" - "+nodeT0.getId()+" is a leaf node.");
196 | // graph.removeNode(nodeT0);
197 | // }
198 | }
199 |
200 | public Event getEvent(Node mainNode){
201 | Event event = new Event();
202 | if(mainNode.getAttribute("ui.class")!=null && mainNode.getAttribute("ui.class").equals("mainTerm")){
203 | event = new Event(mainNode.getId(), new TimeInterval((String) mainNode.getAttribute("I")), (double) mainNode.getAttribute("score"), (ArrayList) mainNode.getAttribute("anomaly"));
204 | for(Edge edge : mainNode.getEnteringEdgeSet()){
205 | event.relatedTerms.add(new WeightedTerm(edge.getSourceNode().getId(), (double) edge.getAttribute("weight")));
206 | }
207 | }
208 | return event;
209 | }
210 |
211 | public EventList toEventList(){
212 | EventList events = new EventList();
213 | for(Node node : graph){
214 | if(node.getAttribute("ui.class")!=null && node.getAttribute("ui.class").equals("mainTerm")){
215 | Event event = new Event(node.getId(), new TimeInterval((String) node.getAttribute("I")), (double) node.getAttribute("score"), (ArrayList) node.getAttribute("anomaly"));
216 | for(Edge edge : node.getEnteringEdgeSet()){
217 | event.relatedTerms.add(new WeightedTerm(edge.getSourceNode().getId(), (double) edge.getAttribute("weight")));
218 | }
219 | events.add(event);
220 | }
221 | }
222 | events.sort();
223 | return events;
224 | }
225 |
226 | }
227 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/EventList.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
20 | import fr.ericlab.mabed.algo.MABED;
21 | import java.io.File;
22 | import java.io.IOException;
23 | import java.text.DecimalFormat;
24 | import java.text.NumberFormat;
25 | import java.text.SimpleDateFormat;
26 | import java.util.Collections;
27 | import java.util.LinkedList;
28 | import java.util.logging.Level;
29 | import java.util.logging.Logger;
30 | import org.apache.commons.io.FileUtils;
31 |
32 | /**
33 | *
34 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
35 | * @email adrien.guille@univ-lyon2.fr
36 | */
37 | public class EventList {
38 | public LinkedList list;
39 |
40 | public EventList(){
41 | list = new LinkedList<>();
42 | }
43 |
44 | public void writeEventsToFile(Corpus dataset, String filename){
45 | try {
46 | File textFile = new File("output/"+filename+".txt");
47 | FileUtils.writeStringToFile(textFile,"",false);
48 | for(Event event : list){
49 | FileUtils.writeStringToFile(textFile," - ["+new SimpleDateFormat("yyyy-MM-dd hh:mm").format(dataset.toDate(event.I.timeSliceA))+"//"+new SimpleDateFormat("yyyy-MM-dd hh:mm").format(dataset.toDate(event.I.timeSliceB))+
50 | "] "+event.toString(false)+"\n---------------------------------\n",true);
51 | }
52 | } catch (IOException ex) {
53 | Logger.getLogger(MABED.class.getName()).log(Level.SEVERE, null, ex);
54 | }
55 | }
56 |
57 | public void scoreEvolution(){
58 | for(int i = 1; i <= list.size(); i++){
59 | System.out.print(i+",");
60 | }
61 | for(Event event : list){
62 | System.out.print(event.score+",");
63 | }
64 | }
65 |
66 | public void printLatex(Corpus corpus){
67 | System.out.println(toLatex(corpus));
68 | }
69 |
70 | public String toLatex(Corpus corpus){
71 | int rank = 1;
72 | String string = "";
73 | for(Event topic : list){
74 | string += rank+" & "+new SimpleDateFormat("dd/MM HH:mm").format(corpus.toDate(topic.I.timeSliceA))+" -- "+new SimpleDateFormat("dd/MM HH:mm").format(corpus.toDate(topic.I.timeSliceB))+" & "+topic.mainTerm+": "+topic.relatedTerms.toString().replace("related terms:","")+"\\\\ \\hline\n";
75 | rank++;
76 | }
77 | return string;
78 | }
79 |
80 | public void add(Event t){
81 | list.add(t);
82 | }
83 |
84 | public void sort(){
85 | Collections.sort(list);
86 | }
87 |
88 | public int size(){
89 | return list.size();
90 | }
91 |
92 | public Event get(int i){
93 | return list.get(i);
94 | }
95 |
96 | public void addAll(EventList tl){
97 | list.addAll(tl.list);
98 | }
99 |
100 | public void exportDetailledResults(Corpus corpus){
101 | File outputDir = new File("output/csv/");
102 | if(!outputDir.isDirectory()){
103 | outputDir.mkdir();
104 | }else{
105 | for(String filename : outputDir.list()){
106 | FileUtils.deleteQuietly(new File("output/csv/"+filename));
107 | }
108 | }
109 | NumberFormat formatter = new DecimalFormat("000");
110 | for(int i = 0; i < list.size(); i++){
111 | Event event = list.get(i);
112 | String mainTerm = event.mainTerm.replace(", ", "_");
113 | File descFile = new File("output/csv/"+formatter.format(i)+"-"+mainTerm+".desc");
114 | File wordsFile = new File("output/csv/"+formatter.format(i)+"-"+mainTerm+".words");
115 | File seriesFile = new File("output/csv/"+formatter.format(i)+"-"+mainTerm+".anomaly");
116 | try {
117 | FileUtils.writeStringToFile(descFile,event.score+"\t"+event.I.timeSliceA+"\t"+event.I.timeSliceB+"\t"+new SimpleDateFormat("YYYY-MM-dd HH:mm").format(corpus.toDate(event.I.timeSliceA))+"\t"+new SimpleDateFormat("YYYY-MM-dd HH:mm").format(corpus.toDate(event.I.timeSliceB))+"\n",true);
118 | } catch (IOException ex) {
119 | Logger.getLogger(EventList.class.getName()).log(Level.SEVERE, null, ex);
120 | }
121 | for(WeightedTerm wt : event.relatedTerms.list){
122 | try {
123 | FileUtils.writeStringToFile(wordsFile,wt.term+"\t"+wt.weight+"\n",true);
124 | } catch (IOException ex) {
125 | Logger.getLogger(EventList.class.getName()).log(Level.SEVERE, null, ex);
126 | }
127 | }
128 | for(int j = 0; j < event.anomaly.size(); j++){
129 | try {
130 | FileUtils.writeStringToFile(seriesFile,j+"\t"+event.anomaly.get(j)+"\n",true);
131 | } catch (IOException ex) {
132 | Logger.getLogger(EventList.class.getName()).log(Level.SEVERE, null, ex);
133 | }
134 | }
135 | }
136 | }
137 | }
138 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/StopWords.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
20 | /**
21 | *
22 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
23 | * @email adrien.guille@univ-lyon2.fr
24 | */
public class StopWords {
    // Intentionally empty: this class currently declares no fields or methods.
}
28 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/TimeInterval.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
20 | import java.util.HashSet;
21 |
22 | /**
23 | *
24 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
25 | * @email adrien.guille@univ-lyon2.fr
26 | */
27 | public class TimeInterval implements Comparable {
28 | public int timeSliceA;
29 | public int timeSliceB;
30 |
31 | public TimeInterval(int a, int b){
32 | timeSliceA = a;
33 | timeSliceB = b;
34 | }
35 |
36 | public TimeInterval(String str){
37 | String[] split = str.split(":");
38 | timeSliceA = Integer.parseInt(split[0]);
39 | timeSliceB = Integer.parseInt(split[1]);
40 | }
41 |
42 | public String toString(){
43 | return timeSliceA+":"+timeSliceB;
44 | }
45 |
46 | public double intersection(TimeInterval ti){
47 | HashSet set1 = new HashSet<>();
48 | HashSet set2 = new HashSet<>();
49 | int intersectionSize = 0;
50 | for(int i = this.timeSliceA; i <= this.timeSliceB; i++){
51 | set1.add(i);
52 | }
53 | for(int j = ti.timeSliceA; j <= ti.timeSliceB; j++){
54 | set2.add(j);
55 | }
56 | for(int k : set1){
57 | if(set2.contains(k)){
58 | intersectionSize++;
59 | }
60 | }
61 | return intersectionSize;
62 | }
63 |
64 | public double intersectionProportion(TimeInterval ti){
65 | return intersection(ti)/(this.timeSliceB-this.timeSliceA+1);
66 | }
67 |
68 | @Override
69 | public int compareTo(Object o) {
70 | TimeInterval point = (TimeInterval)o;
71 | if(this.timeSliceA > point.timeSliceA){
72 | return 1;
73 | }else{
74 | if(this.timeSliceA == point.timeSliceA){
75 | return 0;
76 | }else{
77 | return -1;
78 | }
79 | }
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/WeightedTerm.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
/**
 * A term paired with a numeric weight.
 *
 * <p>The natural ordering is by <em>descending</em> weight, so sorting a
 * collection of terms puts the heaviest one first. The term text plays no
 * part in the ordering.
 *
 * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
 * @email adrien.guille@univ-lyon2.fr
 */
public class WeightedTerm implements Comparable<WeightedTerm> {
    /** The term text. */
    public String term;
    /** The weight attached to {@link #term}. */
    public double weight;

    /**
     * Creates a term with weight {@code 0}.
     *
     * @param t the term text
     */
    public WeightedTerm(String t) {
        this(t, 0);
    }

    /**
     * Creates a term with the given weight.
     *
     * @param t the term text
     * @param w the weight
     */
    public WeightedTerm(String t, double w) {
        term = t;
        weight = w;
    }

    /**
     * Compares by descending weight.
     *
     * <p>{@link Double#compare(double, double)} is used rather than the sign
     * of a subtraction so that the ordering stays total when a weight is
     * {@code NaN}: a {@code NaN} weight sorts before every other weight,
     * instead of comparing equal to everything and violating the
     * {@code compareTo} contract. It also orders {@code 0.0} before
     * {@code -0.0}.
     *
     * @param o the term to compare with
     * @return a negative value if this term is heavier than {@code o}, a
     *         positive value if it is lighter, {@code 0} if the weights are
     *         equal
     */
    @Override
    public int compareTo(WeightedTerm o) {
        // Arguments are swapped on purpose: larger weights come first.
        return Double.compare(o.weight, this.weight);
    }
}
52 |
--------------------------------------------------------------------------------
/src/fr/ericlab/mabed/structure/WeightedTermList.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.mabed.structure;
19 |
20 | import java.text.DecimalFormat;
21 | import java.util.Collections;
22 | import java.util.LinkedList;
23 |
24 | /**
25 | *
26 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
27 | * @email adrien.guille@univ-lyon2.fr
28 | */
29 | public class WeightedTermList {
30 | public LinkedList list;
31 |
32 | public WeightedTermList(){
33 | list = new LinkedList<>();
34 | }
35 |
36 | public void add(WeightedTerm t){
37 | list.add(t);
38 | }
39 |
40 | public void sort(){
41 | Collections.sort(list);
42 | }
43 |
44 | public int size(){
45 | return list.size();
46 | }
47 |
48 | public WeightedTerm get(int i){
49 | return list.get(i);
50 | }
51 |
52 | public void addAll(WeightedTermList tl){
53 | list.addAll(tl.list);
54 | }
55 |
56 | public String toString(){
57 | sort();
58 | String str = "related terms: ";
59 | DecimalFormat df = new DecimalFormat("0.00");
60 | for(WeightedTerm t: list){
61 | str += t.term+" ("+df.format(t.weight)+"), ";
62 | }
63 | return str.substring(0,str.length()-2);
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/fr/ericlab/util/Util.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | // This file is part of MABED. //
3 | // //
4 | // MABED is free software: you can redistribute it and/or modify //
5 | // it under the terms of the GNU General Public License as published by //
6 | // the Free Software Foundation, either version 3 of the License, or //
7 | // (at your option) any later version. //
8 | // //
9 | // MABED is distributed in the hope that it will be useful, //
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of //
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
12 | // GNU General Public License for more details. //
13 | // //
14 | // You should have received a copy of the GNU General Public License //
15 | // along with MABED. If not, see <http://www.gnu.org/licenses/>. //
16 | ////////////////////////////////////////////////////////////////////////////////
17 |
18 | package fr.ericlab.util;
19 |
20 | import fr.ericlab.mabed.algo.MABED;
21 | import java.io.File;
22 | import java.io.IOException;
23 | import java.text.DateFormat;
24 | import java.text.SimpleDateFormat;
25 | import java.util.Arrays;
26 | import java.util.Date;
27 | import java.util.LinkedList;
28 | import java.util.logging.Level;
29 | import java.util.logging.Logger;
30 | import org.apache.commons.io.FileUtils;
31 | import org.apache.commons.io.LineIterator;
32 |
33 | /**
34 | *
35 | * @author Adrien GUILLE, ERIC Lab, University of Lyon 2
36 | * @email adrien.guille@univ-lyon2.fr
37 | */
38 | public class Util {
39 |
40 | static public LinkedList readStopWords(String pathToStopwordsFile){
41 | LinkedList stopWords = new LinkedList<>();
42 | if(pathToStopwordsFile != null){
43 | LineIterator it = null;
44 | try {
45 | it = FileUtils.lineIterator(new File(pathToStopwordsFile), "UTF-8");
46 | while (it.hasNext()) {
47 | stopWords.add(it.nextLine());
48 | }
49 | } catch (IOException ex) {
50 | Logger.getLogger(MABED.class.getName()).log(Level.SEVERE, null, ex);
51 | } finally {
52 | LineIterator.closeQuietly(it);
53 | }
54 | }
55 | return stopWords;
56 | }
57 |
58 | static public String getDate(){
59 | DateFormat dateFormat = new SimpleDateFormat("HH:mm:ss");
60 | Date date = new Date();
61 | String dateString = dateFormat.format(date);
62 | return dateString;
63 | }
64 |
65 | static public long getTime(){
66 | Date date = new Date();
67 | return date.getTime();
68 | }
69 |
70 | static public int sum(short tab[], int a, int b){
71 | int sum = 0;
72 | for(int i = a; i <= b; i++){
73 | sum += tab[i];
74 | }
75 | return sum;
76 | }
77 |
78 | static public float sum(float tab[], int a, int b){
79 | float sum = 0;
80 | for(int i = a; i <= b; i++){
81 | sum += tab[i];
82 | }
83 | return sum;
84 | }
85 |
86 | static public float[] toFloatArray(short[] array){
87 | float[] newArray = new float[array.length];
88 | for(int i = 0; i < array.length; i++){
89 | newArray[i] = array[i];
90 | }
91 | return newArray;
92 | }
93 |
94 | static public float[] smoothArray(float array[], int windowSize){
95 | float[] smoothedArray = new float[array.length];
96 | for(int i = 0; i < array.length-1; i++){
97 | smoothedArray[i] = centeredMovingAverage(array, i, windowSize);
98 | }
99 | return smoothedArray;
100 | }
101 |
102 | static public float[] smoothArray(short array[], int windowSize){
103 | float[] smoothedArray = new float[array.length];
104 | for(int i = 0; i < array.length-1; i++){
105 | smoothedArray[i] = centeredMovingAverage(array, i, windowSize);
106 | }
107 | return smoothedArray;
108 | }
109 |
110 | static public double getMedian(double array[]){
111 | Arrays.sort(array);
112 | return array[array.length/2];
113 | }
114 |
115 | static public float centeredMovingAverage(float[] array, int index, int windowSize){
116 | int halfWindowSize = windowSize/2;
117 | int possibleLeftWindow = (index >= halfWindowSize)?halfWindowSize:index;
118 | int possibleRightWindow = (index+halfWindowSize < array.length-1)? halfWindowSize:array.length-2-index;
119 | int i1 = index - possibleLeftWindow, i2 = index + possibleRightWindow;
120 | float total = 0;
121 | for(int i = i1; i <= i2; i++){
122 | total += array[i];
123 | }
124 | return total/(float)(possibleLeftWindow+possibleRightWindow);
125 | }
126 |
127 | static public float centeredMovingAverage(short[] array, int index, int windowSize){
128 | int halfWindowSize = windowSize/2;
129 | int possibleLeftWindow = (index >= halfWindowSize)?halfWindowSize:index;
130 | int possibleRightWindow = (index+halfWindowSize < array.length-1)? halfWindowSize:array.length-2-index;
131 | int i1 = index - possibleLeftWindow, i2 = index + possibleRightWindow;
132 | float total = 0;
133 | for(int i = i1; i <= i2; i++){
134 | total += array[i];
135 | }
136 | return total/(float)(possibleLeftWindow+possibleRightWindow);
137 | }
138 | }
139 |
--------------------------------------------------------------------------------
/stopwords.txt:
--------------------------------------------------------------------------------
1 | ...
2 | ....
3 | 0
4 | 1
5 | 2
6 | 3
7 | 4
8 | 5
9 | 6
10 | 7
11 | 8
12 | 9
13 | a
14 | about
15 | above
16 | accordingly
17 | across
18 | after
19 | afterwards
20 | again
21 | against
22 | al
23 | all
24 | allows
25 | almost
26 | alone
27 | along
28 | already
29 | also
30 | although
31 | always
32 | am
33 | among
34 | amongst
35 | an
36 | and
37 | another
38 | any
39 | anybody
40 | anyhow
41 | anyone
42 | anything
43 | anywhere
44 | apart
45 | appear
46 | appropriate
47 | are
48 | around
49 | as
50 | aside
51 | associated
52 | at
53 | available
54 | away
55 | awfully
56 | b
57 | back
58 | be
59 | became
60 | because
61 | become
62 | becomes
63 | becoming
64 | been
65 | before
66 | beforehand
67 | behind
68 | being
69 | below
70 | beside
71 | besides
72 | best
73 | better
74 | between
75 | beyond
76 | both
77 | brief
78 | but
79 | by
80 | c
81 | came
82 | can
83 | cannot
84 | cant
85 | care
86 | cause
87 | causes
88 | certain
89 | changes
90 | co
91 | come
92 | consequently
93 | contain
94 | containing
95 | contains
96 | corresponding
97 | could
98 | currently
99 | d
100 | day
101 | described
102 | did
103 | different
104 | do
105 | does
106 | doesnt
107 | doing
108 | don
109 | done
110 | dont
111 | down
112 | downwards
113 | did
114 | didnt
115 | during
116 | e
117 | each
118 | eg
119 | eight
120 | either
121 | else
122 | elsewhere
123 | enough
124 | eq
125 | et
126 | etc
127 | even
128 | ever
129 | every
130 | everybody
131 | everyone
132 | everything
133 | everywhere
134 | ex
135 | example
136 | except
137 | f
138 | far
139 | few
140 | fifth
141 | first
142 | five
143 | followed
144 | following
145 | for
146 | former
147 | formerly
148 | forth
149 | four
150 | friend
151 | friends
152 | from
153 | further
154 | furthermore
155 | g
156 | get
157 | gets
158 | given
159 | gives
160 | go
161 | goes
162 | going
163 | gonna
164 | gone
165 | good
166 | got
167 | great
168 | h
169 | had
170 | hardly
171 | has
172 | have
173 | having
174 | he
175 | hence
176 | her
177 | hes
178 | here
179 | hereafter
180 | hereby
181 | herein
182 | heres
183 | hereupon
184 | hers
185 | herself
186 | him
187 | himself
188 | his
189 | hither
190 | how
191 | howbeit
192 | however
193 | http
194 | i
195 | ie
196 | if
197 | ignored
198 | ill
199 | im
200 | immediate
201 | in
202 | inasmuch
203 | inc
204 | indeed
205 | indicate
206 | indicated
207 | indicates
208 | inner
209 | insofar
210 | instead
211 | into
212 | inward
213 | is
214 | it
215 | its
216 | itself
217 | ive
218 | j
219 | just
220 | k
221 | keep
222 | kept
223 | know
224 | knows
225 | l
226 | last
227 | latter
228 | latterly
229 | least
230 | less
231 | lest
232 | let
233 | life
234 | like
235 | little
236 | long
237 | love
238 | ltd
239 | m
240 | made
241 | make
242 | man
243 | many
244 | may
245 | me
246 | meanwhile
247 | men
248 | might
249 | more
250 | moreover
251 | most
252 | mostly
253 | mr
254 | much
255 | must
256 | my
257 | myself
258 | n
259 | name
260 | namely
261 | near
262 | necessary
263 | need
264 | needed
265 | neither
266 | never
267 | nevertheless
268 | new
269 | next
270 | nine
271 | no
272 | nobody
273 | none
274 | noone
275 | nor
276 | normally
277 | not
278 | nothing
279 | novel
280 | now
281 | nowhere
282 | o
283 | of
284 | off
285 | often
286 | oh
287 | old
288 | on
289 | once
290 | one
291 | ones
292 | only
293 | onto
294 | or
295 | other
296 | others
297 | otherwise
298 | ought
299 | our
300 | ours
301 | ourselves
302 | out
303 | outside
304 | over
305 | overall
306 | own
307 | p
308 | particular
309 | particularly
310 | people
311 | per
312 | perhaps
313 | placed
314 | please
315 | plus
316 | possible
317 | probably
318 | provides
319 | q
320 | que
321 | quite
322 | r
323 | rather
324 | really
325 | relatively
326 | respectively
327 | right
328 | s
329 | said
330 | same
331 | say
332 | says
333 | second
334 | secondly
335 | see
336 | seem
337 | seemed
338 | seeming
339 | seems
340 | self
341 | selves
342 | sensible
343 | sent
344 | serious
345 | seven
346 | several
347 | shall
348 | she
349 | should
350 | since
351 | six
352 | so
353 | some
354 | somebody
355 | somehow
356 | someone
357 | something
358 | sometime
359 | sometimes
360 | somewhat
361 | somewhere
362 | specified
363 | specify
364 | specifying
365 | state
366 | still
367 | sub
368 | such
369 | sup
370 | sure
371 | t
372 | take
373 | taken
374 | than
375 | thank
376 | thanks
377 | that
378 | thats
379 | the
380 | their
381 | theirs
382 | them
383 | themselves
384 | then
385 | thence
386 | there
387 | thereafter
388 | thereby
389 | therefore
390 | therein
391 | thereupon
392 | these
393 | they
394 | think
395 | thinks
396 | third
397 | this
398 | thorough
399 | thoroughly
400 | those
401 | though
402 | three
403 | through
404 | throughout
405 | thru
406 | thus
407 | time
408 | to
409 | today
410 | together
411 | too
412 | toward
413 | towards
414 | twice
415 | two
416 | u
417 | under
418 | unless
419 | until
420 | unto
421 | up
422 | upon
423 | us
424 | use
425 | used
426 | useful
427 | uses
428 | using
429 | usually
430 | v
431 | value
432 | various
433 | very
434 | via
435 | viz
436 | vs
437 | w
438 | want
439 | was
440 | way
441 | we
442 | well
443 | went
444 | were
445 | what
446 | whatever
447 | when
448 | whence
449 | whenever
450 | where
451 | whereafter
452 | whereas
453 | whereby
454 | wherein
455 | whereupon
456 | wherever
457 | whether
458 | which
459 | while
460 | whither
461 | who
462 | whoever
463 | whole
464 | whom
465 | whose
466 | why
467 | will
468 | with
469 | within
470 | without
471 | work
472 | world
473 | would
474 | x
475 | y
476 | year
477 | years
478 | yes
479 | yet
480 | you
481 | youd
482 | your
483 | youre
484 | yours
485 | yourself
486 | yourselves
487 | z
488 | zero
489 | via
490 | rt
491 | http
492 | html
493 | php
494 | tinyurl.com
495 | tweet
496 | tweets
497 | bit.ly
498 | twitpic.com
499 | ow.ly
500 | im
501 | ive
502 | lol
503 | com
504 | bit
505 | ff
506 | followfriday
507 | follow
508 | retweet
509 | post
510 | list
511 | lists
512 | tcot
--------------------------------------------------------------------------------