├── .gitignore ├── lib └── json-simple-1.1.1.jar ├── src └── com │ └── dataclox │ └── tweetie │ ├── config │ └── IndexConfig.java │ ├── main │ ├── TweetTimeStamp.java │ ├── Tweet.java │ ├── TweeStruct.java │ ├── Tweetie.java │ ├── TweeChat.java │ └── StatGenerator.java │ └── parser │ ├── ConfigParser.java │ ├── DumpParser.java │ ├── TweetProcessor.java │ └── IntermediateDumpParser.java └── config.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | *.iml 3 | out/* 4 | -------------------------------------------------------------------------------- /lib/json-simple-1.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arpitbbhayani/Tweetie/HEAD/lib/json-simple-1.1.1.jar -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/config/IndexConfig.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.config; 2 | 3 | /** 4 | * Created by devilo on 16/8/14. 
/**
 * Process-wide configuration shared by the Tweetie components.
 *
 * <p>The class is a bag of mutable static fields: {@code ConfigParser} fills the
 * first group from {@code config.txt}, and {@code StatGenerator} derives the
 * statistics file paths from {@link #statLoc}. It is never meant to be
 * instantiated, hence the private constructor.
 */
public final class IndexConfig {

    /** Path of the raw twitter dump (config key {@code TWITTER_DUMP}). */
    public static String twitterDumpPath = null;

    /** Whether the intermediate index must be built from the raw dump (config key {@code CREATE_INDEX}). */
    public static boolean createIndex = false;

    /** Directory in which the index is created (config key {@code INDEX_LOC}). */
    public static String indexLoc = null;

    /** Whether an existing intermediate dump is loaded (config key {@code USE_INTERMEDIATE_INDEX}). */
    public static boolean useIntermediateIndex = false;

    /** Path of the intermediate dump file (config key {@code INTER_DUMP}). */
    public static String interDump = null;

    /** Path of the file receiving tweets without text (config key {@code NULL_DUMP}). */
    public static String nullDump = null;

    /** Directory in which statistics files are written (config key {@code STAT_LOC}). */
    public static String statLoc = null;

    /** Path of the conversation file; derived from {@link #statLoc} at runtime. */
    public static String convFilePath = null;

    /** Path of the "distinct users vs frequency" file; derived from {@link #statLoc} at runtime. */
    public static String numUsersVsFreqFilePath = null;

    /** Path of the "conversation length vs frequency" file; derived from {@link #statLoc} at runtime. */
    public static String convLengthVsFreqFilePath = null;

    /** Path of the "minutes vs frequency" file; derived from {@link #statLoc} at runtime. */
    public static String minutesVsFreqFilePath = null;

    /** Static holder only; not instantiable. */
    private IndexConfig() {
    }
}
7 | # 8 | 9 | # 10 | # Required Variables: 11 | # TWITTER_DUMP : Path of the twitter dump 12 | # 13 | # CREATE_INDEX : Boolean value (written without quotes) that specifies whether the index needs to be created 14 | # Possible values: true, false 15 | # 16 | # 17 | 18 | TWITTER_DUMP=/media/devilo/TeChIe/Web Mining/Monsoon 2014/Assignments/tweets_2014_07_31_00.txt 19 | #TWITTER_DUMP=/media/devilo/TeChIe/Web Mining/Monsoon 2014/Assignments/tweets10.txt 20 | 21 | CREATE_INDEX=false 22 | USE_INTERMEDIATE_INDEX=true 23 | 24 | INTER_DUMP=/media/devilo/TeChIe/Web Mining/Monsoon 2014/Assignments/index/INTER_tweets_2014_07_31_00.txt 25 | NULL_DUMP=/media/devilo/TeChIe/Web Mining/Monsoon 2014/Assignments/index/NULL_tweets_2014_07_31_00.txt 26 | 27 | INDEX_LOC=/media/devilo/TeChIe/Web Mining/Monsoon 2014/Assignments/index 28 | 29 | #Statistics 30 | STAT_LOC=/media/devilo/TeChIe/Web Mining/Monsoon 2014/Assignments/stat 31 | -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/main/TweetTimeStamp.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.main; 2 | 3 | /** 4 | * Created by devilo on 19/8/14. 
/**
 * Parsed form of a tweet timestamp string made of six space-separated tokens:
 * day name, month name, day of month, {@code hh:mm:ss}, a fifth token that is
 * ignored, and the year.
 *
 * <p>Any input that cannot be parsed (including {@code null}) yields an instance
 * for which {@link #isNull()} returns {@code true}; the constructor never throws.
 */
public class TweetTimeStamp {

    /** Number of space-separated tokens a well-formed timestamp has. */
    private static final int TOKEN_COUNT = 6;

    private String day = null;
    private String month = null;
    private int date = 0;
    private int hh = 0 , mm = 0 , ss = 0;
    private int year = 0;

    private String timestampStr = null;

    private boolean isNull = false;

    /**
     * Parses the given timestamp.
     *
     * @param t the raw timestamp text; may be {@code null} or malformed, in which
     *          case the instance is flagged as null
     */
    public TweetTimeStamp( String t ) {

        timestampStr = t;

        if( t == null || !parse(t) ) {
            isNull = true;
        }
    }

    /**
     * Splits and converts the timestamp, assigning the fields only when every
     * component is valid so that a failed parse leaves no half-filled state.
     *
     * @return {@code true} when the whole timestamp was understood
     */
    private boolean parse( String t ) {

        String[] tokens = t.split(" ");
        if( tokens.length != TOKEN_COUNT ) {
            return false;
        }

        String[] time = tokens[3].split(":");
        if( time.length < 3 ) {
            return false;
        }

        int parsedDate, parsedHh, parsedMm, parsedSs, parsedYear;
        try {
            parsedDate = Integer.parseInt(tokens[2]);
            parsedHh = Integer.parseInt(time[0]);
            parsedMm = Integer.parseInt(time[1]);
            parsedSs = Integer.parseInt(time[2]);
            parsedYear = Integer.parseInt(tokens[5]);
        } catch (NumberFormatException ignored) {
            // A non-numeric component means the text is not a timestamp; the
            // caller observes this through isNull().
            return false;
        }

        day = tokens[0];
        month = tokens[1];
        date = parsedDate;
        hh = parsedHh;
        mm = parsedMm;
        ss = parsedSs;
        year = parsedYear;
        return true;
    }

    /**
     * @return {@code true} when the text given to the constructor could not be
     *         parsed as a timestamp
     */
    public boolean isNull() {
        return isNull;
    }

}
/**
 * Mutable value holder for one tweet: its text, creation time, its own id, the
 * author's id and the id of the status it replies to.
 */
public class Tweet {

    String tweetText;
    Date tweetTimestamp;
    Long tweetId;
    Long tweetUserId;
    Long tweetInReplyToStatusId;

    boolean isEnglish;

    /** Creates a tweet with every property unset ({@code null}). */
    public Tweet() {
        this(null, null, null, null, null);
    }

    /**
     * Creates a fully populated tweet.
     *
     * @param tweetText              the tweet body
     * @param tweetTimestamp         the creation time
     * @param tweetId                the tweet's id
     * @param tweetUserId            the author's id
     * @param tweetInReplyToStatusId the id of the status being replied to
     */
    public Tweet(String tweetText, Date tweetTimestamp, Long tweetId, Long tweetUserId, Long tweetInReplyToStatusId) {
        this.tweetId = tweetId;
        this.tweetUserId = tweetUserId;
        this.tweetInReplyToStatusId = tweetInReplyToStatusId;
        this.tweetTimestamp = tweetTimestamp;
        this.tweetText = tweetText;
    }

    /** @return the tweet body */
    public String getTweetText() {
        return this.tweetText;
    }

    /** @param tweetText the tweet body */
    public void setTweetText(String tweetText) {
        this.tweetText = tweetText;
    }

    /** @return the creation time */
    public Date getTweetTimestamp() {
        return this.tweetTimestamp;
    }

    /** @param tweetTimestamp the creation time */
    public void setTweetTimestamp(Date tweetTimestamp) {
        this.tweetTimestamp = tweetTimestamp;
    }

    /** @return the tweet's id */
    public Long getTweetId() {
        return this.tweetId;
    }

    /** @param tweetId the tweet's id */
    public void setTweetId(Long tweetId) {
        this.tweetId = tweetId;
    }

    /** @return the author's id */
    public Long getTweetUserId() {
        return this.tweetUserId;
    }

    /** @param tweetUserId the author's id */
    public void setTweetUserId(Long tweetUserId) {
        this.tweetUserId = tweetUserId;
    }

    /** @return the id of the status this tweet replies to */
    public Long getTweetInReplyToStatusId() {
        return this.tweetInReplyToStatusId;
    }

    /** @param tweetInReplyToStatusId the id of the status this tweet replies to */
    public void setTweetInReplyToStatusId(Long tweetInReplyToStatusId) {
        this.tweetInReplyToStatusId = tweetInReplyToStatusId;
    }
}
10 | */ 11 | public class TweeStruct { 12 | 13 | /* This is a Singleton class */ 14 | 15 | private static TweeStruct instance = null; 16 | 17 | private HashMap tweetMap = null; 18 | private TreeMap parentChildMap = null; 19 | private HashSet roots = null; 20 | 21 | TreeMap> adjacencyList = null; 22 | TreeMap tweetIdVsUserId = null; 23 | 24 | 25 | protected TweeStruct() { 26 | 27 | roots = new HashSet(); 28 | tweetMap = new HashMap(); 29 | parentChildMap = new TreeMap(); 30 | adjacencyList = new TreeMap>(); 31 | 32 | tweetIdVsUserId = new TreeMap(); 33 | } 34 | 35 | public static TweeStruct getInstance() { 36 | 37 | if( instance == null ) 38 | instance = new TweeStruct(); 39 | 40 | return instance; 41 | } 42 | 43 | public void insert( Tweet tweet ) { 44 | 45 | tweetIdVsUserId.put(tweet.getTweetId(), tweet.getTweetUserId()); 46 | tweetMap.put(tweet.getTweetId(), tweet); 47 | parentChildMap.put(tweet.getTweetId(), tweet.getTweetInReplyToStatusId()); 48 | 49 | if( tweet.getTweetInReplyToStatusId() == 0 ) { 50 | roots.add(tweet.getTweetId()); 51 | } 52 | 53 | } 54 | 55 | public int getSize() { 56 | return tweetMap.size(); 57 | } 58 | 59 | public HashMap getTweetMap() { 60 | return tweetMap; 61 | } 62 | 63 | public TreeMap getParentChildMap() { 64 | return parentChildMap; 65 | } 66 | 67 | public HashSet getRoots() { 68 | return roots; 69 | } 70 | 71 | public TreeMap> getAdjacencyList() { 72 | return adjacencyList; 73 | } 74 | 75 | public TreeMap getTweetIdVsUserId() { 76 | return tweetIdVsUserId; 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/parser/ConfigParser.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.parser; 2 | 3 | import com.dataclox.tweetie.config.IndexConfig; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.FileNotFoundException; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | 
10 | /** 11 | * Created by devilo on 16/8/14. 12 | */ 13 | public class ConfigParser { 14 | 15 | private String configFilePath = "config.txt"; 16 | 17 | BufferedReader configReader; 18 | 19 | public void parse() throws IOException { 20 | 21 | String configLine = null; 22 | 23 | configReader = new BufferedReader(new FileReader(configFilePath)); 24 | 25 | while( (configLine = configReader.readLine()) != null ) { 26 | 27 | configLine = configLine.trim(); 28 | 29 | if( configLine.startsWith("#") || configLine.length() == 0 ) { 30 | continue; 31 | } 32 | 33 | String[] keyValue = configLine.split("="); 34 | 35 | if( keyValue[0].equalsIgnoreCase("TWITTER_DUMP") ) { 36 | IndexConfig.twitterDumpPath = keyValue[1]; 37 | } 38 | else if( keyValue[0].equalsIgnoreCase("CREATE_INDEX") ) { 39 | IndexConfig.createIndex = Boolean.parseBoolean(keyValue[1]); 40 | } 41 | else if( keyValue[0].equalsIgnoreCase("INDEX_LOC") ) { 42 | IndexConfig.indexLoc = keyValue[1]; 43 | } 44 | else if( keyValue[0].equalsIgnoreCase("USE_INTERMEDIATE_INDEX") ) { 45 | IndexConfig.useIntermediateIndex = Boolean.parseBoolean(keyValue[1]); 46 | } 47 | else if( keyValue[0].equalsIgnoreCase("INTER_DUMP") ) { 48 | IndexConfig.interDump = keyValue[1]; 49 | } 50 | else if( keyValue[0].equalsIgnoreCase("NULL_DUMP") ) { 51 | IndexConfig.nullDump = keyValue[1]; 52 | } 53 | else if( keyValue[0].equalsIgnoreCase("STAT_LOC") ) { 54 | IndexConfig.statLoc = keyValue[1]; 55 | } 56 | else { 57 | System.err.println("Invalid Key found : " + keyValue[0]); 58 | throw new IllegalArgumentException(); 59 | } 60 | 61 | } 62 | 63 | 64 | } 65 | 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/parser/DumpParser.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.parser; 2 | 3 | import com.dataclox.tweetie.config.IndexConfig; 4 | 5 | import java.io.*; 6 | 7 | /** 8 | * Created by devilo on 
16/8/14. 9 | */ 10 | public class DumpParser { 11 | 12 | private File indexFolder = null; 13 | private File dumpFile = null; 14 | 15 | public void createIndex() throws IOException { 16 | 17 | validate(); 18 | 19 | String tweet = null; 20 | 21 | TweetProcessor tweetProcessor = new TweetProcessor(indexFolder, dumpFile); 22 | 23 | BufferedReader dumpReader = new BufferedReader(new FileReader(dumpFile)); 24 | 25 | while((tweet = dumpReader.readLine()) != null) { 26 | 27 | String processedTweet = tweet.trim(); 28 | if( processedTweet.length() > 0 ) 29 | tweetProcessor.process(processedTweet); 30 | 31 | } 32 | 33 | dumpReader.close(); 34 | tweetProcessor.finish(); 35 | 36 | } 37 | 38 | private void validate() throws IOException { 39 | 40 | if(IndexConfig.indexLoc == null ) 41 | throw new NullPointerException(); 42 | 43 | indexFolder = new File(IndexConfig.indexLoc); 44 | 45 | if( indexFolder.isDirectory() ) { 46 | System.err.println("Directory at INDEX_LOC already exists, please delete it and run again."); 47 | throw new IOException(); 48 | } 49 | 50 | if( indexFolder.isFile() ) { 51 | System.err.println("File found at INDEX_LOC while expecting nothing, please delete it and run again."); 52 | throw new IOException(); 53 | } 54 | 55 | boolean isDirectoryCreated = indexFolder.mkdir(); 56 | 57 | if( isDirectoryCreated == false ) { 58 | System.err.println("Something went wrong with directory creation; please check INDEX_LOC once again."); 59 | throw new IOException(); 60 | } 61 | 62 | dumpFile = new File(IndexConfig.twitterDumpPath); 63 | 64 | if( dumpFile.isDirectory() ) { 65 | System.err.println("Expecting file found directory, please check the TWITTER_DUMP and run again."); 66 | throw new IOException(); 67 | } 68 | 69 | if( dumpFile.exists() == false ) { 70 | System.err.println("File at TWITTER_DUMP does not exists, check TWITTER_DUMP and run again."); 71 | throw new IOException(); 72 | } 73 | 74 | } 75 | 76 | } 77 | 
-------------------------------------------------------------------------------- /src/com/dataclox/tweetie/main/Tweetie.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.main; 2 | 3 | import com.dataclox.tweetie.config.IndexConfig; 4 | import com.dataclox.tweetie.parser.ConfigParser; 5 | import com.dataclox.tweetie.parser.DumpParser; 6 | import com.dataclox.tweetie.parser.IntermediateDumpParser; 7 | 8 | import java.io.IOException; 9 | import java.text.ParseException; 10 | 11 | /** 12 | * Created by devilo on 16/8/14. 13 | */ 14 | public class Tweetie { 15 | 16 | DumpParser dumpParser = null; 17 | ConfigParser configParser = null; 18 | StatGenerator statGenerator = null; 19 | IntermediateDumpParser intermediateDumpParser = null; 20 | 21 | public Tweetie() { 22 | 23 | dumpParser = new DumpParser(); 24 | configParser = new ConfigParser(); 25 | statGenerator = new StatGenerator(); 26 | } 27 | 28 | public void start() { 29 | 30 | try { 31 | configParser.parse(); 32 | } catch (IOException e) { 33 | e.printStackTrace(); 34 | } 35 | 36 | System.out.println("-------------------- Variables Set --------------------"); 37 | System.out.println("CREATE_INDEX = " + IndexConfig.createIndex); 38 | System.out.println("USE_INTERMEDIATE_INDEX = " + IndexConfig.useIntermediateIndex); 39 | System.out.println("TWITTER_DUMP = " + IndexConfig.twitterDumpPath); 40 | System.out.println("INDEX_LOC = " + IndexConfig.indexLoc); 41 | System.out.println("INTER_DUMP = " + IndexConfig.interDump); 42 | System.out.println("NULL_DUMP = " + IndexConfig.nullDump); 43 | 44 | 45 | if( IndexConfig.createIndex == true ) { 46 | 47 | try { 48 | dumpParser.createIndex(); 49 | 50 | System.out.println("Index at INDEX_LOC created ..."); 51 | 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | } 55 | 56 | } 57 | else if( IndexConfig.createIndex == true || IndexConfig.useIntermediateIndex == true ) { 58 | 59 | intermediateDumpParser = 
new IntermediateDumpParser(IndexConfig.interDump); 60 | System.out.println("Using intermediate file : " + IndexConfig.interDump); 61 | 62 | try { 63 | 64 | intermediateDumpParser.createTweetStruct(); 65 | 66 | System.out.println("Total roots : " + TweeStruct.getInstance().getAdjacencyList().keySet().size()); 67 | 68 | statGenerator.generateConversations(); 69 | statGenerator.generateNumOfDistinctUsersVsFreq(); 70 | statGenerator.generateConversationLengthVsFreq(); 71 | statGenerator.generateMinutesVsFreq(); 72 | 73 | //statGenerator.dumpTweets(); 74 | 75 | statGenerator.printConversationTree(); 76 | 77 | 78 | } 79 | catch (IOException e) { 80 | e.printStackTrace(); 81 | } 82 | catch (ParseException e) { 83 | e.printStackTrace(); 84 | } 85 | 86 | 87 | } 88 | else { 89 | System.out.println("Index at INDEX_LOC is used ..."); 90 | } 91 | 92 | 93 | TweeChat tweeChat = new TweeChat(); 94 | tweeChat.initialize(); 95 | tweeChat.start(); 96 | 97 | } 98 | 99 | 100 | public static void main(String[] args) { 101 | 102 | long s,e; 103 | 104 | s = System.currentTimeMillis(); 105 | 106 | Tweetie tweetie = new Tweetie(); 107 | tweetie.start(); 108 | 109 | e = System.currentTimeMillis(); 110 | 111 | System.out.println("Time elapsed : " + (e-s)/1000 + " sec."); 112 | 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/parser/TweetProcessor.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.parser; 2 | 3 | import com.dataclox.tweetie.config.IndexConfig; 4 | import org.json.simple.JSONObject; 5 | import org.json.simple.parser.JSONParser; 6 | import org.json.simple.parser.ParseException; 7 | 8 | import java.io.*; 9 | import java.nio.Buffer; 10 | 11 | /** 12 | * Created by devilo on 17/8/14. 
13 | */ 14 | public class TweetProcessor { 15 | 16 | private final String PREFIX = "INTER"; 17 | 18 | File dumpFile = null; 19 | File indexFolder = null; 20 | File nullFile = null; 21 | File intermediateFile = null; 22 | 23 | BufferedWriter fileWriter = null; 24 | BufferedWriter nullWriter = null; 25 | 26 | public TweetProcessor(File inFolder, File dFile) { 27 | 28 | this.dumpFile = dFile; 29 | this.indexFolder = inFolder; 30 | 31 | intermediateFile = new File(indexFolder.getAbsolutePath() + File.separator + PREFIX + "_" + dumpFile.getName()); 32 | nullFile = new File(IndexConfig.nullDump); 33 | 34 | System.out.println("Intermediate File : " + intermediateFile.getAbsolutePath()); 35 | 36 | try { 37 | 38 | intermediateFile.createNewFile(); 39 | 40 | //fileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(intermediateFile) , "UTF-8")); 41 | fileWriter = new BufferedWriter(new FileWriter(intermediateFile)); 42 | nullWriter = new BufferedWriter(new FileWriter(nullFile)); 43 | 44 | } catch (IOException e) { 45 | e.printStackTrace(); 46 | } 47 | 48 | } 49 | 50 | public void process(String tweet) throws IOException { 51 | 52 | JSONParser jsonParser = new JSONParser(); 53 | 54 | String tweetText = null; 55 | String tweetTimestamp = null; 56 | String tweetId = null; 57 | String tweetUserId = null; 58 | String tweetInReplyToStatusId = null; 59 | 60 | try { 61 | 62 | JSONObject tweetJO = (JSONObject) jsonParser.parse(tweet); 63 | JSONObject tweetDataJO = (JSONObject) jsonParser.parse((String) tweetJO.get("Data")); 64 | 65 | tweetId = (String) tweetDataJO.get("IdStr"); 66 | tweetTimestamp = (String) tweetDataJO.get("CreatedAt"); 67 | tweetInReplyToStatusId = (String) tweetDataJO.get("InReplyToStatusIdStr"); 68 | tweetText = ((String) tweetDataJO.get("Text")); 69 | tweetUserId = (String) ((JSONObject)tweetDataJO.get("User")).get("IdStr"); 70 | 71 | if(tweetText == null) { 72 | 73 | nullWriter.write("$" + tweetId + "\n"); 74 | nullWriter.write("$" + 
tweetTimestamp + "\n"); 75 | nullWriter.write("$" + tweetText + "\n"); 76 | nullWriter.write("$" + tweetUserId + "\n"); 77 | nullWriter.write("$" + tweetInReplyToStatusId + "\n"); 78 | nullWriter.write("$\n"); 79 | return; 80 | } 81 | 82 | tweetText = tweetText.replaceAll("[\\n\\r\\t]" , " "); 83 | 84 | fileWriter.write("$" + tweetId + "\n"); 85 | fileWriter.write("$" + tweetTimestamp + "\n"); 86 | fileWriter.write("$" + tweetText + "\n"); 87 | fileWriter.write("$" + tweetUserId + "\n"); 88 | fileWriter.write("$" + tweetInReplyToStatusId + "\n"); 89 | fileWriter.write("$\n"); 90 | 91 | } 92 | catch (ParseException e) { 93 | e.printStackTrace(); 94 | } 95 | 96 | } 97 | 98 | public void finish() throws IOException { 99 | 100 | if( fileWriter != null ) 101 | fileWriter.close(); 102 | 103 | if( nullWriter != null ) 104 | nullWriter.close(); 105 | } 106 | 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/parser/IntermediateDumpParser.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.parser; 2 | 3 | import com.dataclox.tweetie.main.TweeStruct; 4 | import com.dataclox.tweetie.main.Tweet; 5 | import com.dataclox.tweetie.main.TweetTimeStamp; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.File; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.text.ParseException; 12 | import java.text.SimpleDateFormat; 13 | import java.util.*; 14 | 15 | /** 16 | * Created by devilo on 19/8/14. 
17 | */ 18 | public class IntermediateDumpParser { 19 | 20 | private TweeStruct tweeStruct = null; 21 | private String intermediateDumpPath = null; 22 | private File intermediateFile = null; 23 | 24 | public IntermediateDumpParser(String interDump) { 25 | 26 | this.intermediateDumpPath = interDump; 27 | tweeStruct = TweeStruct.getInstance(); 28 | intermediateFile = new File(this.intermediateDumpPath); 29 | } 30 | 31 | 32 | public void createTweetStruct() throws ParseException,IOException { 33 | 34 | SimpleDateFormat simpleDateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy"); 35 | simpleDateFormat.setLenient(true); 36 | 37 | String line = null; 38 | 39 | BufferedReader intermediateDumpReader = new BufferedReader(new FileReader(intermediateFile)); 40 | 41 | int count = 0; 42 | 43 | while((line = intermediateDumpReader.readLine()) != null) { 44 | 45 | Tweet t = new Tweet(); 46 | 47 | t.setTweetId(new Long(line.substring(1))); 48 | 49 | line = intermediateDumpReader.readLine(); 50 | t.setTweetTimestamp(simpleDateFormat.parse(line.substring(1))); 51 | 52 | line = intermediateDumpReader.readLine(); 53 | t.setTweetText(line.trim().substring(1)); 54 | 55 | line = intermediateDumpReader.readLine(); 56 | t.setTweetUserId(new Long(line.substring(1))); 57 | 58 | line = intermediateDumpReader.readLine(); 59 | 60 | if( line.equals("$null")) 61 | line = "$0"; 62 | 63 | t.setTweetInReplyToStatusId(new Long(line.substring(1))); 64 | 65 | line = intermediateDumpReader.readLine(); 66 | tweeStruct.insert(t); 67 | count++; 68 | 69 | } 70 | 71 | System.out.println("insertion count : " + count); 72 | 73 | intermediateDumpReader.close(); 74 | 75 | postProcess(); 76 | 77 | } 78 | 79 | private void postProcess() { 80 | 81 | HashSet roots = tweeStruct.getRoots(); 82 | HashMap tweetMap = tweeStruct.getTweetMap(); 83 | TreeMap parentChildMap = tweeStruct.getParentChildMap(); //Child - Parent 84 | 85 | TreeMap> adjacencyList = tweeStruct.getAdjacencyList(); 86 | 87 | HashSet 
toBeRemoved = new HashSet(); 88 | 89 | 90 | /* Removing all children whose parents do not exist in the data-set */ 91 | for( Long childId : parentChildMap.keySet() ) { 92 | 93 | Long parentId = parentChildMap.get(childId); 94 | 95 | if(parentId != 0 && !tweetMap.containsKey(parentId)) { 96 | toBeRemoved.add(childId); 97 | } 98 | 99 | } 100 | 101 | System.out.println("Whose parent were not found in the dataset : " + toBeRemoved.size()); 102 | 103 | for( Long id: toBeRemoved ) { 104 | parentChildMap.remove(id); 105 | } 106 | toBeRemoved.clear(); 107 | 108 | 109 | TreeMap unsortedChildParent = new TreeMap(); 110 | 111 | for( Long childId : parentChildMap.keySet() ) { 112 | 113 | Long parentId = parentChildMap.get(childId); 114 | 115 | if( parentId == 0 ) { 116 | adjacencyList.put(childId, new LinkedHashSet()); 117 | } 118 | else { 119 | if( adjacencyList.containsKey(parentId) ) 120 | adjacencyList.get(parentId).add(childId); 121 | else 122 | unsortedChildParent.put(childId,parentId); 123 | } 124 | 125 | } 126 | 127 | //System.out.println("Unsorted count : " + unsortedChildParent.size()); 128 | //System.out.println("Total roots : " + adjacencyList.keySet().size()); 129 | 130 | int count = 0; 131 | int max = 0; 132 | 133 | for( Long rootId : adjacencyList.keySet()) { 134 | if( adjacencyList.get(rootId).size() > 0 ) 135 | count++; 136 | 137 | if( adjacencyList.get(rootId).size() > max ) 138 | max = adjacencyList.get(rootId).size(); 139 | } 140 | System.out.println("Total conversations = " + count); 141 | System.out.println("Max length conversation = " + max); 142 | } 143 | 144 | } 145 | -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/main/TweeChat.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.main; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | * Created by devilo on 21/8/14. 
7 | */ 8 | public class TweeChat { 9 | 10 | private HashSet roots = null; 11 | private HashMap tweetMap = null; 12 | private TweeStruct tweeStruct = TweeStruct.getInstance(); 13 | private TreeMap> adjacencyList = null; 14 | 15 | 16 | HashMap tweetIdVsTweetText = null; 17 | HashSet conversationTweetIds = null; 18 | HashMap> tweetIdVsSet= null; 19 | 20 | 21 | public TweeChat() { 22 | roots = tweeStruct.getRoots(); 23 | tweetMap = tweeStruct.getTweetMap(); 24 | adjacencyList = tweeStruct.getAdjacencyList(); 25 | tweetIdVsTweetText = new HashMap(); 26 | 27 | tweetIdVsSet = new HashMap>(); 28 | conversationTweetIds = new HashSet(); 29 | 30 | } 31 | 32 | 33 | 34 | public void start() { 35 | 36 | Random random = new Random(); 37 | 38 | Scanner scanner = new Scanner(System.in); 39 | 40 | while ( true ) { 41 | 42 | Long maxId = null; 43 | float maxJC = Float.MIN_VALUE; 44 | float jc; 45 | 46 | System.out.print(">>> "); 47 | String humanSay = scanner.nextLine(); 48 | 49 | //System.out.println("For line : " + humanSay); 50 | //System.out.println("Set is : " + stringToSet(humanSay.toLowerCase())); 51 | HashSet humanSaySet = stringToSet(humanSay.toLowerCase()); 52 | 53 | HashSet union = new HashSet(); 54 | HashSet intersection = new HashSet(); 55 | 56 | for( Long id : conversationTweetIds ) { 57 | 58 | union.clear(); 59 | intersection.clear(); 60 | 61 | union.addAll(humanSaySet); 62 | intersection.addAll(humanSaySet); 63 | 64 | HashSet candidateSay = tweetIdVsSet.get(id); 65 | 66 | union.addAll(candidateSay); 67 | intersection.retainAll(candidateSay); 68 | 69 | jc = ((float)candidateSay.size()) * ((float) intersection.size() / (float)union.size()); 70 | 71 | //System.out.println("JC = " + jc); 72 | 73 | if( jc > maxJC ) { 74 | maxJC = jc; 75 | maxId = id; 76 | 77 | //System.out.println("maxJC = " + maxJC + " and maxId = " + maxId); 78 | //System.out.println("Inter : " + intersection + " and Union : " + union); 79 | 80 | } 81 | 82 | } 83 | 84 | 85 | if( maxId == null ) { 86 | 
System.out.println("Okay ... I c"); 87 | continue; 88 | } 89 | 90 | Long replyId = 0L; 91 | LinkedHashSet possibleReplies = adjacencyList.get(maxId); 92 | if( possibleReplies == null ) { 93 | System.out.println("Okay ... I c"); 94 | } 95 | else if(possibleReplies.size() > 0 ) { 96 | 97 | Long[] arr = ((Long[]) possibleReplies.toArray()); 98 | int size = arr.length; 99 | 100 | replyId = arr[((random.nextInt() % size) + (random.nextInt() % size)) % arr.length]; 101 | System.out.println(tweetIdVsTweetText.get(replyId)); 102 | } 103 | else { 104 | System.out.println(tweetIdVsTweetText.get(maxId)); 105 | } 106 | 107 | } 108 | 109 | } 110 | 111 | public void initialize() { 112 | 113 | for( Long rootId : roots ) { 114 | 115 | Queue q = new LinkedList(); 116 | q.add(rootId); 117 | 118 | while ( !q.isEmpty() ) { 119 | 120 | Long id = q.poll(); 121 | 122 | String text = getProcessedTweet(tweetMap.get(id).getTweetText()); 123 | 124 | if( text == null || text.length() == 0 ) 125 | continue; 126 | 127 | if( isEnglish(text) ) { 128 | conversationTweetIds.add(id); 129 | tweetIdVsTweetText.put(id, text); 130 | tweetIdVsSet.put(id, stringToSet(text)); 131 | } 132 | 133 | if( adjacencyList.containsKey(id)) { 134 | for (Long childId : adjacencyList.get(id)) { 135 | q.add(childId); 136 | } 137 | } 138 | } 139 | 140 | } 141 | 142 | //System.out.println("Total english tweets in conversation : " + tweetIdVsTweetText.size()); 143 | 144 | } 145 | 146 | private HashSet stringToSet(String text) { 147 | HashSet s = new HashSet(); 148 | 149 | String[] array = text.split(" "); 150 | 151 | for( String str : array ) 152 | s.add(str); 153 | 154 | return s; 155 | } 156 | 157 | private String getProcessedTweet(String tweetText) { 158 | 159 | StringBuilder stringBuilder = new StringBuilder(); 160 | int i = 0; 161 | 162 | if(tweetText.charAt(0) == 'R' && tweetText.charAt(1) == 'T' ) { 163 | i = 2; 164 | } 165 | 166 | for( ; i < tweetText.length() ; i++ ) { 167 | 168 | char ch = tweetText.charAt(i); 
169 | 170 | if( ch == '@' || ch == '#' ) { 171 | 172 | i++; 173 | 174 | if( i == tweetText.length() ) 175 | break; 176 | 177 | char c = tweetText.charAt(i); 178 | 179 | while( Character.isLetterOrDigit(c) || c == '_' ) { 180 | i++; 181 | 182 | if( i == tweetText.length() ) 183 | break; 184 | 185 | c = tweetText.charAt(i); 186 | 187 | } 188 | 189 | } 190 | else if( tweetText.substring(i).startsWith("http://") || tweetText.substring(i).startsWith("https://") ) { 191 | 192 | char c = tweetText.charAt(i); 193 | 194 | while( c != ' ' ) { 195 | i++; 196 | 197 | if( i == tweetText.length() ) 198 | break; 199 | 200 | c = tweetText.charAt(i); 201 | 202 | } 203 | 204 | } 205 | else if( !( ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || Character.isDigit(ch) ) || ch == ':' || ch == ';' || ch == '(' || ch == ')' || ch == ',' || ch == ' ') ) { 206 | 207 | } 208 | else { 209 | stringBuilder.append(ch); 210 | } 211 | 212 | } 213 | 214 | return new String(stringBuilder).replaceAll(" " , " ").trim().toLowerCase(); 215 | } 216 | 217 | private boolean isEnglish(String text) { 218 | 219 | if( text.length() > 0 ) 220 | return true; 221 | 222 | return false; 223 | } 224 | 225 | } -------------------------------------------------------------------------------- /src/com/dataclox/tweetie/main/StatGenerator.java: -------------------------------------------------------------------------------- 1 | package com.dataclox.tweetie.main; 2 | 3 | import com.dataclox.tweetie.config.IndexConfig; 4 | import javafx.util.Pair; 5 | 6 | import java.io.BufferedWriter; 7 | import java.io.File; 8 | import java.io.FileWriter; 9 | import java.io.IOException; 10 | import java.util.*; 11 | 12 | /** 13 | * Created by devilo on 20/8/14. 
14 | */ 15 | public class StatGenerator { 16 | 17 | private HashMap tweetMap = TweeStruct.getInstance().getTweetMap(); 18 | 19 | private HashSet roots = TweeStruct.getInstance().getRoots(); 20 | private TreeMap tweetIdVsUserId = TweeStruct.getInstance().getTweetIdVsUserId(); 21 | private TreeMap> adjacencyList = TweeStruct.getInstance().getAdjacencyList(); 22 | 23 | private File statFolder = null; 24 | private File convFile = null; 25 | private File numUsersVsFreqFile = null; 26 | private File convLengthVsFreqFile = null; 27 | private File minutesVsFreqFile = null; 28 | 29 | private BufferedWriter convFileWriter = null; 30 | private BufferedWriter numUsersVsFreqFileWriter = null; 31 | private BufferedWriter conversationLengthVsFreqFileWriter = null; 32 | private BufferedWriter minutesVsFreqFileWriter = null; 33 | 34 | 35 | public void generateConversations() throws IOException { 36 | validate(); 37 | } 38 | 39 | 40 | public void generateMinutesVsFreq() throws IOException { 41 | 42 | minutesVsFreqFileWriter = new BufferedWriter(new FileWriter(minutesVsFreqFile)); 43 | 44 | TreeMap minutesVsFrequency = new TreeMap(); 45 | 46 | for( Long rootId : roots ) { 47 | 48 | Long minId = Long.MAX_VALUE, maxId = Long.MIN_VALUE; 49 | 50 | Queue q = new LinkedList(); 51 | q.add(rootId); 52 | 53 | while ( !q.isEmpty() ) { 54 | 55 | Long id = q.poll(); 56 | 57 | if( id < minId ) 58 | minId = id; 59 | 60 | if( id > maxId ) 61 | maxId = id; 62 | 63 | if( adjacencyList.containsKey(id)) { 64 | for (Long childId : adjacencyList.get(id)) { 65 | q.add(childId); 66 | } 67 | } 68 | } 69 | 70 | long minutes = (tweetMap.get(maxId).getTweetTimestamp().getTime() - tweetMap.get(minId).getTweetTimestamp().getTime())/60000; 71 | 72 | if( !minutesVsFrequency.containsKey(minutes) ) 73 | minutesVsFrequency.put(minutes, 1L); 74 | else { 75 | long freq = minutesVsFrequency.get(minutes); 76 | minutesVsFrequency.put(minutes, freq + 1); 77 | } 78 | 79 | } 80 | 81 | for( Long numUsers: 
minutesVsFrequency.keySet() ) { 82 | minutesVsFreqFileWriter.write(numUsers + "-" + minutesVsFrequency.get(numUsers) + '\n'); 83 | } 84 | 85 | minutesVsFreqFileWriter.close(); 86 | 87 | 88 | } 89 | 90 | public void generateNumOfDistinctUsersVsFreq() throws IOException { 91 | 92 | numUsersVsFreqFileWriter = new BufferedWriter(new FileWriter(numUsersVsFreqFile)); 93 | 94 | HashSet users = new HashSet(); 95 | TreeMap numberOfDistinctUserVsFrequency = new TreeMap(); 96 | 97 | for( Long rootId : roots ) { 98 | 99 | users.clear(); 100 | 101 | /*for( Long childId : adjacencyList.get(rootId) ) { 102 | users.add(tweetIdVsUserId.get(childId)); 103 | }*/ 104 | 105 | Queue q = new LinkedList(); 106 | q.add(rootId); 107 | 108 | while ( !q.isEmpty() ) { 109 | 110 | Long id = q.poll(); 111 | users.add(tweetIdVsUserId.get(id)); 112 | 113 | if( adjacencyList.containsKey(id)) { 114 | for (Long childId : adjacencyList.get(id)) { 115 | q.add(childId); 116 | } 117 | } 118 | } 119 | 120 | 121 | if( !numberOfDistinctUserVsFrequency.containsKey(users.size()) ) 122 | numberOfDistinctUserVsFrequency.put(users.size(), 1); 123 | else { 124 | int freq = numberOfDistinctUserVsFrequency.get(users.size()); 125 | numberOfDistinctUserVsFrequency.put(users.size(), freq + 1); 126 | } 127 | 128 | } 129 | 130 | for( Integer numUsers: numberOfDistinctUserVsFrequency.keySet() ) { 131 | numUsersVsFreqFileWriter.write(numUsers + "-" + numberOfDistinctUserVsFrequency.get(numUsers) + '\n'); 132 | } 133 | 134 | numUsersVsFreqFileWriter.close(); 135 | 136 | 137 | } 138 | 139 | public void generateConversationLengthVsFreq() throws IOException{ 140 | 141 | conversationLengthVsFreqFileWriter = new BufferedWriter(new FileWriter(convLengthVsFreqFile)); 142 | 143 | TreeMap conversationLengthVsFrequency = new TreeMap(); 144 | 145 | for( Long rootId : roots ) { 146 | 147 | //int conversationLength = adjacencyList.get(rootId).size(); 148 | int conversationLength = bfs(rootId); 149 | 150 | if( 
!conversationLengthVsFrequency.containsKey(conversationLength) ) 151 | conversationLengthVsFrequency.put(conversationLength, 0); 152 | 153 | int freq = conversationLengthVsFrequency.get(conversationLength); 154 | conversationLengthVsFrequency.put(conversationLength, freq+1); 155 | } 156 | 157 | for( Integer conversationLength: conversationLengthVsFrequency.keySet() ) { 158 | 159 | conversationLengthVsFreqFileWriter.write(conversationLength + "-" + conversationLengthVsFrequency.get(conversationLength) + '\n'); 160 | } 161 | 162 | conversationLengthVsFreqFileWriter.close(); 163 | 164 | } 165 | 166 | private void validate() throws IOException, NullPointerException { 167 | 168 | if(IndexConfig.statLoc == null ) 169 | throw new NullPointerException(); 170 | 171 | statFolder = new File(IndexConfig.statLoc); 172 | 173 | if( statFolder.isDirectory() ) { 174 | System.err.println("Directory at STAT_LOC already exists, please delete it and run again."); 175 | throw new IOException(); 176 | } 177 | 178 | if( statFolder.isFile() ) { 179 | System.err.println("File found at STAT_LOC while expecting nothing, please delete it and run again."); 180 | throw new IOException(); 181 | } 182 | 183 | boolean isDirectoryCreated = statFolder.mkdir(); 184 | 185 | if( isDirectoryCreated == false ) { 186 | System.err.println("Something went wrong with directory creation; please check STAT_LOC once again."); 187 | throw new IOException(); 188 | } 189 | 190 | IndexConfig.convFilePath = IndexConfig.statLoc + File.separator + "conv.txt"; 191 | convFile = new File(IndexConfig.convFilePath); 192 | 193 | IndexConfig.numUsersVsFreqFilePath = IndexConfig.statLoc + File.separator + "num_users_vs_freq.txt"; 194 | numUsersVsFreqFile = new File(IndexConfig.numUsersVsFreqFilePath); 195 | 196 | IndexConfig.convLengthVsFreqFilePath = IndexConfig.statLoc + File.separator + "conv_length_vs_freq.txt"; 197 | convLengthVsFreqFile = new File(IndexConfig.convLengthVsFreqFilePath); 198 | 199 | 
IndexConfig.minutesVsFreqFilePath = IndexConfig.statLoc + File.separator + "minutes_vs_freq.txt"; 200 | minutesVsFreqFile = new File(IndexConfig.minutesVsFreqFilePath); 201 | 202 | } 203 | 204 | 205 | int bfs( Long rootId ) { 206 | 207 | int numOfNodes = 0; 208 | 209 | Queue q = new LinkedList(); 210 | 211 | q.add(rootId); 212 | 213 | while( !q.isEmpty() ) { 214 | numOfNodes ++; 215 | 216 | Long id = q.poll(); 217 | 218 | if( adjacencyList.containsKey(id)) { 219 | for (Long childId : adjacencyList.get(id)) { 220 | q.add(childId); 221 | } 222 | } 223 | } 224 | 225 | return numOfNodes; 226 | } 227 | 228 | 229 | 230 | public void dumpTweets() throws IOException { 231 | 232 | BufferedWriter fileWriter = new BufferedWriter(new FileWriter(IndexConfig.statLoc + File.separator + "tweets.txt")); 233 | 234 | for( Long rootId : roots ) { 235 | 236 | Stack> s = new Stack>(); 237 | 238 | s.add(new Pair(rootId,0)); 239 | 240 | while ( !s.isEmpty() ) { 241 | 242 | Pair id = s.pop(); 243 | 244 | for( int i = 0 ; i <= id.getValue() ; i++ ) 245 | fileWriter.write("---"); 246 | fileWriter.write(id.getKey() + " : " + tweetMap.get(id.getKey()).getTweetText()); 247 | fileWriter.write('\n'); 248 | 249 | if( adjacencyList.containsKey(id.getKey())) { 250 | for (Long childId : adjacencyList.get(id.getKey())) { 251 | s.push(new Pair(childId,id.getValue()+1)); 252 | } 253 | } 254 | 255 | } 256 | 257 | } 258 | 259 | fileWriter.close(); 260 | 261 | } 262 | 263 | private void printDirectReplies( Long parentId ) throws IOException { 264 | 265 | convFileWriter.write(parentId.toString()); 266 | 267 | if( !adjacencyList.containsKey(parentId)) 268 | return; 269 | 270 | long numOfReplies = adjacencyList.get(parentId).size(); 271 | 272 | if(numOfReplies == 0) 273 | return; 274 | 275 | convFileWriter.write('('); 276 | 277 | for( Long childId: adjacencyList.get(parentId)) { 278 | 279 | printDirectReplies(childId); 280 | 281 | if( numOfReplies > 1 ) 282 | convFileWriter.write(','); 283 | 284 | 
numOfReplies--; 285 | } 286 | convFileWriter.write(')'); 287 | } 288 | 289 | public void printConversationTree() throws IOException { 290 | 291 | convFileWriter = new BufferedWriter(new FileWriter(convFile)); 292 | 293 | for( Long tweetId : roots ) { 294 | 295 | convFileWriter.write("("); 296 | printDirectReplies(tweetId); 297 | convFileWriter.write(")"); 298 | convFileWriter.write('\n'); 299 | } 300 | 301 | 302 | convFileWriter.close(); 303 | 304 | } 305 | 306 | } 307 | --------------------------------------------------------------------------------