├── .gitignore ├── API-agreement.pdf ├── HISTORY.md ├── README.md ├── data ├── clusters.training.microblog2011-2012.json ├── qrels.microblog2011-2012.txt ├── qrels.microblog2011.txt.gz ├── qrels.microblog2012.txt.gz ├── qrels.microblog2013.txt.gz ├── qrels.microblog2014.txt.gz ├── queries.trec2005efficiency.txt.gz ├── run.lm.xml ├── topics.microblog2011.txt ├── topics.microblog2012.txt ├── topics.microblog2013.txt └── topics.microblog2014.txt ├── etc ├── trec_eval.9.0.tar.gz └── ttg_eval.py ├── twitter-tools-core ├── .settings │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.jdt.ui.prefs ├── pom.xml └── src │ ├── attic │ └── java │ │ └── cc │ │ └── twittertools │ │ ├── corpus │ │ └── data │ │ │ ├── TSVStatusBlockReader.java │ │ │ └── TSVStatusCorpusReader.java │ │ └── download │ │ ├── AsyncJsonStatusBlockCrawler.java │ │ └── VerifyJsonStatusBlockCrawl.java │ ├── main │ ├── java │ │ ├── cc │ │ │ └── twittertools │ │ │ │ ├── corpus │ │ │ │ ├── data │ │ │ │ │ ├── HTMLStatusExtractor.java │ │ │ │ │ ├── JsonStatusBlockReader.java │ │ │ │ │ ├── JsonStatusCorpusReader.java │ │ │ │ │ ├── Status.java │ │ │ │ │ └── StatusStream.java │ │ │ │ └── demo │ │ │ │ │ └── ReadStatuses.java │ │ │ │ ├── download │ │ │ │ ├── AsyncEmbeddedJsonStatusBlockCrawler.java │ │ │ │ └── AsyncHTMLStatusBlockCrawler.java │ │ │ │ ├── index │ │ │ │ ├── ExtractTermStatisticsFromIndex.java │ │ │ │ ├── ExtractTweetidsFromCollection.java │ │ │ │ ├── ExtractTweetidsFromIndex.java │ │ │ │ ├── IndexStatuses.java │ │ │ │ ├── LowerCaseEntityPreservingFilter.java │ │ │ │ └── TweetAnalyzer.java │ │ │ │ ├── search │ │ │ │ ├── TrecTopic.java │ │ │ │ ├── TrecTopicSet.java │ │ │ │ ├── api │ │ │ │ │ ├── RunQueriesBaselineThrift.java │ │ │ │ │ ├── RunQueriesThrift.java │ │ │ │ │ ├── SearchStatusesThrift.java │ │ │ │ │ ├── TResultComparable.java │ │ │ │ │ ├── TrecSearchHandler.java │ │ │ │ │ ├── TrecSearchThriftClient.java │ │ │ │ │ ├── TrecSearchThriftLoadGenerator.java │ │ │ │ │ └── TrecSearchThriftServer.java │ │ 
│ │ └── local │ │ │ │ │ ├── RunQueries.java │ │ │ │ │ └── SearchStatuses.java │ │ │ │ ├── stream │ │ │ │ └── GatherStatusStream.java │ │ │ │ ├── thrift │ │ │ │ └── gen │ │ │ │ │ ├── TQuery.java │ │ │ │ │ ├── TResult.java │ │ │ │ │ ├── TrecSearch.java │ │ │ │ │ └── TrecSearchException.java │ │ │ │ └── util │ │ │ │ ├── ExtractSubcollection.java │ │ │ │ └── VerifySubcollection.java │ │ └── log4j.properties │ ├── perl │ │ ├── extract_deletes.pl │ │ └── join_deletes_with_collection.pl │ ├── python │ │ ├── Search │ │ │ ├── TrecSearch-remote │ │ │ ├── TrecSearch.py │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ └── ttypes.py │ │ ├── TrecSearchThriftClientCli.py │ │ └── twittertools │ │ │ └── stream │ │ │ └── gather_status_stream.py │ ├── resources │ │ └── log4j.properties │ └── thrift │ │ ├── gen-py │ │ ├── __init__.py │ │ └── twittertools │ │ │ ├── TrecSearch-remote │ │ │ ├── TrecSearch.py │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ └── ttypes.py │ │ └── twittertools.thrift │ └── test │ └── java │ └── cc │ └── twittertools │ ├── download │ └── FetchStatusTest.java │ ├── index │ └── TokenizationTest.java │ └── search │ └── TrecTopicSetTest.java ├── twitter-tools-hadoop ├── .settings │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.jdt.ui.prefs ├── README.md ├── pom.xml ├── src │ └── main │ │ └── java │ │ └── cc │ │ └── twittertools │ │ ├── hadoop │ │ └── Example.java │ │ ├── hbase │ │ ├── LoadWordCount.java │ │ └── WordCountDAO.java │ │ ├── piggybank │ │ ├── ConvertCreatedAtToEpoch.java │ │ ├── GetLatitude.java │ │ ├── GetLongitude.java │ │ └── IsMap.java │ │ └── udf │ │ ├── GetDate.java │ │ ├── GetInterval.java │ │ └── LuceneTokenizer.java └── wordcountbytime.pig ├── twitter-tools-rm3 ├── README.md ├── build.sh ├── config │ └── run_params_sample.json ├── data │ ├── qrels.microblog │ ├── stoplist.twitter │ ├── topics.microblog2011.json │ ├── topics.microblog2012.json │ └── topics.microblog2013.json ├── pom.xml └── src │ └── main │ ├── java │ └── edu │ │ └── 
illinois │ │ └── lis │ │ ├── document │ │ └── FeatureVector.java │ │ ├── feedback │ │ ├── FeedbackModel.java │ │ └── FeedbackRelevanceModel.java │ │ ├── query │ │ ├── GQueries.java │ │ ├── GQueriesJsonImpl.java │ │ ├── GQuery.java │ │ ├── TrecTemporalTopic.java │ │ └── TrecTemporalTopicSet.java │ │ ├── rerank │ │ ├── SearchReranker.java │ │ └── TResultComparator.java │ │ ├── search │ │ └── RunQueries.java │ │ ├── searchsource │ │ └── IndexWrapperMicroblogApi.java │ │ └── utils │ │ ├── ExtractGqueriesFromTrecFormat.java │ │ ├── KeyValuePair.java │ │ ├── ListUtils.java │ │ ├── LuceneQuery.java │ │ ├── ParameterBroker.java │ │ ├── Qrels.java │ │ ├── Scorable.java │ │ ├── ScorableComparator.java │ │ └── Stopper.java │ └── resources │ └── log4j.properties └── twitter-tools-ttgbaseline ├── README.md ├── config └── run_params.json ├── pom.xml ├── src └── edu │ └── gslis │ └── ttg │ ├── clusters │ ├── Cluster.java │ ├── Clusters.java │ └── clusterers │ │ └── SimpleJaccardClusterer.java │ ├── jaccard │ └── JaccardStore.java │ ├── main │ └── RunTTGBaseline.java │ └── searchers │ └── SimpleSearcher.java └── topics ├── topics.microblog-2011.json ├── topics.microblog-2012.json ├── topics.microblog-2013.json └── topics.ttg-training.json /.gitignore: -------------------------------------------------------------------------------- 1 | twitter-tools-core/.classpath 2 | twitter-tools-core/.project 3 | twitter-tools-core/target/ 4 | twitter-tools-rm3/.classpath 5 | twitter-tools-rm3/.project 6 | twitter-tools-rm3/target/ 7 | twitter-tools-ttgbaseline/.classpath 8 | twitter-tools-ttgbaseline/.project 9 | twitter-tools-ttgbaseline/.settings/ 10 | twitter-tools-ttgbaseline/target/ 11 | twitter-tools-ttgbaseline/output.txt 12 | twitter-tools-hadoop/.classpath 13 | twitter-tools-hadoop/.project 14 | twitter-tools-hadoop/target/ 15 | etc/run.sh 16 | etc/trec_eval.9.0/ 17 | etc/trec_eval 18 | data/qrels.microblog2011.txt 19 | data/qrels.microblog2012.txt 20 | data/qrels.microblog2013.txt 21 
| data/qrels.microblog2014.txt 22 | data/queries.trec2005efficiency.txt 23 | *~ 24 | .DS_Store 25 | *.pyc 26 | -------------------------------------------------------------------------------- /API-agreement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/API-agreement.pdf -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Version 1.4.3 2 | ============= 3 | December 26, 2014 4 | 5 | + API serving the Tweets2013 collection for TREC 2014, includes minor code fixes during TREC evaluations that have been merged back to master 6 | 7 | Version 1.4.2 8 | ============= 9 | March 15, 2014 10 | 11 | + Added code to generate Thrift baseline runs 12 | + Added code to extract subcollection and term statistics 13 | + Added topics and qrels for TREC 2013 14 | 15 | Version 1.4.1 16 | ============= 17 | July 7, 2013 18 | 19 | + Cleaned up dependencies and eliminated direct dependency on Solr 20 | + Fixed unnecessary string -> int/long parsing in retrieval 21 | 22 | Version 1.4.0 23 | ============= 24 | July 3, 2013 25 | 26 | + Switched over from Ant to Maven for build management, with artifactId `twitter-tools-core` 27 | 28 | Version 1.3.0 29 | ============= 30 | June 12, 2013 31 | 32 | + Package refactoring/renaming and code cleanup 33 | + Upgraded to Lucene 4.3 34 | + Added initial Python client 35 | + Installed Tweet-specific Lucene analyzer 36 | + Added simple Perl scripts for processing deletes 37 | 38 | Version 1.2.0 39 | ============= 40 | June 6, 2013 41 | 42 | + Initial release of the API for TREC 2013 43 | 44 | Version 1.1.1 45 | ============= 46 | January 28, 2013 47 | 48 | + Noted that `AsyncEmbeddedJsonStatusBlockCrawler` is currently broken 49 | 50 | Version 1.1.0 51 | ============= 52 | January 23, 2013 53 | 
54 | + Added crawler for Twitter public stream 55 | 56 | Version 1.0.0 57 | ============= 58 | January 15, 2013 59 | 60 | + Cleaned up code 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Twitter Tools 2 | ============= 3 | 4 | This repo holds a collection of tools for the TREC Microblog tracks, which officially ended in 2015. The track mailing list can be found at [trec-microblog@googlegroups.com](http://groups.google.com/group/trec-microblog). 5 | 6 | Archival Documents 7 | ------------------ 8 | 9 | + [TREC 2013 API Specifications](https://github.com/lintool/twitter-tools/wiki/TREC-2013-API-Specifications) 10 | + [TREC 2013 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2013-Track-Guidelines) 11 | + [TREC 2014 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2014-Track-Guidelines) 12 | + [TREC 2015 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2015-Track-Guidelines) 13 | 14 | API Access 15 | ---------- 16 | 17 | The Microblog tracks in 2013 and 2014 used the "evaluation as a service" (EaaS) model, where teams interact with the official corpus via a common API. Although the evaluation has ended, the API is still available for researcher use. 18 | 19 | To request access to the API, follow these steps: 20 | 21 | 1. Fill out the [API usage agreement](http://lintool.github.io/twitter-tools/API-agreement.pdf). 2. Email the usage agreement to `microblog-request@nist.gov`. 3. After NIST receives your request, you will receive an access token from NIST. 4. The code for accessing the API can be found in this repository. The endpoint of API itself (i.e., hostname, port) will be provided by NIST. 25 | 26 | Getting Started 27 | -------------- 28 | 29 | The main Maven artifact for the TREC Microblog API is `twitter-tools-core`. 
The latest releases of Maven artifacts are available at [Maven Central](http://search.maven.org/#search%7Cga%7C1%7Ccc.twittertools). 30 | 31 | You can clone the repo with the following command: 32 | 33 | ``` 34 | $ git clone git://github.com/lintool/twitter-tools.git 35 | ``` 36 | 37 | Once you've cloned the repository, change directory into `twitter-tools-core` and build the package with Maven: 38 | 39 | ``` 40 | $ cd twitter-tools-core 41 | $ mvn clean package appassembler:assemble 42 | ``` 43 | 44 | For more information, see the [project wiki](https://github.com/lintool/twitter-tools/wiki). 45 | 46 | Replicating TREC Baselines 47 | -------------------------- 48 | 49 | One advantage of the TREC Microblog API is that it is possible to deploy a community baseline whose results are replicable by *anyone*. The `raw` results are simply the output of the API unmodified. The `baseline` results are the `raw` results that have been post-processed to remove retweets and break score ties by reverse chronological order (earliest first). 
50 | 51 | To run the `raw` results for TREC 2011, issue the following command: 52 | 53 | ``` 54 | sh target/appassembler/bin/RunQueriesThrift \ 55 | -host [host] -port [port] -group [group] -token [token] \ 56 | -queries ../data/topics.microblog2011.txt > run.microblog2011.raw.txt 57 | ``` 58 | 59 | And to run the `baseline` results for TREC 2011, issue the following command: 60 | 61 | ``` 62 | sh target/appassembler/bin/RunQueriesBaselineThrift \ 63 | -host [host] -port [port] -group [group] -token [token] \ 64 | -queries ../data/topics.microblog2011.txt > run.microblog2011.baseline.txt 65 | ``` 66 | 67 | Note that `trec_eval` is included in `twitter-tools/etc` (just needs to be compiled), and the qrels are stored in `twitter-tools/data` (just needs to be uncompressed), so you can evaluate as follows: 68 | 69 | ``` 70 | ../etc/trec_eval.9.0/trec_eval ../data/qrels.microblog2011.txt run.microblog2011.raw.txt 71 | ``` 72 | 73 | Similar commands will allow you to replicate runs for TREC 2012 and TREC 2013. With `trec_eval`, you should get *exactly* the following results: 74 | 75 | MAP | raw | baseline 76 | ----------|--------|--------- 77 | TREC 2011 | 0.3050 | 0.3576 78 | TREC 2012 | 0.1751 | 0.2091 79 | TREC 2013 | 0.2044 | 0.2532 80 | TREC 2014 | 0.3090 | 0.3924 81 | 82 | P30 | raw | baseline 83 | ----------|--------|--------- 84 | TREC 2011 | 0.3483 | 0.4000 85 | TREC 2012 | 0.2831 | 0.3311 86 | TREC 2013 | 0.3761 | 0.4450 87 | TREC 2014 | 0.5145 | 0.6182 88 | 89 | 90 | License 91 | ------- 92 | 93 | Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 94 | 95 | 96 | Acknowledgments 97 | --------------- 98 | 99 | This work is supported in part by the National Science Foundation under award [IIS-1218043](http://www.nsf.gov/awardsearch/showAward?AWD_ID=1218043). 
Any opinions, findings, and conclusions or recommendations expressed are those of the researchers and do not necessarily reflect the views of the National Science Foundation. 100 | -------------------------------------------------------------------------------- /data/qrels.microblog2011.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2011.txt.gz -------------------------------------------------------------------------------- /data/qrels.microblog2012.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2012.txt.gz -------------------------------------------------------------------------------- /data/qrels.microblog2013.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2013.txt.gz -------------------------------------------------------------------------------- /data/qrels.microblog2014.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2014.txt.gz -------------------------------------------------------------------------------- /data/queries.trec2005efficiency.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/queries.trec2005efficiency.txt.gz -------------------------------------------------------------------------------- /data/run.lm.xml: -------------------------------------------------------------------------------- 
1 | 2 | tweets2011-index 3 | true 4 | 1000 5 | lm 6 | 7 | -------------------------------------------------------------------------------- /etc/trec_eval.9.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/etc/trec_eval.9.0.tar.gz -------------------------------------------------------------------------------- /etc/ttg_eval.py: -------------------------------------------------------------------------------- 1 | #This file is to take run file (as an input argument) and ground truth non-redundant tweets 2 | #to compute the unweighted precision, recall and weighted precision per topic. 3 | import json 4 | from sets import Set 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='Tweet Timeline Generation (TTG) evaluation script (version 1.0)') 8 | parser.add_argument('-q', required=True, metavar='qrels', help='qrels file') 9 | parser.add_argument('-c', required=True, metavar='clusters', help='cluster anotations') 10 | parser.add_argument('-r', required=True, metavar='run', help='run file') 11 | 12 | args = parser.parse_args() 13 | file_qrels_path = vars(args)['q'] 14 | clusters_path = vars(args)['c'] 15 | run_path = vars(args)['r'] 16 | 17 | #Take qrels to generate dictionary of {topic number:{tweetid:weight}} 18 | #where weight is 0(non-relevant), 1(relevant), 2(highly relevant) 19 | qrels_dt = {} 20 | file_qrels = open(file_qrels_path, "r") 21 | lines = file_qrels.readlines() 22 | for line in lines: 23 | line = line.strip().split() 24 | topic_ind = line[0] 25 | if topic_ind not in qrels_dt: 26 | qrels_dt[topic_ind] = {} 27 | qrels_dt[topic_ind][line[2]] = line[3] 28 | 29 | #Take run file and generate dictionary of {topic number:Set of tweetids for that topic} 30 | runlength = len(run_path) - run_path.index("/") - 1 31 | clusters_run_dt = {} 32 | file_run = open(run_path, "r") 33 | lines = file_run.readlines() 
34 | for line in lines: 35 | line = line.strip().split() 36 | topic_ind = line[0][line[0].index("MB") + 2:] 37 | if topic_ind not in clusters_run_dt: 38 | clusters_run_dt[topic_ind] = Set() 39 | clusters_run_dt[topic_ind].add(line[2]) 40 | 41 | #Take ground truth, generate dictionary of {topic number:2D array of clusters of tweetids}, for each topic, 42 | #compare tweet from each cluster with that from run file and compute unweighted precision, recall and weighted recall. 43 | clusters_dt = {} 44 | precision_total = 0 45 | unweighted_recall_total = 0 46 | weighted_recall_total = 0 47 | file_clusters = open(clusters_path, "r") 48 | data = json.load(file_clusters) 49 | topics = data["topics"] 50 | print "runtag".ljust(runlength) + "\ttopic\tunweighted_recall weighted_recall precision" 51 | for topic in sorted(topics.keys()): 52 | total_weight = 0 53 | credits = 0 54 | hit_num = 0 55 | topic_ind = topic[line[0].index("MB") + 2:] 56 | topic_ind = topic_ind.encode("utf-8") 57 | clusters_json = topics[topic]["clusters"] 58 | for i in range(len(clusters_json)): 59 | clusters_json[i] = [s.encode("utf-8") for s in clusters_json[i]] 60 | clusters_dt[topic_ind] = clusters_json 61 | for cluster in clusters_dt[topic_ind]: 62 | weight = 0 63 | hit_flag = 0 64 | for tweet in cluster: 65 | weight = weight + int(qrels_dt[topic_ind][tweet]) 66 | if tweet in clusters_run_dt[topic_ind]: 67 | hit_flag = 1 68 | total_weight = total_weight + weight 69 | if hit_flag == 1: 70 | credits = credits + weight 71 | hit_num = hit_num + 1 72 | hit_flag = 0 73 | precision = float(hit_num) / len(clusters_run_dt[topic_ind]) 74 | unweighted_recall = float(hit_num) / len(clusters_dt[topic_ind]) 75 | weighted_recall = float(credits) / total_weight 76 | precision_total = precision_total + precision 77 | unweighted_recall_total = unweighted_recall_total + unweighted_recall 78 | weighted_recall_total = weighted_recall_total + weighted_recall 79 | print run_path[run_path.rindex("/") + 
1:].ljust(max(runlength, 6)) + "\t" + "MB" + str(topic_ind) + "\t" + "%12.4f" % unweighted_recall + "\t" + "%12.4f" % weighted_recall + "\t" + "%10.4f" % precision 80 | precision_mean = precision_total / len(clusters_dt) 81 | unweighted_recall_mean = unweighted_recall_total / len(clusters_dt) 82 | weighted_recall_mean = weighted_recall_total / len(clusters_dt) 83 | print run_path[run_path.rindex("/") + 1:].ljust(max(runlength, 6)) + "\t" + "all".ljust(5) + "\t" + "%12.4f" % unweighted_recall_mean + "\t" + "%12.4f" % weighted_recall_mean + "\t" + "%10.4f" % precision_mean 84 | file_run.close() 85 | file_clusters.close() 86 | -------------------------------------------------------------------------------- /twitter-tools-core/.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_twitter-tools 3 | formatter_settings_version=12 4 | org.eclipse.jdt.ui.exception.name=e 5 | org.eclipse.jdt.ui.gettersetter.use.is=true 6 | org.eclipse.jdt.ui.keywordthis=false 7 | org.eclipse.jdt.ui.overrideannotation=true 8 | -------------------------------------------------------------------------------- /twitter-tools-core/src/attic/java/cc/twittertools/corpus/data/TSVStatusBlockReader.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.corpus.data; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.zip.GZIPInputStream; 9 | 10 | 11 | /** 12 | * Abstraction for an stream of statuses, backed by an underlying gzipped file with JSON-encoded 13 | * tweets, one per line. 
14 | */ 15 | public class TSVStatusBlockReader implements StatusStream { 16 | private final BufferedReader br; 17 | 18 | public TSVStatusBlockReader(File file) throws IOException { 19 | 20 | if (!file.getName().endsWith(".gz")) { 21 | throw new IOException("Expecting .gz compressed file!"); 22 | } 23 | 24 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF-8")); 25 | } 26 | 27 | /** 28 | * Returns the next status, or null if no more statuses. 29 | */ 30 | public Status next() throws IOException { 31 | Status nxt = null; 32 | String raw = null; 33 | 34 | while (nxt == null) { 35 | raw = br.readLine(); 36 | 37 | // Check to see if we've reached end of file. 38 | if ( raw == null) { 39 | return null; 40 | } 41 | 42 | nxt = Status.fromTSV(raw); 43 | } 44 | return nxt; 45 | } 46 | 47 | public void close() throws IOException { 48 | br.close(); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /twitter-tools-core/src/attic/java/cc/twittertools/corpus/data/TSVStatusCorpusReader.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.corpus.data; 2 | 3 | import java.io.File; 4 | import java.io.FileFilter; 5 | import java.io.IOException; 6 | 7 | 8 | /** 9 | * Abstraction for a corpus of statuses. A corpus is assumed to consist of a number of blocks, each 10 | * represented by a gzipped file within a root directory. This object will allow to caller to read 11 | * through all blocks, in sorted lexicographic order of the files. 
12 | */ 13 | public class TSVStatusCorpusReader implements StatusStream { 14 | private final File[] files; 15 | private int nextFile = 0; 16 | private TSVStatusBlockReader currentBlock = null; 17 | 18 | public TSVStatusCorpusReader(File file) throws IOException { 19 | 20 | if (!file.isDirectory()) { 21 | throw new IOException("Expecting " + file + " to be a directory!"); 22 | } 23 | 24 | files = file.listFiles(new FileFilter() { 25 | public boolean accept(File path) { 26 | return path.getName().endsWith(".gz") ? true : false; 27 | } 28 | }); 29 | 30 | if (files.length == 0) { 31 | throw new IOException(file + " does not contain any .gz files!"); 32 | } 33 | } 34 | 35 | /** 36 | * Returns the next status, or null if no more statuses. 37 | */ 38 | public Status next() throws IOException { 39 | if (currentBlock == null) { 40 | currentBlock = new TSVStatusBlockReader(files[nextFile]); 41 | nextFile++; 42 | } 43 | 44 | Status status = null; 45 | while (true) { 46 | status = currentBlock.next(); 47 | if (status != null) { 48 | return status; 49 | } 50 | 51 | if (nextFile >= files.length) { 52 | // We're out of files to read. Must be the end of the corpus. 53 | return null; 54 | } 55 | 56 | currentBlock.close(); 57 | // Move to next file. 
58 | currentBlock = new TSVStatusBlockReader(files[nextFile]); 59 | nextFile++; 60 | } 61 | } 62 | 63 | public void close() throws IOException { 64 | currentBlock.close(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.corpus.data; 2 | 3 | import java.util.HashMap; 4 | import java.util.LinkedHashMap; 5 | import java.util.Map; 6 | import java.io.BufferedReader; 7 | import java.io.InputStreamReader; 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.net.URL; 11 | import java.net.URLDecoder; 12 | import java.text.SimpleDateFormat; 13 | import java.util.Date; 14 | import java.util.TimeZone; 15 | 16 | import org.jsoup.Jsoup; 17 | import org.jsoup.nodes.Element; 18 | import org.jsoup.nodes.Document; 19 | import org.jsoup.select.Elements; 20 | 21 | import com.google.gson.Gson; 22 | import com.google.gson.GsonBuilder; 23 | import com.google.gson.JsonObject; 24 | 25 | import org.apache.commons.cli.CommandLine; 26 | import org.apache.commons.cli.CommandLineParser; 27 | import org.apache.commons.cli.GnuParser; 28 | import org.apache.commons.cli.HelpFormatter; 29 | import org.apache.commons.cli.OptionBuilder; 30 | import org.apache.commons.cli.Options; 31 | import org.apache.commons.cli.ParseException; 32 | 33 | public class HTMLStatusExtractor { 34 | 35 | public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); 36 | 37 | public HTMLStatusExtractor() { 38 | date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); 39 | } 40 | 41 | public static Map splitQuery(URL url) 42 | throws java.io.UnsupportedEncodingException { 43 | Map query_pairs = new LinkedHashMap(); 44 | String query = url.getQuery(); 45 | String[] pairs = query.split("&"); 46 | for (String pair : pairs) { 47 | int 
idx = pair.indexOf("="); 48 | query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"), 49 | URLDecoder.decode(pair.substring(idx + 1), "UTF-8")); 50 | } 51 | return query_pairs; 52 | } 53 | 54 | public JsonObject extractTweet(String html) 55 | throws java.net.MalformedURLException, java.io.UnsupportedEncodingException { 56 | JsonObject status = new JsonObject(); 57 | 58 | Document doc = Jsoup.parse(html); 59 | Element tweet_div = doc.select("div.permalink-tweet").first(); 60 | 61 | String tweet_text = tweet_div.select("p.tweet-text").first().text(); 62 | status.addProperty("text", tweet_text); 63 | 64 | String tweet_id = tweet_div.attr("data-tweet-id"); 65 | status.addProperty("id_str", tweet_id); 66 | status.addProperty("id", Long.parseLong(tweet_id)); 67 | 68 | String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time"); 69 | Date created_at = new Date(); 70 | created_at.setTime(Long.parseLong(timestamp) * 1000); 71 | status.addProperty("created_at", date_fmt.format(created_at)); 72 | 73 | Elements js_stats_retweets = doc.select("li.js-stat-retweets"); 74 | if (!js_stats_retweets.isEmpty()) { 75 | status.addProperty("retweeted", true); 76 | String count = js_stats_retweets.select("strong").first().text(); 77 | status.addProperty("retweet_count", Long.parseLong(count)); 78 | } else { 79 | status.addProperty("retweeted", false); 80 | status.addProperty("retweet_count", 0); 81 | } 82 | Elements js_stats_favs = doc.select("li.js-stat-favorites"); 83 | status.addProperty("favorited", !js_stats_favs.isEmpty()); 84 | 85 | 86 | // User subfield 87 | JsonObject user = new JsonObject(); 88 | String user_id = tweet_div.attr("data-user-id"); 89 | user.addProperty("id_str", user_id); 90 | user.addProperty("id", Long.parseLong(user_id)); 91 | String screen_name = tweet_div.attr("data-screen-name"); 92 | user.addProperty("screen_name", screen_name); 93 | String user_name = tweet_div.attr("data-name"); 94 | user.addProperty("name", user_name); 
95 | 96 | status.add("user", user); 97 | 98 | // Geo information 99 | Elements tweet_loc = doc.select("a.tweet-geo-text"); 100 | if (!tweet_loc.isEmpty()) { 101 | JsonObject location = new JsonObject(); 102 | Element loc = tweet_loc.first(); 103 | // Adding http to avoid malformed URL exception 104 | URL url = new URL("http:" + loc.attr("href")); 105 | Map query_params = HTMLStatusExtractor.splitQuery(url); 106 | // Loop over possible query parameters 107 | // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html 108 | String lat_and_long = null; 109 | if ((lat_and_long = query_params.get("ll")) != null 110 | || (lat_and_long = query_params.get("sll")) != null 111 | || (lat_and_long = query_params.get("cbll")) != null 112 | || (lat_and_long = query_params.get("q")) != null) { 113 | String[] coordinates = lat_and_long.split(","); 114 | double latitude = Double.parseDouble(coordinates[0]); 115 | double longitude = Double.parseDouble(coordinates[1]); 116 | location.addProperty("latitude", latitude); 117 | location.addProperty("longitude", longitude); 118 | } 119 | location.addProperty("location_text", loc.text()); 120 | status.add("location", location); 121 | } 122 | 123 | return status; 124 | } 125 | 126 | private static final String HTML_OPTION = "html"; 127 | 128 | @SuppressWarnings("static-access") 129 | public static void main(String[] args) throws Exception { 130 | Options options = new Options(); 131 | options.addOption(OptionBuilder.withArgName("path").hasArg() 132 | .withDescription("HTML file from twitter.com").create(HTML_OPTION)); 133 | 134 | CommandLine cmdline = null; 135 | CommandLineParser parser = new GnuParser(); 136 | try { 137 | cmdline = parser.parse(options, args); 138 | } catch (ParseException exp) { 139 | System.err.println("Error parsing command line: " + exp.getMessage()); 140 | System.exit(-1); 141 | } 142 | 143 | if (!cmdline.hasOption(HTML_OPTION)) { 144 | HelpFormatter formatter = new HelpFormatter(); 145 | 
formatter.printHelp(HTMLStatusExtractor.class.getName(), options); 146 | System.exit(-1); 147 | } 148 | 149 | String html_filename = cmdline.getOptionValue(HTML_OPTION); 150 | BufferedReader html_file = null; 151 | StringBuffer buf = new StringBuffer(); 152 | try { 153 | html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename))); 154 | String line; 155 | while ((line = html_file.readLine()) != null) { 156 | buf.append(line); 157 | buf.append('\n'); 158 | } 159 | } catch (IOException e) { 160 | e.printStackTrace(); 161 | } finally { 162 | html_file.close(); 163 | } 164 | 165 | HTMLStatusExtractor hse = new HTMLStatusExtractor(); 166 | JsonObject json = hse.extractTweet(buf.toString()); 167 | Gson gson = new GsonBuilder().setPrettyPrinting().create(); 168 | System.out.println(gson.toJson(json)); 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/JsonStatusBlockReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.corpus.data; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.File; 21 | import java.io.FileInputStream; 22 | import java.io.IOException; 23 | import java.io.InputStreamReader; 24 | import java.util.zip.GZIPInputStream; 25 | 26 | import com.google.common.base.Preconditions; 27 | 28 | /** 29 | * Abstraction for an stream of statuses, backed by an underlying gzipped file with JSON-encoded 30 | * tweets, one per line. 31 | */ 32 | public class JsonStatusBlockReader implements StatusStream { 33 | private final BufferedReader br; 34 | 35 | public JsonStatusBlockReader(File file) throws IOException { 36 | Preconditions.checkNotNull(file); 37 | 38 | if (!file.getName().endsWith(".gz")) { 39 | throw new IOException("Expecting .gz compressed file!"); 40 | } 41 | 42 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF-8")); 43 | } 44 | 45 | /** 46 | * Returns the next status, or null if no more statuses. 47 | */ 48 | public Status next() throws IOException { 49 | Status nxt = null; 50 | String raw = null; 51 | 52 | while (nxt == null) { 53 | raw = br.readLine(); 54 | 55 | // Check to see if we've reached end of file. 56 | if (raw == null) { 57 | return null; 58 | } 59 | 60 | nxt = Status.fromJson(raw); 61 | } 62 | return nxt; 63 | } 64 | 65 | public void close() throws IOException { 66 | br.close(); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/JsonStatusCorpusReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.corpus.data; 18 | 19 | import java.io.File; 20 | import java.io.FileFilter; 21 | import java.io.IOException; 22 | 23 | import com.google.common.base.Preconditions; 24 | 25 | /** 26 | * Abstraction for a corpus of statuses. A corpus is assumed to consist of a number of blocks, each 27 | * represented by a gzipped file within a root directory. This object will allow to caller to read 28 | * through all blocks, in sorted lexicographic order of the files. 29 | */ 30 | public class JsonStatusCorpusReader implements StatusStream { 31 | private final File[] files; 32 | private int nextFile = 0; 33 | private JsonStatusBlockReader currentBlock = null; 34 | 35 | public JsonStatusCorpusReader(File file) throws IOException { 36 | Preconditions.checkNotNull(file); 37 | 38 | if (!file.isDirectory()) { 39 | throw new IOException("Expecting " + file + " to be a directory!"); 40 | } 41 | 42 | files = file.listFiles(new FileFilter() { 43 | public boolean accept(File path) { 44 | return path.getName().endsWith(".gz") ? true : false; 45 | } 46 | }); 47 | 48 | if (files.length == 0) { 49 | throw new IOException(file + " does not contain any .gz files!"); 50 | } 51 | } 52 | 53 | /** 54 | * Returns the next status, or null if no more statuses. 
55 | */ 56 | public Status next() throws IOException { 57 | if (currentBlock == null) { 58 | currentBlock = new JsonStatusBlockReader(files[nextFile]); 59 | nextFile++; 60 | } 61 | 62 | Status status = null; 63 | while (true) { 64 | status = currentBlock.next(); 65 | if (status != null) { 66 | return status; 67 | } 68 | 69 | if (nextFile >= files.length) { 70 | // We're out of files to read. Must be the end of the corpus. 71 | return null; 72 | } 73 | 74 | currentBlock.close(); 75 | // Move to next file. 76 | currentBlock = new JsonStatusBlockReader(files[nextFile]); 77 | nextFile++; 78 | } 79 | } 80 | 81 | public void close() throws IOException { 82 | currentBlock.close(); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/StatusStream.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.corpus.data; 18 | 19 | import java.io.IOException; 20 | 21 | /** 22 | * Abstraction for a stream of statuses. Ordering of the statuses is left to the implementation. 
23 | */ 24 | public interface StatusStream { 25 | public Status next() throws IOException; 26 | public void close() throws IOException; 27 | } 28 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/demo/ReadStatuses.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.corpus.demo; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.log4j.Logger; 30 | 31 | import cc.twittertools.corpus.data.JsonStatusBlockReader; 32 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 33 | import cc.twittertools.corpus.data.Status; 34 | import cc.twittertools.corpus.data.StatusStream; 35 | 36 | /** 37 | * Sample program to illustrate how to work with {@link StatusStream}. 
38 | */ 39 | public class ReadStatuses { 40 | private static final Logger LOG = Logger.getLogger(ReadStatuses.class); 41 | 42 | private ReadStatuses() {} 43 | 44 | private static final String INPUT_OPTION = "input"; 45 | private static final String VERBOSE_OPTION = "verbose"; 46 | private static final String DUMP_OPTION = "dump"; 47 | 48 | @SuppressWarnings("static-access") 49 | public static void main(String[] args) throws Exception { 50 | Options options = new Options(); 51 | options.addOption(OptionBuilder.withArgName("path").hasArg() 52 | .withDescription("input directory or file").create(INPUT_OPTION)); 53 | options.addOption(VERBOSE_OPTION, false, "print logging output every 10000 tweets"); 54 | options.addOption(DUMP_OPTION, false, "dump statuses"); 55 | 56 | CommandLine cmdline = null; 57 | CommandLineParser parser = new GnuParser(); 58 | try { 59 | cmdline = parser.parse(options, args); 60 | } catch (ParseException exp) { 61 | System.err.println("Error parsing command line: " + exp.getMessage()); 62 | System.exit(-1); 63 | } 64 | 65 | if (!cmdline.hasOption(INPUT_OPTION)) { 66 | HelpFormatter formatter = new HelpFormatter(); 67 | formatter.printHelp(ReadStatuses.class.getName(), options); 68 | System.exit(-1); 69 | } 70 | 71 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 72 | 73 | StatusStream stream; 74 | // Figure out if we're reading from HTML SequenceFiles or JSON. 
75 | File file = new File(cmdline.getOptionValue(INPUT_OPTION)); 76 | if (!file.exists()) { 77 | System.err.println("Error: " + file + " does not exist!"); 78 | System.exit(-1); 79 | } 80 | 81 | if (file.isDirectory()) { 82 | stream = new JsonStatusCorpusReader(file); 83 | } else { 84 | stream = new JsonStatusBlockReader(file); 85 | } 86 | 87 | int cnt = 0; 88 | Status status; 89 | while ((status = stream.next()) != null) { 90 | if (cmdline.hasOption(DUMP_OPTION)) { 91 | String text = status.getText(); 92 | if (text != null) { 93 | text = text.replaceAll("\\s+", " "); 94 | text = text.replaceAll("\0", ""); 95 | } 96 | out.println(String.format("%d\t%s\t%s\t%s", status.getId(), status.getScreenname(), 97 | status.getCreatedAt(), text)); 98 | } 99 | cnt++; 100 | if ( cnt % 10000 == 0 && cmdline.hasOption(VERBOSE_OPTION)) { 101 | LOG.info(cnt + " statuses read"); 102 | } 103 | } 104 | stream.close(); 105 | LOG.info(String.format("Total of %s statuses read.", cnt)); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTermStatisticsFromIndex.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.index; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.lucene.index.DirectoryReader; 30 | import org.apache.lucene.index.IndexReader; 31 | import org.apache.lucene.index.SlowCompositeReaderWrapper; 32 | import org.apache.lucene.index.Terms; 33 | import org.apache.lucene.index.TermsEnum; 34 | import org.apache.lucene.store.FSDirectory; 35 | import org.apache.lucene.util.BytesRef; 36 | 37 | import cc.twittertools.index.IndexStatuses.StatusField; 38 | 39 | public class ExtractTermStatisticsFromIndex { 40 | private static final String INDEX_OPTION = "index"; 41 | private static final String MIN_OPTION = "min"; 42 | 43 | @SuppressWarnings("static-access") 44 | public static void main(String[] args) throws Exception { 45 | Options options = new Options(); 46 | 47 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 48 | .withDescription("index").create(INDEX_OPTION)); 49 | options.addOption(OptionBuilder.withArgName("num").hasArg() 50 | .withDescription("min").create(MIN_OPTION)); 51 | 52 | CommandLine cmdline = null; 53 | CommandLineParser parser = new GnuParser(); 54 | try { 55 | cmdline = parser.parse(options, args); 56 | } catch (ParseException exp) { 57 | System.err.println("Error parsing command line: " + exp.getMessage()); 58 | System.exit(-1); 59 | } 60 | 61 | if (!cmdline.hasOption(INDEX_OPTION)) { 62 | HelpFormatter formatter = new HelpFormatter(); 63 | formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options); 64 | System.exit(-1); 65 | } 66 | 67 | String indexLocation = cmdline.getOptionValue(INDEX_OPTION); 68 
| int min = cmdline.hasOption(MIN_OPTION) ? 69 | Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1; 70 | 71 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 72 | 73 | IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); 74 | Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name); 75 | TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY); 76 | 77 | long missingCnt = 0; 78 | int skippedTerms = 0; 79 | BytesRef bytes = new BytesRef(); 80 | while ( (bytes = termsEnum.next()) != null) { 81 | byte[] buf = new byte[bytes.length]; 82 | System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length); 83 | String term = new String(buf, "UTF-8"); 84 | int df = termsEnum.docFreq(); 85 | long cf = termsEnum.totalTermFreq(); 86 | 87 | if ( df < min) { 88 | skippedTerms++; 89 | missingCnt += cf; 90 | continue; 91 | } 92 | 93 | out.println(term + "\t" + df + "\t" + cf); 94 | } 95 | 96 | reader.close(); 97 | out.close(); 98 | System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTweetidsFromCollection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.index; 18 | 19 | import java.io.File; 20 | 21 | import org.apache.commons.cli.CommandLine; 22 | import org.apache.commons.cli.CommandLineParser; 23 | import org.apache.commons.cli.GnuParser; 24 | import org.apache.commons.cli.HelpFormatter; 25 | import org.apache.commons.cli.OptionBuilder; 26 | import org.apache.commons.cli.Options; 27 | import org.apache.commons.cli.ParseException; 28 | 29 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 30 | import cc.twittertools.corpus.data.Status; 31 | import cc.twittertools.corpus.data.StatusStream; 32 | 33 | public class ExtractTweetidsFromCollection { 34 | private static final String COLLECTION_OPTION = "collection"; 35 | 36 | @SuppressWarnings("static-access") 37 | public static void main(String[] args) throws Exception { 38 | Options options = new Options(); 39 | 40 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 41 | .withDescription("source collection directory").create(COLLECTION_OPTION)); 42 | 43 | CommandLine cmdline = null; 44 | CommandLineParser parser = new GnuParser(); 45 | try { 46 | cmdline = parser.parse(options, args); 47 | } catch (ParseException exp) { 48 | System.err.println("Error parsing command line: " + exp.getMessage()); 49 | System.exit(-1); 50 | } 51 | 52 | if (!cmdline.hasOption(COLLECTION_OPTION)) { 53 | HelpFormatter formatter = new HelpFormatter(); 54 | formatter.printHelp(ExtractTweetidsFromCollection.class.getName(), options); 55 | System.exit(-1); 56 | } 57 | 58 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); 59 | 60 | File file = new File(collectionPath); 61 | if (!file.exists()) { 62 | System.err.println("Error: " + file + " does not exist!"); 63 | System.exit(-1); 64 | } 65 | 66 | StatusStream stream = new JsonStatusCorpusReader(file); 67 | 68 | Status status; 69 | while ((status = stream.next()) != null) { 70 | System.out.println(status.getId() + "\t" + status.getScreenname()); 71 | } 72 | } 73 | } 74 
| -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTweetidsFromIndex.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.index; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.lucene.document.Document; 30 | import org.apache.lucene.index.DirectoryReader; 31 | import org.apache.lucene.index.IndexReader; 32 | import org.apache.lucene.store.FSDirectory; 33 | 34 | import cc.twittertools.index.IndexStatuses.StatusField; 35 | 36 | /** 37 | * Reference implementation for indexing statuses. 
38 | */ 39 | public class ExtractTweetidsFromIndex { 40 | private ExtractTweetidsFromIndex() {} 41 | 42 | private static final String INDEX_OPTION = "index"; 43 | 44 | @SuppressWarnings("static-access") 45 | public static void main(String[] args) throws Exception { 46 | Options options = new Options(); 47 | 48 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 49 | .withDescription("index location").create(INDEX_OPTION)); 50 | 51 | CommandLine cmdline = null; 52 | CommandLineParser parser = new GnuParser(); 53 | try { 54 | cmdline = parser.parse(options, args); 55 | } catch (ParseException exp) { 56 | System.err.println("Error parsing command line: " + exp.getMessage()); 57 | System.exit(-1); 58 | } 59 | 60 | if (!cmdline.hasOption(INDEX_OPTION)) { 61 | HelpFormatter formatter = new HelpFormatter(); 62 | formatter.printHelp(ExtractTweetidsFromIndex.class.getName(), options); 63 | System.exit(-1); 64 | } 65 | 66 | File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); 67 | if (!indexLocation.exists()) { 68 | System.err.println("Error: " + indexLocation + " does not exist!"); 69 | System.exit(-1); 70 | } 71 | 72 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); 73 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 74 | for (int i=0; i 0); 30 | this.time = time; 31 | } 32 | 33 | public String getId() { 34 | return id; 35 | } 36 | 37 | public String getQuery() { 38 | return query; 39 | } 40 | 41 | public long getQueryTweetTime() { 42 | return time; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/TrecTopicSet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search; 18 | 19 | import java.io.File; 20 | import java.io.IOException; 21 | import java.util.Iterator; 22 | import java.util.List; 23 | import java.util.regex.Matcher; 24 | import java.util.regex.Pattern; 25 | 26 | import com.google.common.base.Charsets; 27 | import com.google.common.base.Joiner; 28 | import com.google.common.base.Preconditions; 29 | import com.google.common.collect.Lists; 30 | import com.google.common.io.Files; 31 | 32 | public class TrecTopicSet implements Iterable{ 33 | private List queries = Lists.newArrayList(); 34 | 35 | private TrecTopicSet() {} 36 | 37 | private void add(TrecTopic q) { 38 | queries.add(q); 39 | } 40 | 41 | @Override 42 | public Iterator iterator() { 43 | return queries.iterator(); 44 | } 45 | 46 | private static final Pattern TOP_PATTERN = Pattern.compile("", Pattern.DOTALL); 47 | private static final Pattern NUM_PATTERN = Pattern.compile(" Number: (MB\\d+) ", Pattern.DOTALL); 48 | 49 | // TREC 2011 topics uses tag 50 | private static final Pattern TITLE_PATTERN = Pattern.compile("<title>\\s*(.*?)\\s*", Pattern.DOTALL); 51 | // TREC 2012 topics use tag 52 | private static final Pattern TITLE_PATTERN2 = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL); 53 | 54 | private static final Pattern TWEETTIME_PATTERN = Pattern.compile("\\s*(\\d+)\\s*", Pattern.DOTALL); 55 | 56 | public static TrecTopicSet fromFile(File f) throws IOException { 57 | Preconditions.checkNotNull(f); 58 | Preconditions.checkArgument(f.exists()); 59 | 60 | String s = 
Joiner.on("\n").join(Files.readLines(f, Charsets.UTF_8)); 61 | TrecTopicSet queries = new TrecTopicSet(); 62 | 63 | Matcher matcher = TOP_PATTERN.matcher(s); 64 | while (matcher.find()) { 65 | String top = matcher.group(0); 66 | 67 | Matcher m = NUM_PATTERN.matcher(top); 68 | if (!m.find()) { 69 | throw new IOException("Error parsing " + f); 70 | } 71 | String id = m.group(1); 72 | // Topics from 2012 are inconsistently numbered, 73 | // e.g., MB051 should match the qrels, which has MB51 74 | if (id.matches("MB0\\d\\d")) { 75 | id = id.replace("MB0", "MB"); 76 | } 77 | 78 | m = TITLE_PATTERN.matcher(top); 79 | if (!m.find()) { 80 | m = TITLE_PATTERN2.matcher(top); 81 | if (!m.find()) { 82 | throw new IOException("Error parsing " + f); 83 | } 84 | } 85 | String text = m.group(1); 86 | 87 | m = TWEETTIME_PATTERN.matcher(top); 88 | if (!m.find()) { 89 | throw new IOException("Error parsing " + f); 90 | } 91 | long time = Long.parseLong(m.group(1)); 92 | queries.add(new TrecTopic(id, text, time)); 93 | } 94 | return queries; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/RunQueriesThrift.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | import java.util.List; 22 | import java.util.Set; 23 | import java.util.HashSet; 24 | 25 | import org.apache.commons.cli.CommandLine; 26 | import org.apache.commons.cli.CommandLineParser; 27 | import org.apache.commons.cli.GnuParser; 28 | import org.apache.commons.cli.HelpFormatter; 29 | import org.apache.commons.cli.Option; 30 | import org.apache.commons.cli.OptionBuilder; 31 | import org.apache.commons.cli.Options; 32 | import org.apache.commons.cli.ParseException; 33 | 34 | import cc.twittertools.search.TrecTopicSet; 35 | import cc.twittertools.thrift.gen.TResult; 36 | 37 | public class RunQueriesThrift { 38 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 39 | 40 | private static final String HOST_OPTION = "host"; 41 | private static final String PORT_OPTION = "port"; 42 | private static final String QUERIES_OPTION = "queries"; 43 | private static final String NUM_RESULTS_OPTION = "num_results"; 44 | private static final String GROUP_OPTION = "group"; 45 | private static final String TOKEN_OPTION = "token"; 46 | private static final String RUNTAG_OPTION = "runtag"; 47 | private static final String VERBOSE_OPTION = "verbose"; 48 | 49 | private RunQueriesThrift() {} 50 | 51 | @SuppressWarnings("static-access") 52 | public static void main(String[] args) throws Exception { 53 | Options options = new Options(); 54 | 55 | options.addOption(OptionBuilder.withArgName("string").hasArg() 56 | .withDescription("host").create(HOST_OPTION)); 57 | options.addOption(OptionBuilder.withArgName("port").hasArg() 58 | .withDescription("port").create(PORT_OPTION)); 59 | options.addOption(OptionBuilder.withArgName("file").hasArg() 60 | .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); 61 | options.addOption(OptionBuilder.withArgName("num").hasArg() 62 | .withDescription("number of results to 
return").create(NUM_RESULTS_OPTION)); 63 | options.addOption(OptionBuilder.withArgName("string").hasArg() 64 | .withDescription("group id").create(GROUP_OPTION)); 65 | options.addOption(OptionBuilder.withArgName("string").hasArg() 66 | .withDescription("access token").create(TOKEN_OPTION)); 67 | options.addOption(OptionBuilder.withArgName("string").hasArg() 68 | .withDescription("runtag").create(RUNTAG_OPTION)); 69 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); 70 | 71 | CommandLine cmdline = null; 72 | CommandLineParser parser = new GnuParser(); 73 | try { 74 | cmdline = parser.parse(options, args); 75 | } catch (ParseException exp) { 76 | System.err.println("Error parsing command line: " + exp.getMessage()); 77 | System.exit(-1); 78 | } 79 | 80 | if (!cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION) 81 | || !cmdline.hasOption(QUERIES_OPTION)) { 82 | HelpFormatter formatter = new HelpFormatter(); 83 | formatter.printHelp(RunQueriesThrift.class.getName(), options); 84 | System.exit(-1); 85 | } 86 | 87 | String queryFile = cmdline.getOptionValue(QUERIES_OPTION); 88 | if (!new File(queryFile).exists()) { 89 | System.err.println("Error: " + queryFile + " doesn't exist!"); 90 | System.exit(-1); 91 | } 92 | 93 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 94 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; 95 | 96 | TrecTopicSet topicsFile = TrecTopicSet.fromFile(new File(queryFile)); 97 | 98 | int numResults = 1000; 99 | try { 100 | if (cmdline.hasOption(NUM_RESULTS_OPTION)) { 101 | numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); 102 | } 103 | } catch (NumberFormatException e) { 104 | System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); 105 | System.exit(-1); 106 | } 107 | 108 | String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null; 109 | String token = cmdline.hasOption(TOKEN_OPTION) ? 
cmdline.getOptionValue(TOKEN_OPTION) : null; 110 | 111 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION); 112 | 113 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 114 | 115 | TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION), 116 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token); 117 | 118 | for (cc.twittertools.search.TrecTopic query : topicsFile) { 119 | List results = client.search(query.getQuery(), 120 | query.getQueryTweetTime(), numResults); 121 | int i = 1; 122 | Set tweetIds = new HashSet(); 123 | for (TResult result : results) { 124 | if (!tweetIds.contains(result.id)) { 125 | // The TREC official qrels don't have the "MB" prefix and trailing zeros, so we perform 126 | // this transformation so that trec_eval doesn't complain. 127 | String qid = query.getId().replaceFirst("^MB0*", ""); 128 | tweetIds.add(result.id); 129 | out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag)); 130 | if (verbose) { 131 | out.println("# " + result.toString().replaceAll("[\\n\\r]+", " ")); 132 | } 133 | i++; 134 | } 135 | } 136 | } 137 | out.close(); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/SearchStatusesThrift.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.PrintStream; 20 | import java.util.List; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.Option; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | 31 | import cc.twittertools.thrift.gen.TResult; 32 | 33 | public class SearchStatusesThrift { 34 | // Defaults: if user doesn't specify an actual query, run MB01 as a demo. 35 | private static final String DEFAULT_QID = "MB01"; 36 | private static final String DEFAULT_Q = "BBC World Service staff cuts"; 37 | private static final long DEFAULT_MAX_ID = 34952194402811905L; 38 | private static final int DEFAULT_NUM_RESULTS = 10; 39 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 40 | 41 | private static final String HELP_OPTION = "h"; 42 | private static final String HOST_OPTION = "host"; 43 | private static final String PORT_OPTION = "port"; 44 | private static final String QID_OPTION = "qid"; 45 | private static final String QUERY_OPTION = "q"; 46 | private static final String RUNTAG_OPTION = "runtag"; 47 | private static final String MAX_ID_OPTION = "max_id"; 48 | private static final String NUM_RESULTS_OPTION = "num_results"; 49 | private static final String GROUP_OPTION = "group"; 50 | private static final String TOKEN_OPTION = "token"; 51 | private static final String VERBOSE_OPTION = "verbose"; 52 | 53 | @SuppressWarnings("static-access") 54 | public static void main(String[] args) throws Exception { 55 | Options options = new Options(); 56 | 57 | options.addOption(new Option(HELP_OPTION, "show help")); 58 | 
options.addOption(OptionBuilder.withArgName("string").hasArg() 59 | .withDescription("host").create(HOST_OPTION)); 60 | options.addOption(OptionBuilder.withArgName("port").hasArg() 61 | .withDescription("port").create(PORT_OPTION)); 62 | options.addOption(OptionBuilder.withArgName("string").hasArg() 63 | .withDescription("query id").create(QID_OPTION)); 64 | options.addOption(OptionBuilder.withArgName("string").hasArg() 65 | .withDescription("query text").create(QUERY_OPTION)); 66 | options.addOption(OptionBuilder.withArgName("string").hasArg() 67 | .withDescription("runtag").create(RUNTAG_OPTION)); 68 | options.addOption(OptionBuilder.withArgName("num").hasArg() 69 | .withDescription("maxid").create(MAX_ID_OPTION)); 70 | options.addOption(OptionBuilder.withArgName("num").hasArg() 71 | .withDescription("number of results to return").create(NUM_RESULTS_OPTION)); 72 | options.addOption(OptionBuilder.withArgName("string").hasArg() 73 | .withDescription("group id").create(GROUP_OPTION)); 74 | options.addOption(OptionBuilder.withArgName("string").hasArg() 75 | .withDescription("access token").create(TOKEN_OPTION)); 76 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); 77 | 78 | CommandLine cmdline = null; 79 | CommandLineParser parser = new GnuParser(); 80 | try { 81 | cmdline = parser.parse(options, args); 82 | } catch (ParseException exp) { 83 | System.err.println("Error parsing command line: " + exp.getMessage()); 84 | System.exit(-1); 85 | } 86 | 87 | if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(HOST_OPTION) 88 | || !cmdline.hasOption(PORT_OPTION)) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(SearchStatusesThrift.class.getName(), options); 91 | System.exit(-1); 92 | } 93 | 94 | String qid = cmdline.hasOption(QID_OPTION) ? 95 | cmdline.getOptionValue(QID_OPTION) : DEFAULT_QID; 96 | String query = cmdline.hasOption(QUERY_OPTION) ? 
97 | cmdline.getOptionValue(QUERY_OPTION) : DEFAULT_Q; 98 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 99 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; 100 | long maxId = cmdline.hasOption(MAX_ID_OPTION) ? 101 | Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)) : DEFAULT_MAX_ID; 102 | int numResults = cmdline.hasOption(NUM_RESULTS_OPTION) ? 103 | Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)) : DEFAULT_NUM_RESULTS; 104 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION); 105 | 106 | String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null; 107 | String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null; 108 | TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION), 109 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token); 110 | 111 | System.err.println("qid: " + qid); 112 | System.err.println("q: " + query); 113 | System.err.println("max_id: " + maxId); 114 | System.err.println("num_results: " + numResults); 115 | 116 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 117 | 118 | List results = client.search(query, maxId, numResults); 119 | int i = 1; 120 | for (TResult result : results) { 121 | out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag)); 122 | if (verbose) { 123 | System.out.println("# " + result.toString().replaceAll("[\\n\\r]+", " ")); 124 | } 125 | i++; 126 | } 127 | out.close(); 128 | } 129 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TResultComparable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import cc.twittertools.thrift.gen.TResult; 20 | 21 | public class TResultComparable implements Comparable { 22 | private TResult tresult; 23 | 24 | public TResultComparable(TResult tresult) { 25 | this.tresult = tresult; 26 | } 27 | 28 | public TResult getTResult() { 29 | return tresult; 30 | } 31 | 32 | public int compareTo(TResultComparable other) { 33 | if (tresult.rsv > other.tresult.rsv) { 34 | return -1; 35 | } else if (tresult.rsv < other.tresult.rsv) { 36 | return 1; 37 | } else { 38 | if (tresult.id > other.tresult.id) { 39 | return -1; 40 | } else if (tresult.id < other.tresult.id) { 41 | return 1; 42 | } else { 43 | return 0; 44 | } 45 | } 46 | } 47 | 48 | public boolean equals(Object other) { 49 | if (other == null) { 50 | return false; 51 | } if (other.getClass() != this.getClass()) { 52 | return false; 53 | } 54 | 55 | return ((TResultComparable) other).tresult.id == this.tresult.id; 56 | } 57 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchHandler.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.File; 20 | import java.io.IOException; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | import javax.annotation.Nullable; 25 | 26 | import org.apache.log4j.Logger; 27 | import org.apache.lucene.document.Document; 28 | import org.apache.lucene.index.DirectoryReader; 29 | import org.apache.lucene.index.IndexReader; 30 | import org.apache.lucene.queryparser.classic.QueryParser; 31 | import org.apache.lucene.search.Filter; 32 | import org.apache.lucene.search.IndexSearcher; 33 | import org.apache.lucene.search.NumericRangeFilter; 34 | import org.apache.lucene.search.Query; 35 | import org.apache.lucene.search.ScoreDoc; 36 | import org.apache.lucene.search.TopDocs; 37 | import org.apache.lucene.search.similarities.LMDirichletSimilarity; 38 | import org.apache.lucene.store.FSDirectory; 39 | import org.apache.lucene.util.Version; 40 | 41 | import cc.twittertools.index.IndexStatuses; 42 | import cc.twittertools.index.IndexStatuses.StatusField; 43 | import cc.twittertools.thrift.gen.TQuery; 44 | import cc.twittertools.thrift.gen.TResult; 45 | import cc.twittertools.thrift.gen.TrecSearch; 46 | import cc.twittertools.thrift.gen.TrecSearchException; 47 | 48 | import com.google.common.base.Preconditions; 49 | import com.google.common.collect.Lists; 50 | 51 | public class TrecSearchHandler implements TrecSearch.Iface { 52 | private static final Logger LOG = Logger.getLogger(TrecSearchHandler.class); 53 | 54 | private static QueryParser 
QUERY_PARSER = 55 | new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER); 56 | 57 | private final IndexSearcher searcher; 58 | private final Map credentials; 59 | 60 | public TrecSearchHandler(File indexPath, @Nullable Map credentials) 61 | throws IOException { 62 | Preconditions.checkNotNull(indexPath); 63 | Preconditions.checkArgument(indexPath.exists()); 64 | 65 | // Can be null, in which case we don't check for credentials. 66 | this.credentials = credentials; 67 | 68 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); 69 | searcher = new IndexSearcher(reader); 70 | searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); 71 | } 72 | 73 | public List search(TQuery query) throws TrecSearchException { 74 | Preconditions.checkNotNull(query); 75 | 76 | LOG.info(String.format("Incoming request (%s, %s)", query.group, query.token)); 77 | 78 | // Verify credentials. 79 | if (credentials != null && (!credentials.containsKey(query.group) || 80 | !credentials.get(query.group).equals(query.token))) { 81 | LOG.info(String.format("Access denied for (%s, %s)", query.group, query.token)); 82 | throw new TrecSearchException("Invalid credentials: access denied."); 83 | } 84 | 85 | List results = Lists.newArrayList(); 86 | long startTime = System.currentTimeMillis(); 87 | 88 | try { 89 | Filter filter = 90 | NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, query.max_id, true, true); 91 | 92 | Query q = QUERY_PARSER.parse(query.text); 93 | int num = query.num_results > 10000 ? 
10000 : query.num_results; 94 | TopDocs rs = searcher.search(q, filter, num); 95 | for (ScoreDoc scoreDoc : rs.scoreDocs) { 96 | Document hit = searcher.doc(scoreDoc.doc); 97 | 98 | TResult p = new TResult(); 99 | p.id = (Long) hit.getField(StatusField.ID.name).numericValue(); 100 | p.screen_name = hit.get(StatusField.SCREEN_NAME.name); 101 | p.epoch = (Long) hit.getField(StatusField.EPOCH.name).numericValue(); 102 | p.text = hit.get(StatusField.TEXT.name); 103 | p.rsv = scoreDoc.score; 104 | 105 | p.followers_count = (Integer) hit.getField(StatusField.FOLLOWERS_COUNT.name).numericValue(); 106 | p.statuses_count = (Integer) hit.getField(StatusField.STATUSES_COUNT.name).numericValue(); 107 | 108 | if ( hit.get(StatusField.LANG.name) != null) { 109 | p.lang = hit.get(StatusField.LANG.name); 110 | } 111 | 112 | if ( hit.get(StatusField.IN_REPLY_TO_STATUS_ID.name) != null) { 113 | p.in_reply_to_status_id = (Long) hit.getField(StatusField.IN_REPLY_TO_STATUS_ID.name).numericValue(); 114 | } 115 | 116 | if ( hit.get(StatusField.IN_REPLY_TO_USER_ID.name) != null) { 117 | p.in_reply_to_user_id = (Long) hit.getField(StatusField.IN_REPLY_TO_USER_ID.name).numericValue(); 118 | } 119 | 120 | if ( hit.get(StatusField.RETWEETED_STATUS_ID.name) != null) { 121 | p.retweeted_status_id = (Long) hit.getField(StatusField.RETWEETED_STATUS_ID.name).numericValue(); 122 | } 123 | 124 | if ( hit.get(StatusField.RETWEETED_USER_ID.name) != null) { 125 | p.retweeted_user_id = (Long) hit.getField(StatusField.RETWEETED_USER_ID.name).numericValue(); 126 | } 127 | 128 | if ( hit.get(StatusField.RETWEET_COUNT.name) != null) { 129 | p.retweeted_count = (Integer) hit.getField(StatusField.RETWEET_COUNT.name).numericValue(); 130 | } 131 | 132 | results.add(p); 133 | } 134 | } catch (Exception e) { 135 | e.printStackTrace(); 136 | throw new TrecSearchException(e.getMessage()); 137 | } 138 | 139 | long endTime = System.currentTimeMillis(); 140 | LOG.info(String.format("%4dms %s", (endTime - startTime), 
query.toString())); 141 | 142 | return results; 143 | } 144 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchThriftClient.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.util.List; 20 | 21 | import javax.annotation.Nullable; 22 | 23 | import org.apache.thrift.TException; 24 | import org.apache.thrift.protocol.TBinaryProtocol; 25 | import org.apache.thrift.transport.TSocket; 26 | import org.apache.thrift.transport.TTransport; 27 | 28 | import cc.twittertools.thrift.gen.TQuery; 29 | import cc.twittertools.thrift.gen.TResult; 30 | import cc.twittertools.thrift.gen.TrecSearch; 31 | 32 | import com.google.common.base.Preconditions; 33 | 34 | public class TrecSearchThriftClient { 35 | private final String group; 36 | private final String token; 37 | private final String host; 38 | private final int port; 39 | 40 | public TrecSearchThriftClient(String host, int port, 41 | @Nullable String group, @Nullable String token) { 42 | Preconditions.checkNotNull(host); 43 | Preconditions.checkArgument(port > 0); 44 | this.group = group; 45 | this.token = token; 46 | this.host= host; 47 | this.port = port; 48 | } 49 | 50 | public List search(String 
query, long maxId, int numResults) throws TException { 51 | TTransport transport = new TSocket(host, port); 52 | transport.open(); 53 | 54 | TrecSearch.Client client = new TrecSearch.Client(new TBinaryProtocol(transport)); 55 | 56 | TQuery q = new TQuery(); 57 | q.text = query; 58 | q.max_id = maxId; 59 | q.num_results = numResults; 60 | 61 | q.group = group; 62 | q.token = token; 63 | 64 | List results = client.search(q); 65 | transport.close(); 66 | 67 | return results; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchThriftServer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.File; 20 | import java.util.Map; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.Option; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.thrift.protocol.TBinaryProtocol; 31 | import org.apache.thrift.server.TServer; 32 | import org.apache.thrift.server.TThreadPoolServer; 33 | import org.apache.thrift.transport.TServerSocket; 34 | 35 | import cc.twittertools.thrift.gen.TrecSearch; 36 | 37 | import com.google.common.base.Charsets; 38 | import com.google.common.collect.Maps; 39 | import com.google.common.io.Files; 40 | 41 | public class TrecSearchThriftServer { 42 | private static final int DEFAULT_PORT = 9090; 43 | private static final int DEFAULT_MAX_THREADS = 8; 44 | 45 | private static final String HELP_OPTION = "h"; 46 | private static final String INDEX_OPTION = "index"; 47 | private static final String PORT_OPTION = "port"; 48 | private static final String MAX_THREADS_OPTION = "max_threads"; 49 | private static final String CREDENTIALS_OPTION = "credentials"; 50 | 51 | @SuppressWarnings("static-access") 52 | public static void main(String[] args) throws Exception { 53 | Options options = new Options(); 54 | 55 | options.addOption(new Option(HELP_OPTION, "show help")); 56 | options.addOption(OptionBuilder.withArgName("port").hasArg() 57 | .withDescription("port").create(PORT_OPTION)); 58 | options.addOption(OptionBuilder.withArgName("index").hasArg() 59 | .withDescription("index location").create(INDEX_OPTION)); 60 | options.addOption(OptionBuilder.withArgName("num").hasArg() 61 | .withDescription("max number of threads in thread pool").create(MAX_THREADS_OPTION)); 62 | 
options.addOption(OptionBuilder.withArgName("file").hasArg() 63 | .withDescription("file containing access tokens").create(CREDENTIALS_OPTION)); 64 | 65 | CommandLine cmdline = null; 66 | CommandLineParser parser = new GnuParser(); 67 | try { 68 | cmdline = parser.parse(options, args); 69 | } catch (ParseException exp) { 70 | System.err.println("Error parsing command line: " + exp.getMessage()); 71 | System.exit(-1); 72 | } 73 | 74 | if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { 75 | HelpFormatter formatter = new HelpFormatter(); 76 | formatter.printHelp(TrecSearchThriftServer.class.getName(), options); 77 | System.exit(-1); 78 | } 79 | 80 | int port = cmdline.hasOption(PORT_OPTION) ? 81 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)) : DEFAULT_PORT; 82 | int maxThreads = cmdline.hasOption(MAX_THREADS_OPTION) ? 83 | Integer.parseInt(cmdline.getOptionValue(MAX_THREADS_OPTION)) : DEFAULT_MAX_THREADS; 84 | File index = new File(cmdline.getOptionValue(INDEX_OPTION)); 85 | 86 | Map credentials = null; 87 | if (cmdline.hasOption(CREDENTIALS_OPTION)) { 88 | credentials = Maps.newHashMap(); 89 | File cfile = new File(cmdline.getOptionValue(CREDENTIALS_OPTION)); 90 | if (!cfile.exists()) { 91 | System.err.println("Error: " + cfile + " does not exist!"); 92 | System.exit(-1); 93 | } 94 | for (String s : Files.readLines(cfile, Charsets.UTF_8)) { 95 | try { 96 | String[] arr = s.split(":"); 97 | credentials.put(arr[0], arr[1]); 98 | } catch (Exception e){ 99 | // Catch any exceptions from parsing file contain access tokens 100 | System.err.println("Error reading access tokens from " + cfile + "!"); 101 | System.exit(-1); 102 | } 103 | } 104 | } 105 | 106 | if (!index.exists()) { 107 | System.err.println("Error: " + index + " does not exist!"); 108 | System.exit(-1); 109 | } 110 | 111 | TServerSocket serverSocket = new TServerSocket(port); 112 | TrecSearch.Processor searchProcessor = 113 | new TrecSearch.Processor(new TrecSearchHandler(index, 
credentials)); 114 | 115 | TThreadPoolServer.Args serverArgs = new TThreadPoolServer.Args(serverSocket); 116 | serverArgs.maxWorkerThreads(maxThreads); 117 | TServer thriftServer = new TThreadPoolServer(serverArgs.processor(searchProcessor) 118 | .protocolFactory(new TBinaryProtocol.Factory())); 119 | 120 | thriftServer.serve(); 121 | } 122 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/local/RunQueries.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.search.local; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.Option; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.lucene.document.Document; 31 | import org.apache.lucene.index.DirectoryReader; 32 | import org.apache.lucene.index.IndexReader; 33 | import org.apache.lucene.queryparser.classic.QueryParser; 34 | import org.apache.lucene.search.Filter; 35 | import org.apache.lucene.search.IndexSearcher; 36 | import org.apache.lucene.search.NumericRangeFilter; 37 | import org.apache.lucene.search.Query; 38 | import org.apache.lucene.search.ScoreDoc; 39 | import org.apache.lucene.search.TopDocs; 40 | import org.apache.lucene.search.similarities.BM25Similarity; 41 | import org.apache.lucene.search.similarities.LMDirichletSimilarity; 42 | import org.apache.lucene.store.FSDirectory; 43 | import org.apache.lucene.util.Version; 44 | 45 | import cc.twittertools.index.IndexStatuses; 46 | import cc.twittertools.index.IndexStatuses.StatusField; 47 | import cc.twittertools.search.TrecTopic; 48 | import cc.twittertools.search.TrecTopicSet; 49 | 50 | public class RunQueries { 51 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 52 | 53 | private static final String INDEX_OPTION = "index"; 54 | private static final String QUERIES_OPTION = "queries"; 55 | private static final String NUM_RESULTS_OPTION = "num_results"; 56 | private static final String SIMILARITY_OPTION = "similarity"; 57 | private static final String RUNTAG_OPTION = "runtag"; 58 | private static final String VERBOSE_OPTION = "verbose"; 59 | 60 | private RunQueries() {} 61 | 62 | 
@SuppressWarnings("static-access") 63 | public static void main(String[] args) throws Exception { 64 | Options options = new Options(); 65 | 66 | options.addOption(OptionBuilder.withArgName("path").hasArg() 67 | .withDescription("index location").create(INDEX_OPTION)); 68 | options.addOption(OptionBuilder.withArgName("num").hasArg() 69 | .withDescription("number of results to return").create(NUM_RESULTS_OPTION)); 70 | options.addOption(OptionBuilder.withArgName("file").hasArg() 71 | .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); 72 | options.addOption(OptionBuilder.withArgName("similarity").hasArg() 73 | .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); 74 | options.addOption(OptionBuilder.withArgName("string").hasArg() 75 | .withDescription("runtag").create(RUNTAG_OPTION)); 76 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); 77 | 78 | CommandLine cmdline = null; 79 | CommandLineParser parser = new GnuParser(); 80 | try { 81 | cmdline = parser.parse(options, args); 82 | } catch (ParseException exp) { 83 | System.err.println("Error parsing command line: " + exp.getMessage()); 84 | System.exit(-1); 85 | } 86 | 87 | if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { 88 | HelpFormatter formatter = new HelpFormatter(); 89 | formatter.printHelp(RunQueries.class.getName(), options); 90 | System.exit(-1); 91 | } 92 | 93 | File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); 94 | if (!indexLocation.exists()) { 95 | System.err.println("Error: " + indexLocation + " does not exist!"); 96 | System.exit(-1); 97 | } 98 | 99 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 
100 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; 101 | 102 | String topicsFile = cmdline.getOptionValue(QUERIES_OPTION); 103 | 104 | int numResults = 1000; 105 | try { 106 | if (cmdline.hasOption(NUM_RESULTS_OPTION)) { 107 | numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); 108 | } 109 | } catch (NumberFormatException e) { 110 | System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); 111 | System.exit(-1); 112 | } 113 | 114 | String similarity = "LM"; 115 | if (cmdline.hasOption(SIMILARITY_OPTION)) { 116 | similarity = cmdline.getOptionValue(SIMILARITY_OPTION); 117 | } 118 | 119 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION); 120 | 121 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 122 | 123 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); 124 | IndexSearcher searcher = new IndexSearcher(reader); 125 | 126 | if (similarity.equalsIgnoreCase("BM25")) { 127 | searcher.setSimilarity(new BM25Similarity()); 128 | } else if (similarity.equalsIgnoreCase("LM")) { 129 | searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); 130 | } 131 | 132 | QueryParser p = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, 133 | IndexStatuses.ANALYZER); 134 | 135 | TrecTopicSet topics = TrecTopicSet.fromFile(new File(topicsFile)); 136 | for ( TrecTopic topic : topics ) { 137 | Query query = p.parse(topic.getQuery()); 138 | Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, 139 | topic.getQueryTweetTime(), true, true); 140 | 141 | TopDocs rs = searcher.search(query, filter, numResults); 142 | 143 | int i = 1; 144 | for (ScoreDoc scoreDoc : rs.scoreDocs) { 145 | Document hit = searcher.doc(scoreDoc.doc); 146 | out.println(String.format("%s Q0 %s %d %f %s", topic.getId(), 147 | hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag)); 148 | if ( verbose) { 149 | out.println("# " + 
hit.toString().replaceAll("[\\n\\r]+", " ")); 150 | } 151 | i++; 152 | } 153 | } 154 | reader.close(); 155 | out.close(); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/stream/GatherStatusStream.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.stream; 18 | 19 | import org.apache.log4j.ConsoleAppender; 20 | import org.apache.log4j.Level; 21 | import org.apache.log4j.Logger; 22 | import org.apache.log4j.PatternLayout; 23 | import org.apache.log4j.rolling.RollingFileAppender; 24 | import org.apache.log4j.rolling.TimeBasedRollingPolicy; 25 | import org.apache.log4j.varia.LevelRangeFilter; 26 | 27 | import twitter4j.RawStreamListener; 28 | import twitter4j.TwitterException; 29 | import twitter4j.TwitterStream; 30 | import twitter4j.TwitterStreamFactory; 31 | 32 | public final class GatherStatusStream { 33 | private static int cnt = 0; 34 | 35 | @SuppressWarnings("unused") 36 | private static final String MINUTE_ROLL = ".%d{yyyy-MM-dd-HH-mm}.gz"; 37 | private static final String HOUR_ROLL = ".%d{yyyy-MM-dd-HH}.gz"; 38 | 39 | public static void main(String[] args) throws TwitterException { 40 | PatternLayout layoutStandard = new PatternLayout(); 41 | layoutStandard.setConversionPattern("[%p] %d %c %M - %m%n"); 42 | 43 | PatternLayout layoutSimple = new PatternLayout(); 44 | layoutSimple.setConversionPattern("%m%n"); 45 | 46 | // Filter for the statuses: we only want INFO messages 47 | LevelRangeFilter filter = new LevelRangeFilter(); 48 | filter.setLevelMax(Level.INFO); 49 | filter.setLevelMin(Level.INFO); 50 | filter.setAcceptOnMatch(true); 51 | filter.activateOptions(); 52 | 53 | TimeBasedRollingPolicy statusesRollingPolicy = new TimeBasedRollingPolicy(); 54 | statusesRollingPolicy.setFileNamePattern("statuses.log" + HOUR_ROLL); 55 | statusesRollingPolicy.activateOptions(); 56 | 57 | RollingFileAppender statusesAppender = new RollingFileAppender(); 58 | statusesAppender.setRollingPolicy(statusesRollingPolicy); 59 | statusesAppender.addFilter(filter); 60 | statusesAppender.setLayout(layoutSimple); 61 | statusesAppender.activateOptions(); 62 | 63 | TimeBasedRollingPolicy warningsRollingPolicy = new TimeBasedRollingPolicy(); 64 | 
warningsRollingPolicy.setFileNamePattern("warnings.log" + HOUR_ROLL); 65 | warningsRollingPolicy.activateOptions(); 66 | 67 | RollingFileAppender warningsAppender = new RollingFileAppender(); 68 | warningsAppender.setRollingPolicy(statusesRollingPolicy); 69 | warningsAppender.setThreshold(Level.WARN); 70 | warningsAppender.setLayout(layoutStandard); 71 | warningsAppender.activateOptions(); 72 | 73 | ConsoleAppender consoleAppender = new ConsoleAppender(); 74 | consoleAppender.setThreshold(Level.WARN); 75 | consoleAppender.setLayout(layoutStandard); 76 | consoleAppender.activateOptions(); 77 | 78 | // configures the root logger 79 | Logger rootLogger = Logger.getRootLogger(); 80 | rootLogger.setLevel(Level.INFO); 81 | rootLogger.removeAllAppenders(); 82 | rootLogger.addAppender(consoleAppender); 83 | rootLogger.addAppender(statusesAppender); 84 | rootLogger.addAppender(warningsAppender); 85 | 86 | // creates a custom logger and log messages 87 | final Logger logger = Logger.getLogger(GatherStatusStream.class); 88 | 89 | TwitterStream twitterStream = new TwitterStreamFactory().getInstance(); 90 | RawStreamListener rawListener = new RawStreamListener() { 91 | 92 | @Override 93 | public void onMessage(String rawString) { 94 | cnt++; 95 | logger.info(rawString); 96 | if (cnt % 1000 == 0) { 97 | System.out.println(cnt + " messages received."); 98 | } 99 | } 100 | 101 | @Override 102 | public void onException(Exception ex) { 103 | logger.warn(ex); 104 | } 105 | 106 | }; 107 | 108 | twitterStream.addListener(rawListener); 109 | twitterStream.sample(); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/util/ExtractSubcollection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with 
the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.util; 18 | 19 | import it.unimi.dsi.fastutil.longs.LongIterator; 20 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.BufferedWriter; 24 | import java.io.File; 25 | import java.io.FileInputStream; 26 | import java.io.FileOutputStream; 27 | import java.io.InputStreamReader; 28 | import java.io.OutputStreamWriter; 29 | import java.io.Writer; 30 | 31 | import org.apache.commons.cli.CommandLine; 32 | import org.apache.commons.cli.CommandLineParser; 33 | import org.apache.commons.cli.GnuParser; 34 | import org.apache.commons.cli.HelpFormatter; 35 | import org.apache.commons.cli.OptionBuilder; 36 | import org.apache.commons.cli.Options; 37 | import org.apache.commons.cli.ParseException; 38 | import org.apache.log4j.Logger; 39 | 40 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 41 | import cc.twittertools.corpus.data.Status; 42 | import cc.twittertools.corpus.data.StatusStream; 43 | 44 | public class ExtractSubcollection { 45 | private static final Logger LOG = Logger.getLogger(ExtractSubcollection.class); 46 | 47 | private static final String COLLECTION_OPTION = "collection"; 48 | private static final String ID_OPTION = "tweetids"; 49 | private static final String OUTPUT_OPTION = "output"; 50 | private static final String MISSING_OPTION = "missing"; 51 | 52 | @SuppressWarnings("static-access") 53 | public static void main(String[] args) throws Exception { 54 | Options options = new Options(); 55 | 56 
| options.addOption(OptionBuilder.withArgName("dir").hasArg() 57 | .withDescription("source collection directory").create(COLLECTION_OPTION)); 58 | options.addOption(OptionBuilder.withArgName("file").hasArg() 59 | .withDescription("list of tweetids").create(ID_OPTION)); 60 | options.addOption(OptionBuilder.withArgName("file").hasArg() 61 | .withDescription("output JSON").create(OUTPUT_OPTION)); 62 | options.addOption(OptionBuilder.withArgName("file").hasArg() 63 | .withDescription("file to store missing tweeids").create(MISSING_OPTION)); 64 | 65 | CommandLine cmdline = null; 66 | CommandLineParser parser = new GnuParser(); 67 | try { 68 | cmdline = parser.parse(options, args); 69 | } catch (ParseException exp) { 70 | System.err.println("Error parsing command line: " + exp.getMessage()); 71 | System.exit(-1); 72 | } 73 | 74 | if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION) || 75 | !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MISSING_OPTION)) { 76 | HelpFormatter formatter = new HelpFormatter(); 77 | formatter.printHelp(ExtractSubcollection.class.getName(), options); 78 | System.exit(-1); 79 | } 80 | 81 | String outputFile = cmdline.getOptionValue(OUTPUT_OPTION); 82 | String missingFile = cmdline.getOptionValue(MISSING_OPTION); 83 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); 84 | 85 | LongOpenHashSet tweetids = new LongOpenHashSet(); 86 | File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION)); 87 | if (!tweetidsFile.exists()) { 88 | System.err.println("Error: " + tweetidsFile + " does not exist!"); 89 | System.exit(-1); 90 | } 91 | LOG.info("Reading tweetids from " + tweetidsFile); 92 | 93 | FileInputStream fin = new FileInputStream(tweetidsFile); 94 | BufferedReader br = new BufferedReader(new InputStreamReader(fin)); 95 | 96 | String s; 97 | while ((s = br.readLine()) != null) { 98 | tweetids.add(Long.parseLong(s)); 99 | } 100 | br.close(); 101 | fin.close(); 102 | LOG.info("Read " + 
tweetids.size() + " tweetids."); 103 | 104 | File file = new File(collectionPath); 105 | if (!file.exists()) { 106 | System.err.println("Error: " + file + " does not exist!"); 107 | System.exit(-1); 108 | } 109 | 110 | // Store tweet ids we've already seen to dedup. 111 | LongOpenHashSet seen = new LongOpenHashSet(); 112 | 113 | Writer out = new BufferedWriter(new OutputStreamWriter( 114 | new FileOutputStream(outputFile), "UTF-8")); 115 | 116 | StatusStream stream = new JsonStatusCorpusReader(file); 117 | Status status; 118 | while ((status = stream.next()) != null) { 119 | if (tweetids.contains(status.getId()) && !seen.contains(status.getId())) { 120 | out.write(status.getJsonObject().toString() + "\n"); 121 | seen.add(status.getId()); 122 | } 123 | } 124 | stream.close(); 125 | out.close(); 126 | 127 | LOG.info("Extracted " + seen.size() + " tweetids."); 128 | LOG.info("Storing missing tweetids..."); 129 | 130 | out = new BufferedWriter(new OutputStreamWriter( 131 | new FileOutputStream(missingFile), "UTF-8")); 132 | LongIterator iter = tweetids.iterator(); 133 | while (iter.hasNext()) { 134 | long t = iter.nextLong(); 135 | if (!seen.contains(t)) { 136 | out.write(t + "\n"); 137 | } 138 | } 139 | out.close(); 140 | 141 | LOG.info("Done!"); 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/util/VerifySubcollection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.util; 18 | 19 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet; 20 | 21 | import java.io.BufferedReader; 22 | import java.io.File; 23 | import java.io.FileInputStream; 24 | import java.io.InputStreamReader; 25 | import java.io.PrintStream; 26 | import java.util.Map; 27 | import java.util.TreeMap; 28 | 29 | import org.apache.commons.cli.CommandLine; 30 | import org.apache.commons.cli.CommandLineParser; 31 | import org.apache.commons.cli.GnuParser; 32 | import org.apache.commons.cli.HelpFormatter; 33 | import org.apache.commons.cli.OptionBuilder; 34 | import org.apache.commons.cli.Options; 35 | import org.apache.commons.cli.ParseException; 36 | import org.apache.log4j.Logger; 37 | 38 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 39 | import cc.twittertools.corpus.data.Status; 40 | import cc.twittertools.corpus.data.StatusStream; 41 | 42 | import com.google.common.collect.Maps; 43 | 44 | public class VerifySubcollection { 45 | private static final Logger LOG = Logger.getLogger(VerifySubcollection.class); 46 | 47 | private static final String COLLECTION_OPTION = "collection"; 48 | private static final String ID_OPTION = "tweetids"; 49 | 50 | @SuppressWarnings("static-access") 51 | public static void main(String[] args) throws Exception { 52 | Options options = new Options(); 53 | 54 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 55 | .withDescription("source collection directory").create(COLLECTION_OPTION)); 56 | 
options.addOption(OptionBuilder.withArgName("file").hasArg() 57 | .withDescription("list of tweetids").create(ID_OPTION)); 58 | 59 | CommandLine cmdline = null; 60 | CommandLineParser parser = new GnuParser(); 61 | try { 62 | cmdline = parser.parse(options, args); 63 | } catch (ParseException exp) { 64 | System.err.println("Error parsing command line: " + exp.getMessage()); 65 | System.exit(-1); 66 | } 67 | 68 | if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION)) { 69 | HelpFormatter formatter = new HelpFormatter(); 70 | formatter.printHelp(ExtractSubcollection.class.getName(), options); 71 | System.exit(-1); 72 | } 73 | 74 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); 75 | 76 | LongOpenHashSet tweetids = new LongOpenHashSet(); 77 | File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION)); 78 | if (!tweetidsFile.exists()) { 79 | System.err.println("Error: " + tweetidsFile + " does not exist!"); 80 | System.exit(-1); 81 | } 82 | LOG.info("Reading tweetids from " + tweetidsFile); 83 | 84 | FileInputStream fin = new FileInputStream(tweetidsFile); 85 | BufferedReader br = new BufferedReader(new InputStreamReader(fin)); 86 | 87 | String s; 88 | while ((s = br.readLine()) != null) { 89 | tweetids.add(Long.parseLong(s)); 90 | } 91 | br.close(); 92 | fin.close(); 93 | LOG.info("Read " + tweetids.size() + " tweetids."); 94 | 95 | File file = new File(collectionPath); 96 | if (!file.exists()) { 97 | System.err.println("Error: " + file + " does not exist!"); 98 | System.exit(-1); 99 | } 100 | 101 | LongOpenHashSet seen = new LongOpenHashSet(); 102 | TreeMap tweets = Maps.newTreeMap(); 103 | 104 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 105 | StatusStream stream = new JsonStatusCorpusReader(file); 106 | Status status; 107 | int cnt = 0; 108 | while ((status = stream.next()) != null) { 109 | if (!tweetids.contains(status.getId())) { 110 | LOG.error("tweetid " + status.getId() + " doesn't belong in 
collection"); 111 | continue; 112 | } 113 | if (seen.contains(status.getId())) { 114 | LOG.error("tweetid " + status.getId() + " already seen!"); 115 | continue; 116 | } 117 | 118 | tweets.put(status.getId(), status.getJsonObject().toString()); 119 | seen.add(status.getId()); 120 | cnt++; 121 | } 122 | LOG.info("total of " + cnt + " tweets in subcollection."); 123 | 124 | for ( Map.Entry entry : tweets.entrySet()){ 125 | out.println(entry.getValue()); 126 | } 127 | 128 | stream.close(); 129 | out.close(); 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/perl/extract_deletes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Scans a directory containing the output of the stream crawler and 4 | # extracts the deletes 5 | 6 | $directory = shift or die "$0 [directory]"; 7 | 8 | for $f ( `ls $directory` ) { 9 | chomp($f); 10 | my $path = "$directory/$f"; 11 | 12 | open(DATA, "gunzip -c $path | grep '{\"delete\"' | "); 13 | while ( my $line = ) { 14 | if ( $line =~ m/{"delete":{"status":{"id":(\d+),/ ) { 15 | print "$1\n"; 16 | } 17 | } 18 | close(DATA); 19 | } 20 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/perl/join_deletes_with_collection.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 
3 | # Joins together deletes and collection tweetids to identify the 4 | # deleted statuses. 5 | 6 | $USAGE = "$0 [deletes (bz2)] [collection (bz2)]"; 7 | 8 | $deletes = shift or die $USAGE; 9 | $collection = shift or die $USAGE; 10 | 11 | open(DATA, "bzcat $deletes | "); 12 | while ( my $line = ) { 13 | chomp($line); 14 | $H{$line} = 1; 15 | } 16 | close(DATA); 17 | 18 | open(DATA, "bzcat $collection | "); 19 | while ( my $line = ) { 20 | if ($line =~ /^(\d+)/ ) { 21 | print $line if exists($H{$1}); 22 | } 23 | } 24 | close(DATA); 25 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/Search/TrecSearch-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift Compiler (0.8.0) 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | # options string: py 8 | # 9 | 10 | import sys 11 | import pprint 12 | from urlparse import urlparse 13 | from thrift.transport import TTransport 14 | from thrift.transport import TSocket 15 | from thrift.transport import THttpClient 16 | from thrift.protocol import TBinaryProtocol 17 | 18 | import TrecSearch 19 | from ttypes import * 20 | 21 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 22 | print '' 23 | print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 24 | print '' 25 | print 'Functions:' 26 | print ' search(TQuery query)' 27 | print '' 28 | sys.exit(0) 29 | 30 | pp = pprint.PrettyPrinter(indent = 2) 31 | host = 'localhost' 32 | port = 9090 33 | uri = '' 34 | framed = False 35 | http = False 36 | argi = 1 37 | 38 | if sys.argv[argi] == '-h': 39 | parts = sys.argv[argi+1].split(':') 40 | host = parts[0] 41 | if len(parts) > 1: 42 | port = int(parts[1]) 43 | argi += 2 44 | 45 | if sys.argv[argi] == '-u': 46 | url = urlparse(sys.argv[argi+1]) 47 | parts = url[1].split(':') 48 | host = parts[0] 
49 | if len(parts) > 1: 50 | port = int(parts[1]) 51 | else: 52 | port = 80 53 | uri = url[2] 54 | if url[4]: 55 | uri += '?%s' % url[4] 56 | http = True 57 | argi += 2 58 | 59 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 60 | framed = True 61 | argi += 1 62 | 63 | cmd = sys.argv[argi] 64 | args = sys.argv[argi+1:] 65 | 66 | if http: 67 | transport = THttpClient.THttpClient(host, port, uri) 68 | else: 69 | socket = TSocket.TSocket(host, port) 70 | if framed: 71 | transport = TTransport.TFramedTransport(socket) 72 | else: 73 | transport = TTransport.TBufferedTransport(socket) 74 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 75 | client = TrecSearch.Client(protocol) 76 | transport.open() 77 | 78 | if cmd == 'search': 79 | if len(args) != 1: 80 | print 'search requires 1 args' 81 | sys.exit(1) 82 | pp.pprint(client.search(eval(args[0]),)) 83 | 84 | else: 85 | print 'Unrecognized method %s' % cmd 86 | sys.exit(1) 87 | 88 | transport.close() 89 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/Search/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'TrecSearch'] 2 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/Search/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.8.0) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/TrecSearchThriftClientCli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | """ 4 
| A demonstration of how to use the python thrift bindings to retrieve tweets from the TREC 2013 API. 5 | 6 | This script requires the python-thrift package, which can installed using 'pip install thrift'. 7 | 8 | To execute this script: 9 | python TrecSearchThriftClientCli.py -host='host' -port=port -group='team_name' -token='access_token' -qid='MB01' -q='BBC World Service staff cuts' -runtag='lucene4lm' -max_id=34952194402811905 10 | 11 | """ 12 | 13 | from Search import TrecSearch, ttypes 14 | 15 | from thrift import Thrift 16 | from thrift.transport import TSocket 17 | from thrift.transport import TTransport 18 | from thrift.protocol import TBinaryProtocol 19 | 20 | import argparse 21 | 22 | try: 23 | # Command line arguments 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument('-host', dest="host", help='server to connect to', required=True) 26 | parser.add_argument('-port',type=int, dest="port", help='port to use', required=True) 27 | parser.add_argument('-group', dest="group", help='group id', required=True) 28 | parser.add_argument('-token', dest="token", help='access token', required=True) 29 | parser.add_argument('-qid', dest="qid", help='query id', required=False, default='MB01') 30 | parser.add_argument('-q', dest="query", help='query text', required=False, default='BBC World Service staff cuts') 31 | parser.add_argument('-runtag', dest="run_tag", help='runtag', required=False, default='lucene4lm') 32 | parser.add_argument('-max_id', dest="max_id", help='maxid', required=False, default=34952194402811905) 33 | parser.add_argument('-num_results', dest="num_results", help='number of results', required=False, default=10) 34 | args = parser.parse_args() 35 | 36 | # Init thrift connection and protocol handlers 37 | transport = TSocket.TSocket(args.host, args.port) 38 | transport = TTransport.TBufferedTransport(transport) 39 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 40 | client = TrecSearch.Client(protocol) 41 | 42 | # Open the 
connection to the server 43 | transport.open() 44 | 45 | # Create a new query 46 | q = ttypes.TQuery() 47 | q.group = args.group 48 | q.token = args.token 49 | q.text = args.query 50 | q.max_id = long(args.max_id) 51 | q.num_results = int(args.num_results) 52 | 53 | # Performs the actual search 54 | results = client.search(q) 55 | 56 | for i, result in enumerate(results, 1): 57 | # TREC_eval formatted line 58 | print "%s Q0 %d %d %f %s" % (args.qid, result.id, i, result.rsv, args.run_tag) 59 | 60 | # Close connection 61 | transport.close() 62 | 63 | except Thrift.TException, tx: 64 | print 'Thrift TException: %s' % (tx.message) 65 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/twittertools/stream/gather_status_stream.py: -------------------------------------------------------------------------------- 1 | # Twitter Tools 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from tweepy.streaming import StreamListener 17 | from tweepy import OAuthHandler 18 | from tweepy import Stream 19 | import logging 20 | import logging.handlers 21 | 22 | 23 | consumer_key="" 24 | consumer_secret="" 25 | 26 | access_token="" 27 | access_token_secret="" 28 | 29 | class TweetListener(StreamListener): 30 | 31 | def __init__(self,api=None): 32 | super(TweetListener,self).__init__(api) 33 | self.logger = logging.getLogger('tweetlogger') 34 | 35 | 36 | statusHandler = logging.handlers.TimedRotatingFileHandler('status.log',when='H',encoding='bz2',utc=True) 37 | statusHandler.setLevel(logging.INFO) 38 | self.logger.addHandler(statusHandler) 39 | 40 | 41 | warningHandler = logging.handlers.TimedRotatingFileHandler('warning.log',when='H',encoding='bz2',utc=True) 42 | warningHandler.setLevel(logging.WARN) 43 | self.logger.addHandler(warningHandler) 44 | logging.captureWarnings(True); 45 | 46 | consoleHandler = logging.StreamHandler() 47 | consoleHandler.setLevel(logging.WARN) 48 | self.logger.addHandler(consoleHandler) 49 | 50 | 51 | self.logger.setLevel(logging.INFO) 52 | self.count = 0 53 | 54 | def on_data(self,data): 55 | self.count+=1 56 | self.logger.info(data) 57 | if self.count % 1000 == 0: 58 | print "%d statuses processed" % self.count 59 | return True 60 | 61 | def on_error(self,exception): 62 | self.logger.warn(str(exception)) 63 | 64 | if __name__ == '__main__': 65 | listener = TweetListener() 66 | auth = OAuthHandler(consumer_key,consumer_secret) 67 | auth.set_access_token(access_token,access_token_secret) 68 | 69 | stream = Stream(auth,listener) 70 | while True: 71 | try: 72 | stream.sample() 73 | except Exception as ex: 74 | print str(ex) 75 | pass 76 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | 
log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/twitter-tools-core/src/main/thrift/gen-py/__init__.py -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/twittertools/TrecSearch-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift Compiler (0.8.0) 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | # options string: py 8 | # 9 | 10 | import sys 11 | import pprint 12 | from urlparse import urlparse 13 | from thrift.transport import TTransport 14 | from thrift.transport import TSocket 15 | from thrift.transport import THttpClient 16 | from thrift.protocol import TBinaryProtocol 17 | 18 | import TrecSearch 19 | from ttypes import * 20 | 21 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 22 | print '' 23 | print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 24 | print '' 25 | print 'Functions:' 26 | print ' search(TQuery query)' 27 | print '' 28 | sys.exit(0) 29 | 30 | pp = pprint.PrettyPrinter(indent = 2) 31 | host = 'localhost' 32 | port = 9090 33 | uri = '' 34 | framed = False 35 | http = False 36 | argi = 1 37 | 38 | if sys.argv[argi] == '-h': 39 | parts = sys.argv[argi+1].split(':') 40 | host = parts[0] 41 | if len(parts) > 1: 42 | port = int(parts[1]) 43 | argi += 2 44 | 45 | if 
sys.argv[argi] == '-u': 46 | url = urlparse(sys.argv[argi+1]) 47 | parts = url[1].split(':') 48 | host = parts[0] 49 | if len(parts) > 1: 50 | port = int(parts[1]) 51 | else: 52 | port = 80 53 | uri = url[2] 54 | if url[4]: 55 | uri += '?%s' % url[4] 56 | http = True 57 | argi += 2 58 | 59 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 60 | framed = True 61 | argi += 1 62 | 63 | cmd = sys.argv[argi] 64 | args = sys.argv[argi+1:] 65 | 66 | if http: 67 | transport = THttpClient.THttpClient(host, port, uri) 68 | else: 69 | socket = TSocket.TSocket(host, port) 70 | if framed: 71 | transport = TTransport.TFramedTransport(socket) 72 | else: 73 | transport = TTransport.TBufferedTransport(socket) 74 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 75 | client = TrecSearch.Client(protocol) 76 | transport.open() 77 | 78 | if cmd == 'search': 79 | if len(args) != 1: 80 | print 'search requires 1 args' 81 | sys.exit(1) 82 | pp.pprint(client.search(eval(args[0]),)) 83 | 84 | else: 85 | print 'Unrecognized method %s' % cmd 86 | sys.exit(1) 87 | 88 | transport.close() 89 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/twittertools/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'TrecSearch'] 2 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/twittertools/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.8.0) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- 
/twitter-tools-core/src/main/thrift/twittertools.thrift: -------------------------------------------------------------------------------- 1 | namespace java cc.twittertools.thrift.gen 2 | 3 | struct TResult { 4 | 1: i64 id, 5 | 2: double rsv, 6 | 3: string screen_name, 7 | 4: i64 epoch, 8 | 5: string text, 9 | 6: i32 followers_count, 10 | 7: i32 statuses_count, 11 | 8: string lang, 12 | 9: i64 in_reply_to_status_id, 13 | 10: i64 in_reply_to_user_id, 14 | 11: i64 retweeted_status_id, 15 | 12: i64 retweeted_user_id, 16 | 13: i32 retweeted_count 17 | } 18 | 19 | struct TQuery { 20 | 1: string group, 21 | 2: string token, 22 | 3: string text, 23 | 4: i64 max_id, 24 | 5: i32 num_results 25 | } 26 | 27 | exception TrecSearchException { 28 | 1: string message 29 | } 30 | 31 | service TrecSearch { 32 | list search(1: TQuery query) 33 | throws (1: TrecSearchException error) 34 | } 35 | -------------------------------------------------------------------------------- /twitter-tools-core/src/test/java/cc/twittertools/download/FetchStatusTest.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.download; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.util.concurrent.Future; 7 | 8 | import junit.framework.JUnit4TestAdapter; 9 | 10 | import org.apache.commons.lang.StringEscapeUtils; 11 | import org.junit.Test; 12 | 13 | import cc.twittertools.corpus.data.Status; 14 | 15 | import com.google.gson.JsonObject; 16 | import com.google.gson.JsonParser; 17 | import com.ning.http.client.AsyncHttpClient; 18 | import com.ning.http.client.Response; 19 | 20 | public class FetchStatusTest { 21 | private static final JsonParser JSON_PARSER = new JsonParser(); 22 | 23 | @Test 24 | public void basicHTML() throws Exception { 25 | String url = AsyncEmbeddedJsonStatusBlockCrawler.getUrl(1121915133L, "jkrums"); 26 | AsyncHttpClient asyncHttpClient = new 
AsyncHttpClient(); 27 | AsyncHttpClient.BoundRequestBuilder request = asyncHttpClient.prepareGet(url); 28 | Future f = request.execute(); 29 | Response response = f.get(); 30 | 31 | // Make sure status is OK. 32 | String html = response.getResponseBody("UTF-8"); 33 | assertTrue(html != null); 34 | } 35 | 36 | // The fetcher is broken, so disabling test. 37 | //@Test 38 | public void basicFamous() throws Exception { 39 | String url = AsyncEmbeddedJsonStatusBlockCrawler.getUrl(1121915133L, "jkrums"); 40 | AsyncHttpClient asyncHttpClient = new AsyncHttpClient(); 41 | AsyncHttpClient.BoundRequestBuilder request = asyncHttpClient.prepareGet(url); 42 | Future f = request.execute(); 43 | Response response = f.get(); 44 | 45 | // Make sure status is OK. 46 | assertEquals(200, response.getStatusCode()); 47 | String html = response.getResponseBody("UTF-8"); 48 | 49 | int jsonStart = html.indexOf(AsyncEmbeddedJsonStatusBlockCrawler.JSON_START); 50 | int jsonEnd = html.indexOf(AsyncEmbeddedJsonStatusBlockCrawler.JSON_END, 51 | jsonStart + AsyncEmbeddedJsonStatusBlockCrawler.JSON_START.length()); 52 | 53 | String json = html.substring(jsonStart + AsyncEmbeddedJsonStatusBlockCrawler.JSON_START.length(), jsonEnd); 54 | json = StringEscapeUtils.unescapeHtml(json); 55 | JsonObject page = (JsonObject) JSON_PARSER.parse(json); 56 | JsonObject statusJson = page.getAsJsonObject("embedData").getAsJsonObject("status"); 57 | 58 | Status status = Status.fromJson(statusJson.toString()); 59 | assertEquals(1121915133L, status.getId()); 60 | assertEquals("jkrums", status.getScreenname()); 61 | assertEquals("http://twitpic.com/135xa - There's a plane in the Hudson. I'm on the ferry going to pick up the people. 
Crazy.", status.getText()); 62 | 63 | asyncHttpClient.close(); 64 | } 65 | 66 | public static junit.framework.Test suite() { 67 | return new JUnit4TestAdapter(FetchStatusTest.class); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /twitter-tools-core/src/test/java/cc/twittertools/index/TokenizationTest.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.index; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.io.IOException; 6 | import java.io.StringReader; 7 | import java.util.List; 8 | 9 | import junit.framework.JUnit4TestAdapter; 10 | 11 | import org.apache.lucene.analysis.Analyzer; 12 | import org.apache.lucene.analysis.TokenStream; 13 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 14 | import org.apache.lucene.util.Version; 15 | import org.junit.Test; 16 | 17 | import cc.twittertools.index.TweetAnalyzer; 18 | 19 | import com.google.common.collect.Lists; 20 | 21 | public class TokenizationTest { 22 | 23 | Object[][] examples = new Object[][] { 24 | {"AT&T getting secret immunity from wiretapping laws for government surveillance http://vrge.co/ZP3Fx5", 25 | new String[] {"att", "get", "secret", "immun", "from", "wiretap", "law", "for", "govern", "surveil", "http://vrge.co/ZP3Fx5"}}, 26 | 27 | {"want to see the @verge aston martin GT4 racer tear up long beach? http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219 …", 28 | new String[] {"want", "to", "see", "the", "@verge", "aston", "martin", "gt4", "racer", "tear", "up", "long", "beach", "http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219"}}, 29 | 30 | {"Incredibly good news! 
#Drupal users rally http://bit.ly/Z8ZoFe to ensure blind accessibility contributor gets to @DrupalCon #Opensource", 31 | new String[] {"incred", "good", "new", "#drupal", "user", "ralli", "http://bit.ly/Z8ZoFe", "to", "ensur", "blind", "access", "contributor", "get", "to", "@drupalcon", "#opensource"}}, 32 | 33 | {"We're entering the quiet hours at #amznhack. #Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 34 | new String[] {"were", "enter", "the", "quiet", "hour", "at", "#amznhack", "#rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz"}}, 35 | 36 | {"The 2013 Social Event Detection Task (SED) at #mediaeval2013, http://bit.ly/16nITsf supported by @linkedtv @project_mmixer @socialsensor_ip", 37 | new String[] {"the", "2013", "social", "event", "detect", "task", "sed", "at", "#mediaeval2013", "http://bit.ly/16nITsf", "support", "by", "@linkedtv", "@project_mmixer", "@socialsensor_ip"}}, 38 | 39 | {"U.S.A. U.K. U.K USA UK #US #UK #U.S.A #U.K ...A.B.C...D..E..F..A.LONG WORD", 40 | new String[] {"usa", "uk", "uk", "usa", "uk", "#us", "#uk", "#u", "sa", "#u", "k", "abc", "d", "e", "f", "a", "long", "word"}}, 41 | 42 | {"this is @a_valid_mention and this_is_multiple_words", 43 | new String[] {"thi", "is", "@a_valid_mention", "and", "thi", "is", "multipl", "word"}}, 44 | 45 | {"PLEASE BE LOWER CASE WHEN YOU COME OUT THE OTHER SIDE - ALSO A @VALID_VALID-INVALID", 46 | new String[] {"pleas", "be", "lower", "case", "when", "you", "come", "out", "the", "other", "side", "also", "a", "@valid_valid", "invalid"}}, 47 | 48 | // Note: the at sign is not the normal (at) sign and the crazy hashtag is not the normal # 49 | {"@reply @with #crazy ~#at", 50 | new String[] {"@reply", "@with", "#crazy", "#at"}}, 51 | 52 | {":@valid testing(valid)#hashtags. 
RT:@meniton (the last @mention is #valid and so is this:@valid), however this is@invalid", 53 | new String[] {"@valid", "test", "valid", "#hashtags", "rt", "@meniton", "the", "last", "@mention", "is", "#valid", "and", "so", "is", "thi", "@valid", "howev", "thi", "is", "invalid"}}, 54 | 55 | {"this][is[lots[(of)words+with-lots=of-strange!characters?$in-fact=it&has&Every&Single:one;ofin_here_B&N_test_test?test\\test^testing`testing{testing}testing…testing¬testing·testing what?", 56 | new String[] {"thi", "is", "lot", "of", "word", "with", "lot", "of", "strang", "charact", "in", "fact", "it", "ha", "everi", "singl", "on", "of", "them", "in", "here", "bn", "test", "test", "test", "test", "test", "test", "test", "test", "test", "test", "test", "what"}}, 57 | }; 58 | 59 | @Test 60 | public void basic() throws Exception { 61 | Analyzer analyzer = new TweetAnalyzer(Version.LUCENE_43); 62 | 63 | for (int i = 0; i < examples.length; i++) { 64 | verify((String[]) examples[i][1], parseKeywords(analyzer, (String) examples[i][0])); 65 | } 66 | } 67 | 68 | public void verify(String[] truth, List tokens) { 69 | assertEquals(truth.length, tokens.size()); 70 | for ( int i=0; i parseKeywords(Analyzer analyzer, String keywords) throws IOException { 76 | List list = Lists.newArrayList(); 77 | 78 | TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords)); 79 | CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class); 80 | tokenStream.reset(); 81 | while (tokenStream.incrementToken()) { 82 | if (cattr.toString().length() == 0) { 83 | continue; 84 | } 85 | list.add(cattr.toString()); 86 | } 87 | tokenStream.end(); 88 | tokenStream.close(); 89 | 90 | return list; 91 | } 92 | 93 | public static junit.framework.Test suite() { 94 | return new JUnit4TestAdapter(TokenizationTest.class); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- 
/twitter-tools-core/src/test/java/cc/twittertools/search/TrecTopicSetTest.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.search; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.io.File; 7 | import java.util.List; 8 | 9 | import junit.framework.JUnit4TestAdapter; 10 | 11 | import org.junit.Test; 12 | 13 | import com.google.common.collect.Lists; 14 | 15 | public class TrecTopicSetTest { 16 | 17 | @Test 18 | public void topics2011() throws Exception { 19 | File f = new File("../data/topics.microblog2011.txt"); 20 | assertTrue(f.exists()); 21 | 22 | TrecTopicSet topics = TrecTopicSet.fromFile(f); 23 | List t = Lists.newArrayList(topics.iterator()); 24 | 25 | assertEquals(50, t.size()); 26 | assertEquals("MB01", t.get(0).getId()); 27 | assertEquals("MB50", t.get(t.size()-1).getId()); 28 | } 29 | 30 | @Test 31 | public void topics2012() throws Exception { 32 | File f = new File("../data/topics.microblog2012.txt"); 33 | assertTrue(f.exists()); 34 | 35 | TrecTopicSet topics = TrecTopicSet.fromFile(f); 36 | List t = Lists.newArrayList(topics.iterator()); 37 | 38 | assertEquals(60, t.size()); 39 | assertEquals("MB51", t.get(0).getId()); 40 | assertEquals("MB110", t.get(t.size()-1).getId()); 41 | } 42 | 43 | public static junit.framework.Test suite() { 44 | return new JUnit4TestAdapter(TrecTopicSetTest.class); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_twitter-tools 3 | formatter_settings_version=12 4 | org.eclipse.jdt.ui.exception.name=e 5 | org.eclipse.jdt.ui.gettersetter.use.is=true 6 | org.eclipse.jdt.ui.keywordthis=false 7 | org.eclipse.jdt.ui.overrideannotation=true 8 | 
-------------------------------------------------------------------------------- /twitter-tools-hadoop/README.md: -------------------------------------------------------------------------------- 1 | # Analyzing Tweets with Pig: Getting Started 2 | 3 | Since tweets are encoded in JSON, and Pig offers poor native JSON support, it's more convenient to use JSON loaders in Twitter's [Elephant Bird](https://github.com/kevinweil/elephant-bird/) library. Easiest just to fetch the relevant jars directly: 4 | 5 | ``` 6 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-core/4.5/elephant-bird-core-4.5.jar 7 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-pig/4.5/elephant-bird-pig-4.5.jar 8 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-hadoop-compat/4.5/elephant-bird-hadoop-compat-4.5.jar 9 | wget http://repo1.maven.org/maven2/com/googlecode/json-simple/json-simple/1.1.1/json-simple-1.1.1.jar 10 | ``` 11 | 12 | You're ready to start analyzing tweets with Pig! 
Here's the obligatory word count example in Pig: 13 | 14 | ``` 15 | register 'elephant-bird-core-4.5.jar'; 16 | register 'elephant-bird-pig-4.5.jar'; 17 | register 'elephant-bird-hadoop-compat-4.5.jar'; 18 | register 'json-simple-1.1.1.jar'; 19 | 20 | raw = load '/path/to/tweets' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); 21 | 22 | a = foreach raw generate (chararray) $0#'text' as text; 23 | b = foreach a generate flatten(TOKENIZE(text)) as word; 24 | c = group b by word; 25 | d = foreach c generate COUNT(b), group; 26 | 27 | store d into 'wordcount'; 28 | ``` 29 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | cc.twittertools.hadoop 6 | twitter-tools-hadoop 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | twitter-tools-hadoop 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | The Apache Software License, Version 2.0 20 | http://www.apache.org/licenses/LICENSE-2.0.txt 21 | repo 22 | 23 | 24 | 25 | 26 | 27 | JeffyRao 28 | Jinfeng Rao 29 | jinfeng@cs.umd.edu 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.codehaus.mojo 37 | appassembler-maven-plugin 38 | 1.3.1 39 | 40 | 41 | 42 | cc.twittertools.hbase.LoadWordCount 43 | LoadWordCount 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | junit 53 | junit 54 | 3.8.1 55 | test 56 | 57 | 58 | cc.twittertools 59 | twitter-tools-core 60 | 1.4.2 61 | 62 | 63 | org.apache.pig 64 | pig 65 | 0.12.1 66 | 67 | 68 | org.apache.hadoop 69 | hadoop-core 70 | 1.2.1 71 | 72 | 73 | org.apache.hbase 74 | hbase 75 | 0.92.1 76 | 77 | 78 | maven-release-plugin 79 | org.apache.maven.plugins 80 | 81 | 82 | 83 | 85 | 86 | commons-io 87 | commons-io 88 | 2.1 89 | 90 | 91 | org.apache.lucene 92 | lucene-core 93 | 4.8.0 94 | 95 | 96 | com.google.guava 97 | guava 98 | 17.0 99 | 100 | 101 | 102 | 
-------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/hadoop/Example.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.hadoop; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.Tokenizer; 8 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 10 | import org.apache.lucene.util.Version; 11 | 12 | import cc.twittertools.index.LowerCaseEntityPreservingFilter; 13 | 14 | public class Example { 15 | 16 | public static void main(String[] args) throws IOException{ 17 | //Test GetInterval Correctness 18 | try{ 19 | String str = "Tue Oct 01 00:07:43 +0000 2011"; 20 | String[] groups = str.split("\\s+"); 21 | String time = groups[3]; 22 | String[] timeGroups= time.split(":"); 23 | int interval = (Integer.valueOf(timeGroups[0]))*12 + (Integer.valueOf(timeGroups[1])/5) + 1; 24 | System.out.println(interval); 25 | }catch(Exception e){ 26 | throw new IOException("caught exception",e); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/hbase/LoadWordCount.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.hbase; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.Iterator; 11 | import java.util.Map; 12 | import java.util.Set; 13 | 14 | import org.apache.hadoop.hbase.client.HTablePool; 15 | import org.apache.hadoop.hbase.util.Bytes; 16 | 17 | import 
com.google.common.collect.HashBasedTable; 18 | import com.google.common.collect.Table; 19 | 20 | public class LoadWordCount { 21 | 22 | public static void main(String[] args) throws IOException { 23 | // TODO Auto-generated method stub 24 | if(args.length!=1){ 25 | System.out.println("invalid argument"); 26 | } 27 | Table wordCountMap = HashBasedTable.create(); 28 | File folder = new File(args[0]); 29 | if(folder.isDirectory()){ 30 | for (File file : folder.listFiles()) { 31 | if(!file.getName().startsWith("part")) 32 | continue; 33 | System.out.println("Processing "+args[0]+file.getName()); 34 | BufferedReader bf = new BufferedReader(new FileReader(args[0]+file.getName())); 35 | // each line in wordcount file is like : 1 twitter 100 36 | String line; 37 | while((line=bf.readLine())!=null){ 38 | String[] groups = line.split("\\t"); 39 | if(groups.length != 4) 40 | continue; 41 | String day = groups[0]; // each day is viewed as a column in underlying HBase 42 | String interval = groups[1]; 43 | String word = groups[2]; 44 | String count = groups[3]; 45 | if(!wordCountMap.contains(word, day)){ 46 | WordCountDAO.WordCount w = new WordCountDAO.WordCount(word, day); 47 | wordCountMap.put(word, day, w); 48 | } 49 | WordCountDAO.WordCount w = wordCountMap.get(word, day); 50 | w.setCount(Integer.valueOf(interval), Integer.valueOf(count)); 51 | wordCountMap.put(word, day, w); 52 | 53 | } 54 | } 55 | } 56 | 57 | System.out.println("Total "+wordCountMap.size()+" words"); 58 | HTablePool pool = new HTablePool(); 59 | WordCountDAO DAO = new WordCountDAO(pool); 60 | DAO.CreateTable(); 61 | int count = 0; 62 | for(WordCountDAO.WordCount w: wordCountMap.values()){ 63 | DAO.addWordCount(w); 64 | if(++count % 50000==0){ 65 | System.out.println("Loading "+count+" words"); 66 | } 67 | } 68 | pool.closeTablePool(DAO.TABLE_NAME); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- 
/twitter-tools-hadoop/src/main/java/cc/twittertools/hbase/WordCountDAO.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.hbase; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Comparator; 6 | import java.util.List; 7 | import java.util.NavigableMap; 8 | import java.util.Set; 9 | import java.util.TreeMap; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.hbase.HBaseConfiguration; 13 | import org.apache.hadoop.hbase.HColumnDescriptor; 14 | import org.apache.hadoop.hbase.HTableDescriptor; 15 | import org.apache.hadoop.hbase.MasterNotRunningException; 16 | import org.apache.hadoop.hbase.ZooKeeperConnectionException; 17 | import org.apache.hadoop.hbase.client.Delete; 18 | import org.apache.hadoop.hbase.client.Get; 19 | import org.apache.hadoop.hbase.client.HBaseAdmin; 20 | import org.apache.hadoop.hbase.client.HTableInterface; 21 | import org.apache.hadoop.hbase.client.HTablePool; 22 | import org.apache.hadoop.hbase.client.Put; 23 | import org.apache.hadoop.hbase.client.Result; 24 | import org.apache.hadoop.hbase.client.Scan; 25 | import org.apache.hadoop.hbase.util.Bytes; 26 | import org.apache.log4j.Logger; 27 | 28 | 29 | public class WordCountDAO { 30 | private final static int DAY = 60*24; 31 | private final static int INTERVAL = 5; 32 | public static int NUM_INTERVALS = DAY/INTERVAL; 33 | public static final byte[] TABLE_NAME = Bytes.toBytes("wordcount"); 34 | public static final byte[] COLUMN_FAMILY = Bytes.toBytes("count"); 35 | 36 | private static final Logger log = Logger.getLogger(WordCountDAO.class); 37 | 38 | private HTablePool pool; 39 | 40 | public WordCountDAO(HTablePool pool) { 41 | this.pool = pool; 42 | } 43 | 44 | public void CreateTable() throws IOException, ZooKeeperConnectionException{ 45 | Configuration conf = HBaseConfiguration.create(); 46 | 47 | HBaseAdmin hbase = new HBaseAdmin(conf); 48 | 
HTableDescriptor[] wordcounts = hbase.listTables("wordcount"); 49 | 50 | if(wordcounts.length != 0){ //Drop Table if Exists 51 | hbase.disableTable(TABLE_NAME); 52 | hbase.deleteTable(TABLE_NAME); 53 | } 54 | 55 | HTableDescriptor wordcount = new HTableDescriptor(TABLE_NAME); 56 | hbase.createTable(wordcount); 57 | // Cannot edit a stucture on an active table. 58 | hbase.disableTable(TABLE_NAME); 59 | HColumnDescriptor columnFamily = new HColumnDescriptor(COLUMN_FAMILY); 60 | hbase.addColumn(TABLE_NAME, columnFamily); 61 | hbase.enableTable(TABLE_NAME); 62 | 63 | hbase.close(); 64 | } 65 | 66 | private static Get mkGet(String word) throws IOException { 67 | log.debug(String.format("Creating Get for %s", word)); 68 | 69 | Get g = new Get(Bytes.toBytes(word)); 70 | g.addFamily(COLUMN_FAMILY); 71 | return g; 72 | } 73 | 74 | private static Put mkPut(WordCount w){ 75 | log.debug(String.format("Creating Put for %s", w.word)); 76 | 77 | Put p = new Put(w.word); 78 | // add integer compression here 79 | // convert 2-d byte array to 1-d byte array 80 | byte[] storage = new byte[NUM_INTERVALS*Integer.SIZE/Byte.SIZE]; 81 | for(int i=0; i< NUM_INTERVALS; i++){ 82 | for(int j=0; j getWordCount(String word) throws IOException { 112 | HTableInterface words = pool.getTable(TABLE_NAME); 113 | Get g = mkGet(word); 114 | Result result = words.get(g); 115 | if (result.isEmpty()) { 116 | log.info(String.format("word %s not found.", word)); 117 | return null; 118 | } 119 | 120 | List wordCounts = WordCount.GetWordCountFromResults(result); 121 | words.close(); 122 | return wordCounts; 123 | } 124 | 125 | public void deleteUser(String word) throws IOException { 126 | HTableInterface words = pool.getTable(TABLE_NAME); 127 | 128 | Delete d = mkDel(word); 129 | words.delete(d); 130 | 131 | words.close(); 132 | } 133 | 134 | public static class WordCount{ 135 | public byte[] word; 136 | public byte[] column_id; 137 | public byte[][] count; 138 | 139 | public WordCount(byte[] word, byte[] 
column_id){ 140 | this.word = word; 141 | this.column_id = column_id; 142 | this.count = new byte[NUM_INTERVALS][]; 143 | for(int i=0; i < NUM_INTERVALS; i++){ 144 | this.count[i] = Bytes.toBytes(0); 145 | } 146 | } 147 | 148 | public WordCount(String word, String column_id){ 149 | this.word = Bytes.toBytes(word); 150 | this.column_id = Bytes.toBytes(column_id); 151 | this.count = new byte[NUM_INTERVALS][]; 152 | for(int i=0; i < NUM_INTERVALS; i++){ 153 | this.count[i] = Bytes.toBytes(0); 154 | } 155 | } 156 | 157 | private WordCount(byte[] word, byte[] column_id, byte[][] count){ 158 | this.word = word; 159 | this.column_id = column_id; 160 | this.count = count; 161 | } 162 | 163 | public static List GetWordCountFromResults(Result r){ 164 | List wordCounts = new ArrayList(); 165 | byte[] word = r.getRow(); 166 | // Map from column qualifiers to values 167 | NavigableMap familyMap = r.getFamilyMap(COLUMN_FAMILY); 168 | for(byte[] column: familyMap.keySet()){ 169 | byte[] value = familyMap.get(column); 170 | // decompression 171 | byte[][] count = new byte[NUM_INTERVALS][Integer.SIZE/Byte.SIZE]; 172 | for(int i=0; i { 11 | private static final String DATE_FORMAT = "EEE MMM d k:m:s ZZZZZ yyyy"; // "Fri Mar 29 11:03:41 +0000 2013"; 12 | private static final SimpleDateFormat DATE_PARSER = new SimpleDateFormat(DATE_FORMAT); 13 | 14 | public Long exec(Tuple input) throws IOException { 15 | if (input == null || input.size() == 0) { 16 | return -1L; 17 | } 18 | 19 | String createdAt = (String) input.get(0); 20 | long epoch; 21 | try { 22 | epoch = DATE_PARSER.parse(createdAt).getTime() / 1000; 23 | } catch (ParseException e) { 24 | epoch = -1L; 25 | } 26 | 27 | return epoch; 28 | } 29 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/GetLatitude.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.piggybank; 2 | 3 | 
import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.pig.EvalFunc; 7 | import org.apache.pig.data.DataBag; 8 | import org.apache.pig.data.Tuple; 9 | 10 | // Sample usage: cc.twittertools.piggybank.GetLatitude($0#'geo'#'coordinates') 11 | public class GetLatitude extends EvalFunc { 12 | public String exec(Tuple input) throws IOException { 13 | DataBag bag = (DataBag) input.get(0); 14 | Iterator it = bag.iterator(); 15 | if (!it.hasNext()) { 16 | return null; 17 | } 18 | Tuple tup = it.next(); 19 | 20 | return (String) tup.get(0); 21 | } 22 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/GetLongitude.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.piggybank; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.pig.EvalFunc; 7 | import org.apache.pig.data.DataBag; 8 | import org.apache.pig.data.Tuple; 9 | 10 | // Sample usage: cc.twittertools.piggybank.GetLongitude($0#'geo'#'coordinates'); 11 | public class GetLongitude extends EvalFunc { 12 | public String exec(Tuple input) throws IOException { 13 | DataBag bag = (DataBag) input.get(0); 14 | Iterator it = bag.iterator(); 15 | if (!it.hasNext()) { 16 | return null; 17 | } 18 | it.next(); 19 | if (!it.hasNext()) { 20 | return null; 21 | } 22 | 23 | Tuple tup = it.next(); 24 | 25 | return (String) tup.get(0); 26 | } 27 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/IsMap.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.piggybank; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import org.apache.pig.FilterFunc; 7 | import org.apache.pig.data.Tuple; 8 | 9 | public class IsMap extends FilterFunc { 10 | 11 
| @Override 12 | public Boolean exec(Tuple input) throws IOException { 13 | if (input == null || input.size() == 0) { 14 | return false; 15 | } 16 | 17 | return (input.get(0) instanceof Map); 18 | } 19 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/udf/GetDate.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.udf; 2 | 3 | import java.io.IOException; 4 | import java.util.regex.Matcher; 5 | import java.util.regex.Pattern; 6 | import org.apache.lucene.analysis.Tokenizer; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.pig.EvalFunc; 9 | import org.apache.pig.data.Tuple; 10 | import cc.twittertools.index.LowerCaseEntityPreservingFilter; 11 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 12 | 13 | public class GetDate extends EvalFunc{ 14 | 15 | public String exec(Tuple input) throws IOException { 16 | if(input == null || input.size() == 0){ 17 | return null; 18 | } 19 | //Standard Time Format: Tue Feb 08 23:59:59 +0000 2011 20 | try{ 21 | String str = (String) input.get(0); 22 | String[] groups = str.split("\\s+"); 23 | String year = groups[5]; 24 | String month = groups[1]; 25 | String day= groups[2]; 26 | return year+" "+month+" "+day; 27 | }catch(Exception e){ 28 | throw new IOException("caught exception",e); 29 | } 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/udf/GetInterval.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.udf; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.pig.EvalFunc; 6 | import org.apache.pig.data.Tuple; 7 | 8 | public class GetInterval extends EvalFunc{ 9 | public String exec(Tuple input) throws IOException { 10 | if(input == null || input.size() == 0){ 11 | return 
null; 12 | } 13 | //Standard Time Format: Tue Feb 08 23:59:59 +0000 2011 14 | try{ 15 | String str = (String) input.get(0); 16 | String[] groups = str.split("\\s+"); 17 | String time = groups[3]; 18 | String[] timeGroups= time.split(":"); 19 | int interval = (Integer.valueOf(timeGroups[0]))*12 + (Integer.valueOf(timeGroups[1])/5); 20 | return String.valueOf(interval); 21 | }catch(Exception e){ 22 | throw new IOException("caught exception",e); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/udf/LuceneTokenizer.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.udf; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.StringTokenizer; 6 | 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 11 | import org.apache.lucene.analysis.en.PorterStemFilter; 12 | import org.apache.lucene.analysis.standard.StandardTokenizer; 13 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 14 | import org.apache.lucene.util.Version; 15 | import org.apache.pig.EvalFunc; 16 | import org.apache.pig.data.BagFactory; 17 | import org.apache.pig.data.DataBag; 18 | import org.apache.pig.data.Tuple; 19 | import org.apache.pig.data.TupleFactory; 20 | 21 | import cc.twittertools.index.LowerCaseEntityPreservingFilter; 22 | 23 | public class LuceneTokenizer extends EvalFunc{ 24 | TupleFactory mTupleFactory = TupleFactory.getInstance(); 25 | BagFactory mBagFactory = BagFactory.getInstance(); 26 | 27 | public DataBag exec(Tuple input) throws IOException{ 28 | try { 29 | DataBag output = mBagFactory.newDefaultBag(); 30 | Object o = input.get(0); 31 | if (!(o instanceof String)) { 32 | throw new IOException("Expected 
input to be chararray, but got " + o.getClass().getName()); 33 | } 34 | Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader((String)o)); 35 | TokenStream tokenstream = new LowerCaseEntityPreservingFilter(source); 36 | tokenstream.reset(); 37 | while (tokenstream.incrementToken()){ 38 | String token = tokenstream.getAttribute(CharTermAttribute.class).toString(); 39 | output.add(mTupleFactory.newTuple(token)); 40 | } 41 | return output; 42 | } catch (Exception e) { 43 | // error handling goes here 44 | throw new IOException("caught exception",e); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/wordcountbytime.pig: -------------------------------------------------------------------------------- 1 | register 'jar/elephant-bird-core-4.5.jar'; 2 | register 'jar/elephant-bird-pig-4.5.jar'; 3 | register 'jar/elephant-bird-hadoop-compat-4.5.jar'; 4 | register 'jar/json-simple-1.1.1.jar'; 5 | register 'jar/twitter-tools-hadoop-1.0-SNAPSHOT.jar'; 6 | register 'jar/twitter-tools-core-1.4.3-SNAPSHOT.jar'; 7 | register 'jar/lucene-core-4.8.0.jar'; 8 | register 'jar/lucene-analyzers-common-4.8.0.jar'; 9 | register 'jar/twitter-text-1.9.0.jar'; 10 | 11 | raw = load '/shared/collections/Tweets2011/20110208-099.json.gz' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); 12 | 13 | a = foreach raw generate $0#'created_at',$0#'text'; 14 | b = foreach a generate cc.twittertools.udf.GetDate($0), cc.twittertools.udf.GetInterval($0), flatten(cc.twittertools.udf.LuceneTokenizer($1)); 15 | c = group b by ($0,$1,$2); 16 | d = foreach c generate flatten(group),COUNT(b); 17 | 18 | store d into 'wordcount'; 19 | -------------------------------------------------------------------------------- /twitter-tools-rm3/README.md: -------------------------------------------------------------------------------- 1 | microblog-demos 2 | =============== 3 | 4 | Examples of using the 
[2013 TREC microblog API](http://twittertools.cc/). Basically clones IndriRunQuery. 5 | 6 | Getting Started 7 | -------------- 8 | 9 | Once you've cloned the repository, build the package with Maven: 10 | 11 | ``` 12 | $ mvn clean package appassembler:assemble 13 | ``` 14 | 15 | Appassembler will automatically generate a launch scripts for: 16 | 17 | + `target/appassembler/bin/RunQueries`: baseline run. with or without RM3 feedback 18 | 19 | To automatically generate project files for Eclipse: 20 | 21 | ``` 22 | $ mvn eclipse:clean 23 | $ mvn eclipse:eclipse 24 | ``` 25 | 26 | You can then use Eclipse's Import "Existing Projects into Workspace" functionality to import the project. 27 | 28 | 29 | Invoking Sample Runs 30 | -------------------- 31 | After building, you can run the sample programs via somthing like this: 32 | 33 | ``` 34 | $ sh ./target/appassembler/bin/RunQueries ./config/params_run.json 35 | ``` 36 | 37 | which will run a simple baseline query likelihood retrieval. All runnable programs are in ./target/appassembler/bin/ . Also, all programs take a single argument: a JSON-formatted file that will look something like this: 38 | ``` 39 | { 40 | "queries" : "./data/topics.microblog2012.txt", 41 | "host" : "", 42 | "port" : 9090, 43 | "num_results" : 1000, 44 | "fb_docs" : 0, 45 | "fb_terms" : 0, 46 | "group" : "", 47 | "token" : "", 48 | "runtag" : "" 49 | } 50 | ``` 51 | 52 | Hopefully these variables are self-explanatory. Setting either `fb_docs` or `fb_terms` to 0 gives a run with no feedback. If both of these 53 | are set >0, pseudo-feedback using RM3 is used. 
54 | 55 | License 56 | ------- 57 | 58 | Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 59 | -------------------------------------------------------------------------------- /twitter-tools-rm3/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | mvn clean package appassembler:assemble 3 | rm target/appassembler/bin/*bat 4 | chmod +x ./target/appassembler/bin/* 5 | -------------------------------------------------------------------------------- /twitter-tools-rm3/config/run_params_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries" : "./data/topics.microblog2011.json", 3 | "stopper" : "./data/stoplist.twitter", 4 | "fb_docs" : 50, 5 | "fb_terms" : 20, 6 | "host" : "", 7 | "port" : 9090, 8 | "num_results" : 1000, 9 | "group" : "", 10 | "token" : "", 11 | "runtag" : "" 12 | } 13 | -------------------------------------------------------------------------------- /twitter-tools-rm3/data/stoplist.twitter: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | new 4 | y 5 | i'm 6 | el 7 | que 8 | tinyurl.com 9 | en 10 | t.co 11 | rt 12 | ow.ly 13 | bit.ly 14 | twitpic 15 | http 16 | html 17 | www 18 | https 19 | com 20 | php 21 | htm 22 | free 23 | cfm 24 | asp 25 | jsp 26 | a 27 | about 28 | above 29 | according 30 | across 31 | after 32 | afterwards 33 | again 34 | against 35 | albeit 36 | all 37 | almost 38 | alone 39 | along 40 | already 41 | also 42 | although 43 | always 44 | am 45 | among 46 | amongst 47 | an 48 | and 49 | another 50 | any 51 | anybody 52 | anyhow 53 | anyone 54 | anything 55 | anyway 56 | anywhere 57 | apart 58 | are 59 | around 60 | as 61 | at 62 | av 63 | be 64 | became 65 | because 66 | become 67 | becomes 68 | becoming 69 | been 70 | before 71 | beforehand 72 | behind 73 | being 74 | below 75 | beside 76 | besides 77 | between 78 | beyond 79 
| both 80 | but 81 | by 82 | can 83 | cannot 84 | canst 85 | certain 86 | cf 87 | choose 88 | contrariwise 89 | cos 90 | could 91 | cu 92 | day 93 | do 94 | does 95 | doesn't 96 | doing 97 | dost 98 | doth 99 | double 100 | down 101 | dual 102 | during 103 | each 104 | either 105 | else 106 | elsewhere 107 | enough 108 | et 109 | etc 110 | even 111 | ever 112 | every 113 | everybody 114 | everyone 115 | everything 116 | everywhere 117 | except 118 | excepted 119 | excepting 120 | exception 121 | exclude 122 | excluding 123 | exclusive 124 | far 125 | farther 126 | farthest 127 | few 128 | ff 129 | first 130 | for 131 | formerly 132 | forth 133 | forward 134 | from 135 | front 136 | further 137 | furthermore 138 | furthest 139 | get 140 | go 141 | had 142 | halves 143 | hardly 144 | has 145 | hast 146 | hath 147 | have 148 | he 149 | hence 150 | henceforth 151 | her 152 | here 153 | hereabouts 154 | hereafter 155 | hereby 156 | herein 157 | hereto 158 | hereupon 159 | hers 160 | herself 161 | him 162 | himself 163 | hindmost 164 | his 165 | hither 166 | hitherto 167 | how 168 | however 169 | howsoever 170 | i 171 | ie 172 | if 173 | in 174 | inasmuch 175 | inc 176 | include 177 | included 178 | including 179 | indeed 180 | indoors 181 | inside 182 | insomuch 183 | instead 184 | into 185 | inward 186 | inwards 187 | is 188 | it 189 | its 190 | itself 191 | just 192 | kind 193 | kg 194 | km 195 | last 196 | latter 197 | latterly 198 | less 199 | lest 200 | let 201 | like 202 | little 203 | ltd 204 | many 205 | may 206 | maybe 207 | me 208 | meantime 209 | meanwhile 210 | might 211 | moreover 212 | most 213 | mostly 214 | more 215 | mr 216 | mrs 217 | ms 218 | much 219 | must 220 | my 221 | myself 222 | namely 223 | need 224 | neither 225 | never 226 | nevertheless 227 | next 228 | no 229 | nobody 230 | none 231 | nonetheless 232 | noone 233 | nope 234 | nor 235 | not 236 | nothing 237 | notwithstanding 238 | now 239 | nowadays 240 | nowhere 241 | of 242 | off 243 | 
often 244 | ok 245 | on 246 | once 247 | one 248 | only 249 | onto 250 | or 251 | other 252 | others 253 | otherwise 254 | ought 255 | our 256 | ours 257 | ourselves 258 | out 259 | outside 260 | over 261 | own 262 | per 263 | perhaps 264 | plenty 265 | provide 266 | quite 267 | rather 268 | really 269 | round 270 | said 271 | sake 272 | same 273 | sang 274 | save 275 | saw 276 | see 277 | seeing 278 | seem 279 | seemed 280 | seeming 281 | seems 282 | seen 283 | seldom 284 | selves 285 | sent 286 | several 287 | shalt 288 | she 289 | should 290 | shown 291 | sideways 292 | since 293 | slept 294 | slew 295 | slung 296 | slunk 297 | smote 298 | so 299 | some 300 | somebody 301 | somehow 302 | someone 303 | something 304 | sometime 305 | sometimes 306 | somewhat 307 | somewhere 308 | spake 309 | spat 310 | spoke 311 | spoken 312 | sprang 313 | sprung 314 | stave 315 | staves 316 | still 317 | such 318 | supposing 319 | than 320 | that 321 | the 322 | thee 323 | their 324 | them 325 | themselves 326 | then 327 | thence 328 | thenceforth 329 | there 330 | thereabout 331 | thereabouts 332 | thereafter 333 | thereby 334 | therefore 335 | therein 336 | thereof 337 | thereon 338 | thereto 339 | thereupon 340 | these 341 | they 342 | this 343 | those 344 | thou 345 | though 346 | thrice 347 | through 348 | throughout 349 | thru 350 | thus 351 | thy 352 | thyself 353 | till 354 | to 355 | together 356 | too 357 | toward 358 | towards 359 | ugh 360 | unable 361 | under 362 | underneath 363 | unless 364 | unlike 365 | until 366 | up 367 | upon 368 | upward 369 | upwards 370 | us 371 | use 372 | used 373 | using 374 | very 375 | via 376 | vs 377 | want 378 | was 379 | we 380 | week 381 | well 382 | were 383 | what 384 | whatever 385 | whatsoever 386 | when 387 | whence 388 | whenever 389 | whensoever 390 | where 391 | whereabouts 392 | whereafter 393 | whereas 394 | whereat 395 | whereby 396 | wherefore 397 | wherefrom 398 | wherein 399 | whereinto 400 | whereof 401 | whereon 
402 | wheresoever 403 | whereto 404 | whereunto 405 | whereupon 406 | wherever 407 | wherewith 408 | whether 409 | whew 410 | which 411 | whichever 412 | whichsoever 413 | while 414 | whilst 415 | whither 416 | who 417 | whoa 418 | whoever 419 | whole 420 | whom 421 | whomever 422 | whomsoever 423 | whose 424 | whosoever 425 | why 426 | will 427 | wilt 428 | with 429 | within 430 | without 431 | worse 432 | worst 433 | would 434 | wow 435 | ye 436 | yet 437 | year 438 | yippee 439 | you 440 | your 441 | yours 442 | yourself 443 | yourselves 444 | -------------------------------------------------------------------------------- /twitter-tools-rm3/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | edu.illinois.lis 4 | twitter-tools-rm3 5 | jar 6 | 0.1-SNAPSHOT 7 | twitter-tools-rm3 8 | demo classes for using the TREC 2013 Microblog API 9 | http://people.lis.illinois.edu/~mefron/ 10 | 11 | 12 | 13 | The Apache Software License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | repo 16 | 17 | 18 | 19 | 20 | scm:git:git@github.com:milesefron/microblog-demos.git 21 | scm:git:git@github.com:milesefron/microblog-demos.git 22 | git@github.com:milesefron/microblog-demos.git 23 | 24 | 25 | 26 | 27 | milesefron 28 | Miles Efron 29 | mefron@illinois.edu 30 | 31 | 32 | 33 | 34 | org.sonatype.oss 35 | oss-parent 36 | 7 37 | 38 | 39 | 40 | 41 | 42 | org.codehaus.mojo 43 | appassembler-maven-plugin 44 | 1.3.1 45 | 46 | 47 | 48 | edu.illinois.lis.search.RunQueries 49 | RunQueries 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | UTF-8 59 | UTF-8 60 | 61 | 62 | 63 | 64 | junit 65 | junit 66 | 4.11 67 | test 68 | 69 | 70 | commons-cli 71 | commons-cli 72 | 1.2 73 | 74 | 75 | commons-io 76 | commons-io 77 | 2.4 78 | 79 | 80 | org.apache.commons 81 | commons-math3 82 | 3.2 83 | 84 | 85 | cc.twittertools 86 | twitter-tools-core 87 | 1.4.2 88 | 89 | 90 | 91 | 
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/feedback/FeedbackModel.java:
--------------------------------------------------------------------------------
package edu.illinois.lis.feedback;

import java.text.DecimalFormat;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import cc.twittertools.thrift.gen.TResult;
import edu.illinois.lis.document.FeatureVector;
import edu.illinois.lis.query.GQuery;
import edu.illinois.lis.utils.KeyValuePair;
import edu.illinois.lis.utils.ScorableComparator;
import edu.illinois.lis.utils.Stopper;

/**
 * Base class for pseudo-relevance feedback models.  Subclasses estimate
 * {@link #features} (term/weight pairs) from the pseudo-relevant hits in
 * {@link #relDocs} inside {@link #build(Stopper)}; this class provides the
 * common export paths (as a GQuery, a FeatureVector, a Map, or text).
 *
 * NOTE(review): the generic type parameters below were reconstructed from
 * usage; the source this was recovered from had angle-bracket content
 * stripped -- confirm against the original repository.
 */
public abstract class FeedbackModel {
	protected List<TResult> relDocs;          // pseudo-relevant hits the model is estimated from
	protected GQuery originalQuery;
	protected int fbDocCount = 20;            // NOTE(review): never read in this class or the visible subclass
	protected int fbTermCount = 20;           // max terms emitted by asGquery()
	protected List<KeyValuePair> features;    // term/weight pairs, populated by build()
	protected Stopper stopper;

	/**
	 * Estimates the feedback model.  The base implementation only records the
	 * stopper; subclasses override this and fill {@link #features}.
	 */
	public void build(Stopper stopper) {
		this.stopper = stopper;
	}

	/**
	 * Exports the top {@link #fbTermCount} feedback terms as a new GQuery
	 * that carries the original query's title and text.
	 */
	public GQuery asGquery() {
		GQuery newQuery = new GQuery();
		newQuery.setTitle(originalQuery.getTitle());
		newQuery.setText(originalQuery.getText());

		FeatureVector finalVector = new FeatureVector(stopper);

		// sort by decreasing score so the strongest terms are kept
		Collections.sort(features, new ScorableComparator(true));

		int i = 0;
		Iterator<KeyValuePair> it = features.iterator();
		while (it.hasNext() && i++ < fbTermCount) {
			KeyValuePair tuple = it.next();
			finalVector.addTerm(tuple.getKey(), tuple.getScore());
		}

		newQuery.setFeatureVector(finalVector);
		return newQuery;
	}

	/** Exports ALL feedback terms (no truncation) as a FeatureVector. */
	public FeatureVector asFeatureVector() {
		FeatureVector f = new FeatureVector(stopper);
		for (KeyValuePair tuple : features) {
			f.addTerm(tuple.getKey(), tuple.getScore());
		}
		return f;
	}

	/** Exports all feedback terms as a term -&gt; score map. */
	public Map<String, Double> asMap() {
		Map<String, Double> map = new HashMap<String, Double>(features.size());
		for (KeyValuePair tuple : features) {
			map.put(tuple.getKey(), tuple.getScore());
		}
		return map;
	}

	@Override
	public String toString() {
		return toString(features.size());
	}

	/**
	 * Pretty-prints the top k terms, one "score term" pair per line, with
	 * scores normalized to sum to one over those k terms.
	 */
	public String toString(int k) {
		DecimalFormat format = new DecimalFormat("#.#####################");

		Collections.sort(features, new ScorableComparator(true));

		// normalizing constant over the top k terms
		double sum = 0.0;
		int i = 0;
		Iterator<KeyValuePair> it = features.iterator();
		while (it.hasNext() && i++ < k) {
			sum += it.next().getScore();
		}
		// BUG FIX: an empty model (or all-zero scores) previously emitted
		// NaN/Infinity from the division below
		if (sum == 0.0) {
			sum = 1.0;
		}

		StringBuilder b = new StringBuilder();
		it = features.iterator();
		i = 0;
		while (it.hasNext() && i++ < k) {
			KeyValuePair tuple = it.next();
			b.append(format.format(tuple.getScore() / sum) + " " + tuple.getKey() + "\n");
		}
		return b.toString();
	}

	/** Sets the pseudo-relevant result list the model is estimated from. */
	public void setRes(List<TResult> relDocs) {
		this.relDocs = relDocs;
	}

	public void setOriginalQuery(GQuery originalQuery) {
		this.originalQuery = originalQuery;
	}

	public void setFbTermCount(int fbTermCount) {
		this.fbTermCount = fbTermCount;
	}
}
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/feedback/FeedbackRelevanceModel.java:
--------------------------------------------------------------------------------
package edu.illinois.lis.feedback;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import cc.twittertools.thrift.gen.TResult;

import edu.illinois.lis.document.FeatureVector;
import edu.illinois.lis.utils.Stopper;
import edu.illinois.lis.utils.KeyValuePair;

/**
 * Relevance-model style feedback: each vocabulary term observed in the
 * pseudo-relevant documents is weighted by its document-level probability,
 * scaled by each document's retrieval score (and an optional per-document
 * weight), averaged over the feedback documents.
 */
public class FeedbackRelevanceModel extends FeedbackModel {
	private boolean stripNumbers = false;   // NOTE(review): declared but never consulted in this class
	private double[] docWeights = null;     // optional per-document weights, parallel to relDocs

	@Override
	public void build(Stopper stopper) {
		this.stopper = stopper;
		try {
			Set<String> vocab = new HashSet<String>();
			List<FeatureVector> fbDocVectors = new LinkedList<FeatureVector>();

			// retrieval status values, one per feedback doc, in iteration order
			double[] rsvs = new double[relDocs.size()];
			int k = 0;
			for (TResult hit : relDocs) {
				rsvs[k++] = hit.getRsv();
			}

			// one term vector per doc; accumulate the combined vocabulary
			for (TResult hit : relDocs) {
				String text = hit.getText().toLowerCase();
				FeatureVector docVector = new FeatureVector(text, stopper);
				vocab.addAll(docVector.getFeatures());
				fbDocVectors.add(docVector);
			}

			features = new LinkedList<KeyValuePair>();

			for (String term : vocab) {
				double fbWeight = 0.0;

				k = 0;
				for (FeatureVector docVector : fbDocVectors) {
					double docWeight = 1.0;
					if (docWeights != null)
						docWeight = docWeights[k];
					// p(term|doc), scaled by the doc's retrieval score and optional weight
					double docProb = docVector.getFeaturetWeight(term) / docVector.getLength();
					docProb *= rsvs[k++] * docWeight;

					fbWeight += docProb;
				}

				fbWeight /= (double) fbDocVectors.size();

				features.add(new KeyValuePair(term, fbWeight));
			}
		} catch (Exception e) {
			// best-effort: a malformed hit leaves the model partially built
			e.printStackTrace();
		}
	}

	/** Optional per-document weights; indexed parallel to the result list. */
	public void setDocWeights(double[] docWeights) {
		this.docWeights = docWeights;
	}
}
-------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQueries.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import java.util.Iterator; 4 | 5 | /** 6 | * A container for holding a bunch of GQuery objects, with various types of convenience functionality added in 7 | * instantiating classes. 8 | * 9 | * @author Miles Efron 10 | * 11 | */ 12 | public interface GQueries { 13 | public void read(String pathToQueries); 14 | 15 | public Iterator iterator(); 16 | 17 | public GQuery getIthQuery(int i); 18 | 19 | public GQuery getNamedQuery(String queryName); 20 | 21 | public int numQueries(); 22 | } 23 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQueriesJsonImpl.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import org.apache.log4j.Logger; 12 | 13 | 14 | import com.google.gson.JsonArray; 15 | import com.google.gson.JsonElement; 16 | import com.google.gson.JsonObject; 17 | import com.google.gson.JsonParser; 18 | 19 | import edu.illinois.lis.document.FeatureVector; 20 | 21 | 22 | /** 23 | * reads and holds GQueries stored as a serialized JSON file on disk. 
24 | * 25 | * @author Miles Efron 26 | * 27 | */ 28 | public class GQueriesJsonImpl implements GQueries { 29 | private static final Logger LOG = Logger.getLogger(GQueriesJsonImpl.class); 30 | 31 | private static final JsonParser JSON_PARSER = new JsonParser(); 32 | private List queryList; 33 | private Map nameToIndex; 34 | 35 | public void read(String pathToQueries) { 36 | JsonObject obj = null; 37 | try { 38 | obj = (JsonObject) JSON_PARSER.parse(new BufferedReader(new FileReader(pathToQueries))); 39 | } catch (Exception e) { 40 | LOG.fatal("died reading queries from json file", e); 41 | System.exit(-1); 42 | } 43 | 44 | 45 | JsonArray queryObjectArray = obj.getAsJsonArray("queries"); 46 | queryList = new ArrayList(queryObjectArray.size()); 47 | nameToIndex = new HashMap(queryList.size()); 48 | Iterator queryObjectIterator = queryObjectArray.iterator(); 49 | int k=0; 50 | while(queryObjectIterator.hasNext()) { 51 | JsonObject queryObject = (JsonObject) queryObjectIterator.next(); 52 | String title = queryObject.get("title").getAsString(); 53 | String text = queryObject.get("text").getAsString(); 54 | double epoch = queryObject.get("epoch").getAsDouble(); 55 | long querytweettime = queryObject.get("querytweettime").getAsLong(); 56 | nameToIndex.put(title, k++); 57 | FeatureVector featureVector = new FeatureVector(null); 58 | JsonArray modelObjectArray = queryObject.getAsJsonArray("model"); 59 | Iterator featureIterator = modelObjectArray.iterator(); 60 | while(featureIterator.hasNext()) { 61 | JsonObject featureObject = (JsonObject)featureIterator.next(); 62 | double weight = featureObject.get("weight").getAsDouble(); 63 | String feature = featureObject.get("feature").getAsString(); 64 | featureVector.addTerm(feature, weight); 65 | } 66 | 67 | 68 | GQuery gQuery = new GQuery(); 69 | gQuery.setTitle(title); 70 | gQuery.setText(text); 71 | gQuery.setEpoch(epoch); 72 | gQuery.setQuerytweettime(querytweettime); 73 | gQuery.setFeatureVector(featureVector); 74 | 75 | 
queryList.add(gQuery); 76 | 77 | } 78 | } 79 | 80 | public GQuery getIthQuery(int i) { 81 | if(queryList == null || i >= queryList.size()) { 82 | LOG.fatal("died trying to get query number " + i + " when we have only " + queryList.size() + " queries."); 83 | System.exit(-1); 84 | } 85 | return queryList.get(i); 86 | } 87 | 88 | public GQuery getNamedQuery(String queryName) { 89 | if(queryList == null || ! nameToIndex.containsKey(queryName)) { 90 | LOG.fatal("died trying to get query " + queryName + "."); 91 | System.exit(-1); } 92 | return queryList.get(nameToIndex.get(queryName)); 93 | } 94 | 95 | 96 | public Iterator iterator() { 97 | return queryList.iterator(); 98 | } 99 | 100 | public int numQueries() { 101 | return queryList.size(); 102 | } 103 | 104 | @Override 105 | public String toString() { 106 | StringBuilder b = new StringBuilder(); 107 | 108 | Iterator it = queryList.iterator(); 109 | while(it.hasNext()) { 110 | b.append(it.next()); 111 | } 112 | 113 | return b.toString(); 114 | } 115 | 116 | 117 | 118 | } 119 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQuery.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import edu.illinois.lis.document.FeatureVector; 8 | 9 | 10 | /** 11 | * a fairly rich representation of a query (or query-like) object. at a minimum, it will typically contain a 12 | * name some text. 
13 | * 14 | * @author Miles Efron 15 | * 16 | */ 17 | public class GQuery { 18 | private String name; 19 | private String text; 20 | private double epoch = -1.0; 21 | private long querytweettime = -1L; 22 | private FeatureVector featureVector; 23 | 24 | 25 | public String getTitle() { 26 | return name; 27 | } 28 | public String getText() { 29 | return text; 30 | } 31 | public void setTitle(String name) { 32 | this.name = name; 33 | } 34 | public void setText(String text) { 35 | this.text = text; 36 | } 37 | public void setEpoch(double epoch) { 38 | this.epoch = epoch; 39 | } 40 | public void setQuerytweettime(long querytweettime) { 41 | this.querytweettime = querytweettime; 42 | } 43 | public double getEpoch() { 44 | return epoch; 45 | } 46 | public long getQuerytweettime() { 47 | return querytweettime; 48 | } 49 | 50 | 51 | public FeatureVector getFeatureVector() { 52 | return featureVector; 53 | } 54 | public void setFeatureVector(FeatureVector featureVector) { 55 | this.featureVector = featureVector; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/TrecTemporalTopic.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import com.google.common.base.Preconditions; 4 | 5 | public class TrecTemporalTopic { 6 | private String query; 7 | private String id; 8 | private long time; 9 | private double epoch; 10 | 11 | public TrecTemporalTopic(String id, String query, long time, double epoch) { 12 | this.id = Preconditions.checkNotNull(id); 13 | this.query = Preconditions.checkNotNull(query); 14 | Preconditions.checkArgument(time > 0); 15 | this.time = time; 16 | Preconditions.checkArgument(epoch > 0); 17 | this.epoch = epoch; 18 | } 19 | 20 | public String getId() { 21 | return id; 22 | } 23 | 24 | public String getQuery() { 25 | return query; 26 | } 27 | 28 | public long 
getQueryTweetTime() { 29 | return time; 30 | } 31 | 32 | public double getEpoch() { 33 | return epoch; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/TrecTemporalTopicSet.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.text.ParseException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.regex.Matcher; 10 | import java.util.regex.Pattern; 11 | 12 | import com.google.common.base.Charsets; 13 | import com.google.common.base.Joiner; 14 | import com.google.common.base.Preconditions; 15 | import com.google.common.collect.Lists; 16 | import com.google.common.io.Files; 17 | 18 | public class TrecTemporalTopicSet implements Iterable{ 19 | private List queries = Lists.newArrayList(); 20 | 21 | private TrecTemporalTopicSet() {} 22 | 23 | private void add(TrecTemporalTopic q) { 24 | queries.add(q); 25 | } 26 | 27 | public Iterator iterator() { 28 | return queries.iterator(); 29 | } 30 | 31 | private static final String DATE_FORMAT = "EEE MMM d k:m:s ZZZZZ yyyy"; //"Fri Mar 29 11:03:41 +0000 2013"; 32 | 33 | private static final Pattern TOP_PATTERN = Pattern.compile("", Pattern.DOTALL); 34 | private static final Pattern NUM_PATTERN = Pattern.compile(" Number: (MB\\d+) ", Pattern.DOTALL); 35 | 36 | // TREC 2011 topics uses tag 37 | private static final Pattern TITLE_PATTERN = Pattern.compile("<title>\\s*(.*?)\\s*", Pattern.DOTALL); 38 | // TREC 2012 topics use tag 39 | private static final Pattern TITLE_PATTERN2 = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL); 40 | 41 | private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL); 42 | 43 | private static final Pattern TWEETTIME_PATTERN = 
Pattern.compile("\\s*(\\d+)\\s*", Pattern.DOTALL); 44 | 45 | 46 | public static TrecTemporalTopicSet fromFile(File f) throws IOException { 47 | Preconditions.checkNotNull(f); 48 | Preconditions.checkArgument(f.exists()); 49 | 50 | String s = Joiner.on("\n").join(Files.readLines(f, Charsets.UTF_8)); 51 | TrecTemporalTopicSet queries = new TrecTemporalTopicSet(); 52 | 53 | Matcher matcher = TOP_PATTERN.matcher(s); 54 | while (matcher.find()) { 55 | String top = matcher.group(0); 56 | 57 | 58 | Matcher m = NUM_PATTERN.matcher(top); 59 | if (!m.find()) { 60 | throw new IOException("Error parsing " + f); 61 | } 62 | String id = m.group(1); 63 | // Topics from 2012 are inconsistently numbered, 64 | // e.g., MB051 should match the qrels, which has MB51 65 | if (id.matches("MB0\\d\\d")) { 66 | id = id.replace("MB0", "MB"); 67 | } 68 | 69 | m = TITLE_PATTERN.matcher(top); 70 | if (!m.find()) { 71 | m = TITLE_PATTERN2.matcher(top); 72 | if (!m.find()) { 73 | throw new IOException("Error parsing " + f); 74 | } 75 | } 76 | String text = m.group(1); 77 | 78 | m = TIMESTAMP_PATTERN.matcher(top); 79 | if (!m.find()) { 80 | throw new IOException("Error parsing " + f); 81 | } 82 | double epoch = -1.0; 83 | try { 84 | epoch = (new SimpleDateFormat(DATE_FORMAT)).parse(m.group(1)).getTime() / 1000; 85 | } catch (ParseException e) { 86 | epoch = -1.0; 87 | } 88 | 89 | m = TWEETTIME_PATTERN.matcher(top); 90 | if (!m.find()) { 91 | throw new IOException("Error parsing " + f); 92 | } 93 | long time = Long.parseLong(m.group(1)); 94 | 95 | 96 | 97 | queries.add(new TrecTemporalTopic(id, text, time, epoch)); 98 | } 99 | return queries; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/rerank/SearchReranker.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.rerank; 2 | 3 | import java.util.Collections; 4 | import 
java.util.List; 5 | 6 | 7 | import cc.twittertools.thrift.gen.TResult; 8 | 9 | 10 | public abstract class SearchReranker { 11 | protected List results; 12 | 13 | protected abstract void score(); 14 | 15 | public List getReranked() { 16 | TResultComparator comparator = new TResultComparator(true); 17 | Collections.sort(results, comparator); 18 | return results; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/rerank/TResultComparator.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.rerank; 2 | 3 | import java.util.Comparator; 4 | 5 | import cc.twittertools.thrift.gen.TResult; 6 | 7 | 8 | public class TResultComparator implements Comparator{ 9 | private boolean decreasing = true; 10 | 11 | public TResultComparator(boolean decreasing) { 12 | this.decreasing = decreasing; 13 | } 14 | public int compare(TResult x, TResult y) { 15 | double xVal = x.getRsv(); 16 | double yVal = y.getRsv(); 17 | 18 | if(decreasing) { 19 | return (xVal > yVal ? -1 : (xVal == yVal ? 0 : 1)); 20 | } else { 21 | return (xVal < yVal ? -1 : (xVal == yVal ? 
0 : 1)); 22 | } 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/search/RunQueries.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.search; 2 | 3 | import java.io.PrintStream; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | 7 | 8 | 9 | 10 | 11 | 12 | import cc.twittertools.search.api.TrecSearchThriftClient; 13 | import cc.twittertools.thrift.gen.TResult; 14 | import edu.illinois.lis.document.FeatureVector; 15 | import edu.illinois.lis.feedback.FeedbackRelevanceModel; 16 | import edu.illinois.lis.query.GQueries; 17 | import edu.illinois.lis.query.GQueriesJsonImpl; 18 | import edu.illinois.lis.query.GQuery; 19 | import edu.illinois.lis.utils.ParameterBroker; 20 | import edu.illinois.lis.utils.Stopper; 21 | 22 | public class RunQueries { 23 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 24 | 25 | private static final String HOST_OPTION = "host"; 26 | private static final String PORT_OPTION = "port"; 27 | private static final String QUERIES_OPTION = "queries"; 28 | private static final String STOPPER_OPTION = "stopper"; 29 | private static final String FB_DOCS_OPTION = "fb_docs"; 30 | private static final String FB_TERMS_OPTION = "fb_terms"; 31 | private static final String NUM_RESULTS_OPTION = "num_results"; 32 | private static final String GROUP_OPTION = "group"; 33 | private static final String TOKEN_OPTION = "token"; 34 | private static final String RUNTAG_OPTION = "runtag"; 35 | 36 | private static final double ORIG_QUERY_WEIGHT = 0.5; 37 | 38 | private RunQueries() {} 39 | 40 | public static void main(String[] args) throws Exception { 41 | ParameterBroker params = new ParameterBroker(args[0]); 42 | 43 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 44 | PrintStream err = new PrintStream(System.err, true, "UTF-8"); 45 | 46 | GQueries queries = new 
GQueriesJsonImpl(); 47 | queries.read(params.getParamValue(QUERIES_OPTION)); 48 | 49 | Stopper stopper = null; 50 | if(params.getParamValue(STOPPER_OPTION) != null) 51 | stopper = new Stopper(params.getParamValue(STOPPER_OPTION)); 52 | 53 | // max number of docs to send to output 54 | int numResults = 1000; 55 | try { 56 | if (params.getParamValue(NUM_RESULTS_OPTION) != null) { 57 | numResults = Integer.parseInt(params.getParamValue(NUM_RESULTS_OPTION)); 58 | } 59 | } catch (NumberFormatException e) { 60 | err.println("Invalid " + NUM_RESULTS_OPTION + ": " + params.getParamValue(NUM_RESULTS_OPTION)); 61 | System.exit(-1); 62 | } 63 | 64 | int fbDocs = 0; 65 | try { 66 | if (params.getParamValue(FB_DOCS_OPTION) != null) { 67 | fbDocs = Integer.parseInt(params.getParamValue(FB_DOCS_OPTION)); 68 | } 69 | } catch (NumberFormatException e) { 70 | err.println("Invalid " + FB_DOCS_OPTION + ": " + params.getParamValue(FB_DOCS_OPTION)); 71 | System.exit(-1); 72 | } 73 | 74 | int fbTerms = 0; 75 | try { 76 | if (params.getParamValue(FB_TERMS_OPTION) != null) { 77 | fbTerms = Integer.parseInt(params.getParamValue(FB_TERMS_OPTION)); 78 | } 79 | } catch (NumberFormatException e) { 80 | err.println("Invalid " + FB_TERMS_OPTION + ": " + params.getParamValue(FB_TERMS_OPTION)); 81 | System.exit(-1); 82 | } 83 | 84 | // authentication credentials 85 | String group = params.getParamValue(GROUP_OPTION); 86 | if(group==null) { 87 | err.println("Invalid " + GROUP_OPTION + ": must set a valid group ID"); 88 | System.exit(-1); 89 | } 90 | String token = params.getParamValue(TOKEN_OPTION); 91 | if(group==null) { 92 | err.println("Invalid " + TOKEN_OPTION + ": must set a valid authentication token"); 93 | System.exit(-1); 94 | } 95 | 96 | TrecSearchThriftClient client = new TrecSearchThriftClient(params.getParamValue(HOST_OPTION), 97 | Integer.parseInt(params.getParamValue(PORT_OPTION)), group, token); 98 | 99 | Iterator queryIterator = queries.iterator(); 100 | 
while(queryIterator.hasNext()) { 101 | GQuery query = queryIterator.next(); 102 | System.err.println(query.getTitle()); 103 | String queryText = query.getText(); 104 | 105 | // stupid hack. need to lowercase the query vector 106 | FeatureVector temp = new FeatureVector(null); 107 | Iterator qTerms = query.getFeatureVector().iterator(); 108 | while(qTerms.hasNext()) { 109 | String term = qTerms.next(); 110 | temp.addTerm(term.toLowerCase(), query.getFeatureVector().getFeaturetWeight(term)); 111 | } 112 | temp.normalizeToOne(); 113 | query.setFeatureVector(temp); 114 | 115 | 116 | // if we're doing feedback 117 | if(fbDocs > 0 && fbTerms > 0) { 118 | List results = client.search(queryText, query.getQuerytweettime(), fbDocs); 119 | FeedbackRelevanceModel fb = new FeedbackRelevanceModel(); 120 | fb.setOriginalQuery(query); 121 | fb.setRes(results); 122 | fb.build(stopper); 123 | 124 | FeatureVector fbVector = fb.asFeatureVector(); 125 | fbVector.pruneToSize(fbTerms); 126 | fbVector.normalizeToOne(); 127 | fbVector = FeatureVector.interpolate(query.getFeatureVector(), fbVector, ORIG_QUERY_WEIGHT); 128 | 129 | System.err.println(fbVector); 130 | 131 | StringBuilder builder = new StringBuilder(); 132 | Iterator terms = fbVector.iterator(); 133 | while(terms.hasNext()) { 134 | String term = terms.next(); 135 | if(term.length() < 2) 136 | continue; 137 | double prob = fbVector.getFeaturetWeight(term); 138 | builder.append(term + "^" + prob + " "); 139 | } 140 | queryText = builder.toString().trim(); 141 | 142 | } 143 | 144 | List results = client.search(queryText, query.getQuerytweettime(), numResults); 145 | String runTag = params.getParamValue(RUNTAG_OPTION); 146 | if(runTag==null) 147 | runTag = DEFAULT_RUNTAG; 148 | 149 | int i = 1; 150 | Iterator hitIterator = results.iterator(); 151 | while(hitIterator.hasNext()) { 152 | TResult hit = hitIterator.next(); 153 | out.println(String.format("%s Q0 %s %d %f %s", query.getTitle(), hit.getId(), i, 154 | hit.getRsv(), 
runTag)); 155 | 156 | if(i++ >= numResults) 157 | break; 158 | } 159 | 160 | } 161 | out.close(); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/searchsource/IndexWrapperMicroblogApi.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.searchsource; 2 | 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | 9 | 10 | import cc.twittertools.search.api.TrecSearchThriftClient; 11 | import cc.twittertools.thrift.gen.TResult; 12 | import edu.illinois.lis.document.FeatureVector; 13 | 14 | 15 | 16 | public class IndexWrapperMicroblogApi { 17 | // API-specific variables 18 | private String hostname; 19 | private int port; 20 | private String groupId; 21 | private String authToken; 22 | 23 | private Map seenDocs; // we store the text of any docs we've harvested. e.g. for FB. 
24 | 25 | private TrecSearchThriftClient client; 26 | 27 | 28 | public IndexWrapperMicroblogApi(String hostname, int port, String groupId, String authToken) { 29 | this.hostname = hostname; 30 | this.port = port; 31 | this.groupId = groupId; 32 | this.authToken = authToken; 33 | 34 | seenDocs = new HashMap(); 35 | 36 | try { 37 | client = new TrecSearchThriftClient(hostname, port, groupId, authToken); 38 | } catch (Exception e) { 39 | 40 | } 41 | } 42 | 43 | public double docCount() { 44 | return 0; 45 | } 46 | 47 | 48 | public double docFreq(String arg0) { 49 | return 0; 50 | } 51 | 52 | public double termFreq(String arg0) { 53 | return 0; 54 | } 55 | 56 | public double termTokenCount() { 57 | return 0; 58 | } 59 | 60 | public double termTypeCount() { 61 | return 0; 62 | } 63 | 64 | public Object getActualIndex() { 65 | return null; 66 | } 67 | 68 | public FeatureVector getDocVector(String docId) { 69 | if(seenDocs.containsKey(docId)) 70 | return new FeatureVector(seenDocs.get(docId), null); 71 | 72 | // we should also be able to ping the API to get docs we haven't already seen 73 | return null; 74 | } 75 | 76 | public List runQuery(String query, long upperBoundTime, int count) { 77 | List results = null; 78 | try { 79 | results = client.search(query,upperBoundTime, count); 80 | 81 | // store our text for future reference 82 | Iterator resultIterator = results.iterator(); 83 | while(resultIterator.hasNext()) { 84 | TResult result = resultIterator.next(); 85 | seenDocs.put(Long.toString(result.getId()), result.getText()); 86 | } 87 | } catch (Exception e) { 88 | 89 | } 90 | return results; 91 | } 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | } 100 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ExtractGqueriesFromTrecFormat.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | 4 | 5 | 6 | import 
java.io.File; 7 | 8 | import com.google.gson.Gson; 9 | import com.google.gson.GsonBuilder; 10 | import com.google.gson.JsonArray; 11 | import com.google.gson.JsonObject; 12 | 13 | import edu.illinois.lis.query.TrecTemporalTopicSet; 14 | 15 | 16 | /** 17 | * creates a simple set of gQueries from the official TREC MB topic file 18 | * 19 | * @author Miles Efron 20 | * 21 | */ 22 | public class ExtractGqueriesFromTrecFormat { 23 | 24 | private JsonObject outputObjects = null; 25 | private String pathToTrecTopics; 26 | 27 | public ExtractGqueriesFromTrecFormat(String pathToTrecTopics) { 28 | this.pathToTrecTopics = pathToTrecTopics; 29 | outputObjects = new JsonObject(); 30 | } 31 | 32 | public void harvest() { 33 | TrecTemporalTopicSet topicsFile = null; 34 | try { 35 | topicsFile = TrecTemporalTopicSet.fromFile(new File(pathToTrecTopics)); 36 | } catch (Exception e) { 37 | e.printStackTrace(); 38 | } 39 | 40 | JsonArray outputJsonArray = new JsonArray(); 41 | for(edu.illinois.lis.query.TrecTemporalTopic query : topicsFile) { 42 | 43 | 44 | JsonObject outputQueryObject = new JsonObject(); 45 | outputQueryObject.addProperty("title", query.getId()); 46 | outputQueryObject.addProperty("text", query.getQuery()); 47 | outputQueryObject.addProperty("epoch", Double.toString(query.getEpoch())); 48 | outputQueryObject.addProperty("querytweettime", Long.toString(query.getQueryTweetTime())); 49 | 50 | String text = query.getQuery(); 51 | String[] toks = text.split(" "); 52 | 53 | JsonArray modelArray = new JsonArray(); 54 | for(String tok : toks) { 55 | JsonObject tupleObject = new JsonObject(); 56 | tupleObject.addProperty("weight", 1.0); 57 | tupleObject.addProperty("feature", tok); 58 | modelArray.add(tupleObject); 59 | } 60 | outputQueryObject.add("model", modelArray); 61 | 62 | 63 | outputJsonArray.add(outputQueryObject); 64 | } 65 | outputObjects.add("queries", outputJsonArray); 66 | } 67 | 68 | 69 | public String toString() { 70 | Gson gson = new 
GsonBuilder().setPrettyPrinting().create(); 71 | String json = gson.toJson(outputObjects); 72 | return json; 73 | } 74 | 75 | 76 | 77 | 78 | public static void main(String[] args) throws Exception { 79 | String trecQueryPath = args[0]; 80 | 81 | ExtractGqueriesFromTrecFormat harvester = new ExtractGqueriesFromTrecFormat(trecQueryPath); 82 | harvester.harvest(); 83 | 84 | System.out.println(harvester); 85 | } 86 | 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/KeyValuePair.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | public class KeyValuePair implements Scorable { 4 | private String key; 5 | private double value; 6 | 7 | public KeyValuePair(String key, double value) { 8 | this.key = key; 9 | this.value = value; 10 | } 11 | 12 | public String getKey() { 13 | return key; 14 | } 15 | 16 | @Override 17 | public String toString() { 18 | StringBuilder b = new StringBuilder(value + "\t" + key); 19 | return b.toString(); 20 | } 21 | 22 | public void setScore(double score) { 23 | this.value = score; 24 | } 25 | 26 | public double getScore() { 27 | return value; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ListUtils.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.util.Iterator; 4 | import java.util.List; 5 | 6 | public class ListUtils { 7 | 8 | public static double[] listToArray(List x) { 9 | double[] a = new double[x.size()]; 10 | Iterator it = x.iterator(); 11 | int i=0; 12 | while(it.hasNext()) { 13 | a[i++] = it.next(); 14 | } 15 | return a; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- 
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/LuceneQuery.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.util.Iterator; 4 | 5 | import edu.illinois.lis.document.FeatureVector; 6 | import edu.illinois.lis.query.GQuery; 7 | 8 | public class LuceneQuery { 9 | public static String gQueryToLucene(GQuery gQuery, int k) { 10 | FeatureVector mainVector = new FeatureVector(gQuery.getText(), null); 11 | mainVector.normalizeToOne(); 12 | FeatureVector fbVector = gQuery.getFeatureVector(); 13 | fbVector.pruneToSize(k); 14 | fbVector.normalizeToOne(); 15 | FeatureVector finalVector = FeatureVector.interpolate(mainVector, fbVector, 0.5); 16 | StringBuilder b = new StringBuilder(); 17 | Iterator terms = finalVector.iterator(); 18 | while(terms.hasNext()) { 19 | String term = terms.next(); 20 | double weight = finalVector.getFeaturetWeight(term); 21 | b.append(term + "^" + weight + " "); 22 | } 23 | return b.toString().trim(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ParameterBroker.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.util.HashMap; 6 | import java.util.Iterator; 7 | import java.util.Map; 8 | import java.util.Map.Entry; 9 | import java.util.Set; 10 | 11 | 12 | import com.google.gson.JsonElement; 13 | import com.google.gson.JsonObject; 14 | import com.google.gson.JsonParser; 15 | 16 | /** 17 | * N.B. All params are stored as strings. It is the responsibility of calling classes to transform into 18 | * appropriate data types. 19 | * e.g. 
mu = Double.parseDouble(paramBroker.getParamValue("mu") 20 | * 21 | * @author Miles Efron 22 | * 23 | */ 24 | public class ParameterBroker { 25 | 26 | private static final JsonParser JSON_PARSER = new JsonParser(); 27 | private Map params; 28 | 29 | 30 | 31 | /** 32 | * constructor where we initialize from a json file of structure: 33 | * { 34 | * "param1":"value1", 35 | * "param2":"value2" 36 | * } 37 | * 38 | * @param pathToJson 39 | */ 40 | public ParameterBroker(String pathToJson) { 41 | params = new HashMap(); 42 | JsonObject json = null; 43 | try { 44 | json = (JsonObject) JSON_PARSER.parse(new BufferedReader(new FileReader(pathToJson))); 45 | } catch (Exception e) { 46 | System.err.println("died trying to parse json file: " + pathToJson); 47 | System.exit(-1); 48 | } 49 | 50 | Set> jsonEntries = json.entrySet(); 51 | Iterator> entryIterator = jsonEntries.iterator(); 52 | while(entryIterator.hasNext()) { 53 | Entry entry = entryIterator.next(); 54 | params.put(entry.getKey(), entry.getValue().getAsString()); 55 | System.setProperty(entry.getKey(), entry.getValue().getAsString()); 56 | } 57 | } 58 | 59 | 60 | public String getParamValue(String paramName) { 61 | if(!params.containsKey(paramName)) 62 | return null; 63 | return params.get(paramName); 64 | } 65 | 66 | public void setParam(String name, String value) { 67 | params.put(name, value); 68 | } 69 | 70 | 71 | } 72 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Qrels.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.io.File; 4 | import java.io.FileReader; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Set; 11 | import java.util.regex.Pattern; 12 | 13 | import org.apache.commons.io.IOUtils; 14 | 15 | public 
class Qrels { 16 | 17 | public static final Pattern SPACE_PATTERN = Pattern.compile(" ", Pattern.DOTALL); 18 | 19 | private static final int QUERY_COLUMN = 0; 20 | private static final int DOCNO_COLUMN = 2; 21 | private static final int REL_COLUMN = 3; 22 | 23 | private Map> rel; 24 | private int minRel = 1; 25 | 26 | public Qrels(String pathToQrelsFile) { 27 | try { 28 | 29 | rel = new HashMap>(); 30 | 31 | List lines = IOUtils.readLines(new FileReader(new File(pathToQrelsFile))); 32 | Iterator linesIt = lines.iterator(); 33 | while(linesIt.hasNext()) { 34 | String[] toks = SPACE_PATTERN.split(linesIt.next()); 35 | if(toks==null || toks.length != 4) { 36 | System.err.println("bad qrels line"); 37 | continue; 38 | } 39 | String query = toks[QUERY_COLUMN]; 40 | String docno = toks[DOCNO_COLUMN]; 41 | int r = Integer.parseInt(toks[REL_COLUMN]); 42 | if(r >= minRel) { 43 | Set relDocs = null; 44 | if(!rel.containsKey(query)) { 45 | relDocs = new HashSet(); 46 | } else { 47 | relDocs = rel.get(query); 48 | } 49 | relDocs.add(docno); 50 | rel.put(query, relDocs); 51 | } else { 52 | } 53 | } 54 | } catch (Exception e) { 55 | System.err.println("died trying to read qrel file: " + pathToQrelsFile); 56 | System.exit(-1); 57 | } 58 | } 59 | 60 | public boolean isRel(String query, String docno) { 61 | if(!rel.containsKey(query)) { 62 | System.err.println("no relevant documents found for query " + query); 63 | return false; 64 | } 65 | return rel.get(query).contains(docno); 66 | } 67 | 68 | public Set getRelDocs(String query) { 69 | if(!rel.containsKey(query)) { 70 | System.err.println("no relevant documents found for query " + query); 71 | return null; 72 | } 73 | return rel.get(query); 74 | } 75 | 76 | public double numRel(String query) { 77 | if(!rel.containsKey(query)) { 78 | System.err.println("no relevant documents found for query " + query); 79 | return 0.0; 80 | } 81 | return (double)rel.get(query).size(); 82 | } 83 | } 84 | 
-------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Scorable.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | public interface Scorable { 4 | 5 | public void setScore(double score); 6 | 7 | public double getScore(); 8 | } 9 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ScorableComparator.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.util.Comparator; 4 | 5 | 6 | public class ScorableComparator implements Comparator{ 7 | private boolean decreasing = true; 8 | 9 | public ScorableComparator(boolean decreasing) { 10 | this.decreasing = decreasing; 11 | } 12 | public int compare(Scorable x, Scorable y) { 13 | double xVal = x.getScore(); 14 | double yVal = y.getScore(); 15 | 16 | if(decreasing) { 17 | return (xVal > yVal ? -1 : (xVal == yVal ? 0 : 1)); 18 | } else { 19 | return (xVal < yVal ? -1 : (xVal == yVal ? 
0 : 1)); 20 | } 21 | 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Stopper.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.io.FileInputStream; 4 | import java.util.HashSet; 5 | import java.util.Iterator; 6 | import java.util.List; 7 | import java.util.Set; 8 | import java.util.regex.Pattern; 9 | 10 | import org.apache.commons.io.IOUtils; 11 | 12 | public class Stopper { 13 | public static final Pattern SPACE_PATTERN = Pattern.compile(" ", Pattern.DOTALL); 14 | private Set stopwords; 15 | 16 | 17 | public Stopper() { 18 | stopwords = new HashSet(); 19 | } 20 | 21 | public Stopper(String pathToStoplist) { 22 | try { 23 | stopwords = new HashSet(); 24 | 25 | // assume our stoplist has one stopword per line 26 | List lines = IOUtils.readLines(new FileInputStream(pathToStoplist)); 27 | Iterator it = lines.iterator(); 28 | while(it.hasNext()) { 29 | stopwords.add(it.next()); 30 | } 31 | } catch (Exception e) { 32 | e.printStackTrace(); 33 | } 34 | } 35 | 36 | public String apply(String text) { 37 | StringBuilder b = new StringBuilder(); 38 | String[] toks = SPACE_PATTERN.split(text); 39 | for(String tok : toks) { 40 | if(! isStopWord(tok)) 41 | b.append(tok + " "); 42 | } 43 | return b.toString().trim(); 44 | } 45 | public void addStopword(String term) { 46 | stopwords.add(term); 47 | } 48 | public boolean isStopWord(String term) { 49 | return (stopwords.contains(term)) ? 
true : false; 50 | } 51 | 52 | public Set asSet() { 53 | return stopwords; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/README.md: -------------------------------------------------------------------------------- 1 | microblogTTGBaseline 2 | ==================== 3 | 4 | A baseline run using an (empirically determined) Jaccard similarity score to cluster tweets. 5 | 6 | 1. Build with `mvn package` 7 | 2. Set your `host`, `group`, and `package` parameters in `config/run_params.json`. Change any other parameters you want. 8 | 3. Run with `java -cp target/microblogTTGBaseline-0.0.1-SNAPSHOT-jar-with-dependencies.jar edu.gslis.ttg.main.RunTTGBaseline` 9 | 10 | Note: Weighted scoring does not work properly, yet. 
11 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/config/run_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries" : "./topics/topics.microblog-2013.json", 3 | "host" : HOST_NAME_HERE, 4 | "training_port" : 9090, 5 | "testing_port" : 9091, 6 | "num_results" : 1000, 7 | "group" : YOUR_GROUP_HERE, 8 | "token" : YOUR_TOKEN_HERE, 9 | "runtag" : "baseline", 10 | "jaccard_step" : 0.1, 11 | "training_queries" : "./topics/topics.ttg-training.json", 12 | "training_clusters" : 13 | "../data/clusters.training.microblog2011-2012.json", 14 | "qrels" : 15 | "../data/qrels.microblog2011-2012.txt", 16 | "evaluation_type" : "unweighted" 17 | } 18 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | edu.gslis 4 | microblogTTGBaseline 5 | 0.0.1-SNAPSHOT 6 | 7 | microblog TTG baseline 8 | http://maven.apache.org 9 | 10 | 11 | UTF-8 12 | 13 | 14 | 15 | nema-dev.lis.illinois.edu 16 | nema-dev.lis.illinois.edu-releases 17 | http://nema-dev.lis.illinois.edu/artifactory//ir-libs 18 | 19 | 20 | nema-dev.lis.illinois.edu 21 | nema-dev.lis.illinois.edu-snapshots 22 | http://nema-dev.lis.illinois.edu/artifactory//ir-libs 23 | 24 | 25 | 26 | 27 | ir-libs 28 | ir-libs 29 | http://nema-dev.lis.illinois.edu/artifactory/ir-libs/ 30 | 31 | true 32 | never 33 | 34 | 35 | true 36 | never 37 | 38 | 39 | 40 | 41 | src 42 | 43 | 44 | maven-compiler-plugin 45 | 3.1 46 | 47 | 1.6 48 | 1.6 49 | 50 | 51 | 52 | maven-assembly-plugin 53 | 54 | 55 | jar-with-dependencies 56 | 57 | 58 | 59 | 60 | simple-command 61 | package 62 | 63 | attached 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | indri 73 | indri 74 | 0.1 75 | 76 | 77 | edu.gslis 78 | ir-utils 79 | 0.0.1-SNAPSHOT 80 | 81 | 82 | cc.twittertools 83 | twitter-tools-core 84 | 
1.4.1 85 | 86 | 87 | cc.twittertools 88 | twitter-tools 89 | 1.3.0 90 | 91 | 92 | com.googlecode.json-simple 93 | json-simple 94 | 1.1 95 | 96 | 97 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/Cluster.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.clusters; 2 | 3 | import java.util.Arrays; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import edu.gslis.eval.Qrels; 8 | import edu.gslis.queries.GQuery; 9 | 10 | public class Cluster { 11 | private Set members; 12 | 13 | public Cluster() { 14 | members = new HashSet(); 15 | } 16 | 17 | public Cluster(long member) { 18 | members = new HashSet(); 19 | members.add(member); 20 | } 21 | 22 | public void add(long member) { 23 | members.add(member); 24 | } 25 | 26 | public void add(Set newMembers) { 27 | members.addAll(newMembers); 28 | } 29 | 30 | public Set getMembers() { 31 | return members; 32 | } 33 | 34 | public long getFirstMember() { 35 | return members.iterator().next(); 36 | } 37 | 38 | public boolean hasMember(long member) { 39 | return members.contains(member); 40 | } 41 | 42 | public int getWeight(GQuery query, Qrels qrels) { 43 | // hack to change e.g. 
MB01 to 01 44 | String q = String.valueOf(Integer.parseInt(query.getTitle().substring(2, query.getTitle().length()))); 45 | 46 | int weight = 0; 47 | for (long member : members) { 48 | if (qrels.isRel(q, String.valueOf(member))) { 49 | int level = qrels.getRelLevel(q, String.valueOf(member)); 50 | weight += level; 51 | } 52 | } 53 | return weight; 54 | } 55 | 56 | @Override 57 | public String toString() { 58 | return Arrays.deepToString(members.toArray()); 59 | } 60 | 61 | public int size() { 62 | return members.size(); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/Clusters.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.clusters; 2 | 3 | import java.util.HashMap; 4 | import java.util.HashSet; 5 | import java.util.Iterator; 6 | import java.util.Map; 7 | import java.util.Set; 8 | 9 | public class Clusters implements Iterable { 10 | private Set clusters; 11 | private Map clusterMemberLookup; 12 | 13 | public Clusters() { 14 | clusters = new HashSet(); 15 | clusterMemberLookup = new HashMap(); 16 | } 17 | 18 | public void add(Cluster cluster) { 19 | clusters.add(cluster); 20 | for (long member : cluster.getMembers()) { 21 | clusterMemberLookup.put(member, cluster); 22 | } 23 | } 24 | 25 | public Set getClusters() { 26 | return clusters; 27 | } 28 | 29 | public boolean hasCluster(Cluster cluster) { 30 | return clusters.contains(cluster); 31 | } 32 | 33 | public Cluster findCluster(long member) { 34 | try { 35 | return clusterMemberLookup.get(member); 36 | } catch (NullPointerException e) { 37 | return null; 38 | } 39 | } 40 | 41 | public Set getAllClusteredResults() { 42 | return clusterMemberLookup.keySet(); 43 | } 44 | 45 | // Merge cluster 2 into cluster 1 and update the clusterMemberLookup 46 | // Note: only call this function if cluster 1 is already in the clusters set 47 | // (cluster 2 
can be new or existing) 48 | public void mergeExistingClusters(Cluster c1, Cluster c2) { 49 | c1.add(c2.getMembers()); 50 | clusters.remove(c1); 51 | try { 52 | clusters.remove(c2); 53 | } catch (Exception e) { 54 | System.err.println("Unable to remove cluster 2 from clusters. Might be a new cluster."); 55 | } 56 | clusters.add(c1); 57 | 58 | updateClusterMembership(c1); 59 | } 60 | 61 | // Merge two new clusters into the clusters set 62 | public void mergeNewClusters(Cluster c1, Cluster c2) { 63 | c1.add(c2.getMembers()); 64 | clusters.add(c1); 65 | 66 | updateClusterMembership(c1); 67 | } 68 | 69 | public void mergeMembers(long m1, long m2) { 70 | Cluster c1 = findCluster(m1); 71 | Cluster c2 = findCluster(m2); 72 | if (c1 == null && c2 == null) { 73 | c1 = new Cluster(m1); 74 | c2 = new Cluster(m2); 75 | mergeNewClusters(c1, c2); 76 | } else if (c1 == null) { // c2 exists 77 | c1 = new Cluster(m1); 78 | mergeExistingClusters(c2, c1); 79 | } else { // c1 exists 80 | if (c2 == null) { 81 | c2 = new Cluster(m2); 82 | } 83 | mergeExistingClusters(c1, c2); 84 | } 85 | } 86 | 87 | public int size() { 88 | return clusters.size(); 89 | } 90 | 91 | @Override 92 | public Iterator iterator() { 93 | return clusters.iterator(); 94 | } 95 | 96 | @Override 97 | public String toString() { 98 | String output = ""; 99 | output += "["; 100 | Iterator it = clusters.iterator(); 101 | while (it.hasNext()) { 102 | Cluster cluster = it.next(); 103 | output += cluster.toString(); 104 | if (it.hasNext()) { 105 | output += ", "; 106 | } 107 | } 108 | output += "]"; 109 | return output; 110 | } 111 | 112 | private void updateClusterMembership(Cluster cluster) { 113 | for (long member : cluster.getMembers()) { 114 | clusterMemberLookup.put(member, cluster); 115 | } 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/clusterers/SimpleJaccardClusterer.java: 
package edu.gslis.ttg.clusters.clusterers;

import java.util.Iterator;
import java.util.List;
import java.util.NavigableMap;

import cc.twittertools.thrift.gen.TResult;
import edu.gslis.ttg.clusters.Clusters;
import edu.gslis.ttg.jaccard.JaccardStore;

/**
 * Clusters a list of search results by the pairwise Jaccard similarity of
 * their tweet text. All pairwise scores are precomputed once in the
 * constructor; {@link #cluster(double)} can then be called repeatedly with
 * different thresholds (e.g. while sweeping thresholds on training data).
 *
 * NOTE(review): generic type parameters in this file were stripped by an
 * HTML/text extraction; they have been reconstructed here from JaccardStore's
 * API (score -> List of long[2] doc-id pairs) — confirm against the original
 * sources.
 */
public class SimpleJaccardClusterer {

	private List<TResult> results;
	private JaccardStore jaccardScores;

	/**
	 * @param results search hits to cluster; pairwise similarities are
	 *                computed eagerly, so construction costs O(n^2) text
	 *                comparisons over the result list
	 */
	public SimpleJaccardClusterer(List<TResult> results) {
		this.results = results;
		this.jaccardScores = computeJaccardSimilarity();
	}

	/**
	 * Single-link clustering: every pair of documents whose Jaccard score is
	 * at or above the threshold is merged into the same cluster
	 * (transitively, via Clusters.mergeMembers).
	 *
	 * @param threshold minimum Jaccard similarity (inclusive) for two
	 *                  documents to be linked
	 * @return the resulting cluster set
	 */
	public Clusters cluster(double threshold) {
		Clusters clusters = new Clusters();

		// JaccardStore keys its lookup by score; tailMap (inclusive) yields
		// every doc-id pair whose similarity meets the threshold.
		NavigableMap<Double, List<long[]>> thresholdPairs = jaccardScores.getDocsGreaterThanScore(threshold);
		Iterator<Double> pairsIt = thresholdPairs.keySet().iterator();
		while (pairsIt.hasNext()) { // for each qualifying similarity score
			List<long[]> docPairs = thresholdPairs.get(pairsIt.next());
			Iterator<long[]> docPairIt = docPairs.iterator();
			while (docPairIt.hasNext()) { // for each pair of documents at this score
				long[] docs = docPairIt.next();
				clusters.mergeMembers(docs[0], docs[1]);
			}
		}

		return clusters;
	}

	public List<TResult> getResults() {
		return results;
	}

	/**
	 * Replaces the result list.
	 *
	 * NOTE(review): this does NOT recompute jaccardScores — cluster() will
	 * still use the scores computed from the constructor argument. Confirm
	 * whether callers rely on that behavior.
	 */
	public void setResults(List<TResult> results) {
		this.results = results;
	}

	/**
	 * Computes the Jaccard similarity of every unordered pair of results
	 * (j &lt; k) over their tweet text and records it in a JaccardStore.
	 */
	private JaccardStore computeJaccardSimilarity() {
		JaccardStore scores = new JaccardStore();
		for (int j = 0; j < results.size(); j++) {
			TResult doc1 = results.get(j);
			for (int k = j + 1; k < results.size(); k++) {
				TResult doc2 = results.get(k);

				double jaccardSim = JaccardStore.computeJaccardSimilarity(doc1.getText(), doc2.getText());
				scores.setScore(doc1.getId(), doc2.getId(), jaccardSim);
			}
		}

		return scores;
	}

}
-------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/jaccard/JaccardStore.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.jaccard; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.NavigableMap; 10 | import java.util.Set; 11 | import java.util.TreeMap; 12 | 13 | public class JaccardStore { 14 | 15 | private Map scores; // 16 | private TreeMap> scoreLookup; // 17 | 18 | public JaccardStore() { 19 | scores = new HashMap(); 20 | scoreLookup = new TreeMap>(); 21 | } 22 | 23 | public double getScore(long doc1, long doc2) { 24 | return scores.get(ordered(doc1, doc2)); 25 | } 26 | 27 | public void setScore(long doc1, long doc2, double score) { 28 | scores.put(ordered(doc1, doc2), score); 29 | if (scoreLookup.get(score) == null) { 30 | scoreLookup.put(score, new ArrayList()); 31 | } 32 | scoreLookup.get(score).add(ordered(doc1, doc2)); 33 | } 34 | 35 | public List getDocsForScore(double score) { 36 | return scoreLookup.get(score); 37 | } 38 | 39 | public NavigableMap> getDocsGreaterThanScore(double score) { 40 | return scoreLookup.tailMap(score, true); 41 | } 42 | 43 | public int size() { 44 | return scores.keySet().size(); 45 | } 46 | 47 | private long[] ordered(long doc1, long doc2) { 48 | long[] ordered = new long[2]; 49 | if (doc1 < doc2) { 50 | ordered[0] = doc1; 51 | ordered[1] = doc2; 52 | } else { 53 | ordered[0] = doc2; 54 | ordered[1] = doc1; 55 | } 56 | return ordered; 57 | } 58 | 59 | public static double computeJaccardSimilarity(Set doc1, Set doc2) { 60 | Set intersection = new HashSet(doc1); 61 | Set union = new HashSet(doc1); 62 | 63 | intersection.retainAll(doc2); 64 | union.addAll(doc2); 65 | 66 | return intersection.size() / (double) union.size(); 67 | } 68 | 69 | public 
static double computeJaccardSimilarity(String doc1, String doc2) { 70 | String[] docOneTerms = doc1.toLowerCase().split("[^A-Za-z0-9]"); 71 | List termList = new ArrayList(Arrays.asList(docOneTerms)); 72 | termList.removeAll(Arrays.asList("", null)); 73 | Set docOneBag = new HashSet(termList); 74 | 75 | String[] docTwoTerms = doc2.toLowerCase().split("[^A-Za-z0-9]"); 76 | termList = new ArrayList(Arrays.asList(docTwoTerms)); 77 | termList.removeAll(Arrays.asList("", null)); 78 | Set docTwoBag = new HashSet(termList); 79 | 80 | return computeJaccardSimilarity(docOneBag, docTwoBag); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/searchers/SimpleSearcher.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.searchers; 2 | 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import cc.twittertools.search.api.TrecSearchThriftClient; 9 | import cc.twittertools.thrift.gen.TResult; 10 | import edu.gslis.queries.GQuery; 11 | import edu.gslis.textrepresentation.FeatureVector; 12 | 13 | public class SimpleSearcher { 14 | 15 | private TrecSearchThriftClient client; 16 | private int maxResults; 17 | 18 | public SimpleSearcher(TrecSearchThriftClient client, int maxResults) { 19 | this.client = client; 20 | this.maxResults = maxResults; 21 | } 22 | 23 | 24 | public Map search(GQuery query) { 25 | // clean up query 26 | String queryText = query.getText(); 27 | queryText = queryText.replaceAll("[,'\\.\\?]", " "); 28 | queryText = queryText.replaceAll(" ", " ").trim(); 29 | 30 | // need to lowercase the query vector 31 | FeatureVector temp = new FeatureVector(null); 32 | Iterator qTerms = query.getFeatureVector().iterator(); 33 | while(qTerms.hasNext()) { 34 | String term = qTerms.next(); 35 | temp.addTerm(term.toLowerCase(), 
query.getFeatureVector().getFeatureWeight(term)); 36 | } 37 | temp.normalize();; 38 | query.setFeatureVector(temp); 39 | 40 | System.err.println(query.getTitle()+": "+queryText); 41 | 42 | // perform search 43 | List results = null; 44 | try { 45 | results = client.search(queryText, Long.parseLong(query.getMetadata("querytweettime")), maxResults); 46 | } catch (Exception e) { 47 | System.err.println("Error searching."); 48 | System.exit(-1); 49 | } 50 | 51 | // set cutoff score heuristically 52 | double topScore = results.get(0).getRsv(); 53 | double cutOffScore = topScore / 2; 54 | 55 | // record hits, removing duplicates 56 | int i = 1; 57 | Map seenMap = new HashMap(); 58 | Iterator hitIterator = results.iterator(); 59 | while(hitIterator.hasNext()) { 60 | TResult hit = hitIterator.next(); 61 | if (hit.getRsv() < cutOffScore) { 62 | break; 63 | } 64 | 65 | long docId = hit.id; 66 | if (seenMap.containsKey(docId)) 67 | continue; 68 | seenMap.put(docId, hit); 69 | 70 | if(i++ >= maxResults) 71 | break; 72 | } 73 | 74 | return seenMap; 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/topics/topics.ttg-training.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": [ 3 | { 4 | "title": "MB03", 5 | "text": "Haiti Aristide return", 6 | "epoch": "1.297200733E9", 7 | "querytweettime": "35088534306033665", 8 | "model": [ 9 | { 10 | "weight": 1.0, 11 | "feature": "Haiti" 12 | }, 13 | { 14 | "weight": 1.0, 15 | "feature": "Aristide" 16 | }, 17 | { 18 | "weight": 1.0, 19 | "feature": "return" 20 | } 21 | ] 22 | }, 23 | { 24 | "title": "MB21", 25 | "text": "Emanuel residency court rulings", 26 | "epoch": "1.29627021E9", 27 | "querytweettime": "31185639047172097", 28 | "model": [ 29 | { 30 | "weight": 1.0, 31 | "feature": "Emanuel" 32 | }, 33 | { 34 | "weight": 1.0, 35 | "feature": "residency" 36 | }, 37 | { 38 | "weight": 1.0, 39 | 
"feature": "court" 40 | }, 41 | { 42 | "weight": 1.0, 43 | "feature": "rulings" 44 | } 45 | ] 46 | }, 47 | { 48 | "title": "MB22", 49 | "text": "healthcare law unconstitutional", 50 | "epoch": "1.296598654E9", 51 | "querytweettime": "32563233118224385", 52 | "model": [ 53 | { 54 | "weight": 1.0, 55 | "feature": "healthcare" 56 | }, 57 | { 58 | "weight": 1.0, 59 | "feature": "law" 60 | }, 61 | { 62 | "weight": 1.0, 63 | "feature": "unconstitutional" 64 | } 65 | ] 66 | }, 67 | { 68 | "title": "MB26", 69 | "text": "US unemployment", 70 | "epoch": "1.296828651E9", 71 | "querytweettime": "33527910379814912", 72 | "model": [ 73 | { 74 | "weight": 1.0, 75 | "feature": "US" 76 | }, 77 | { 78 | "weight": 1.0, 79 | "feature": "unemployment" 80 | } 81 | ] 82 | }, 83 | { 84 | "title": "MB42", 85 | "text": "Holland Iran envoy recall", 86 | "epoch": "1.297111633E9", 87 | "querytweettime": "34714824982134784", 88 | "model": [ 89 | { 90 | "weight": 1.0, 91 | "feature": "Holland" 92 | }, 93 | { 94 | "weight": 1.0, 95 | "feature": "Iran" 96 | }, 97 | { 98 | "weight": 1.0, 99 | "feature": "envoy" 100 | }, 101 | { 102 | "weight": 1.0, 103 | "feature": "recall" 104 | } 105 | ] 106 | }, 107 | { 108 | "title": "MB51", 109 | "text": "British Government cuts", 110 | "epoch": "1.297209406E9", 111 | "querytweettime": "35124912364457984", 112 | "model": [ 113 | { 114 | "weight": 1.0, 115 | "feature": "British" 116 | }, 117 | { 118 | "weight": 1.0, 119 | "feature": "Government" 120 | }, 121 | { 122 | "weight": 1.0, 123 | "feature": "cuts" 124 | } 125 | ] 126 | }, 127 | { 128 | "title": "MB57", 129 | "text": "Chicago blizzard", 130 | "epoch": "1.296683586E9", 131 | "querytweettime": "32919462151720960", 132 | "model": [ 133 | { 134 | "weight": 1.0, 135 | "feature": "Chicago" 136 | }, 137 | { 138 | "weight": 1.0, 139 | "feature": "blizzard" 140 | } 141 | ] 142 | }, 143 | { 144 | "title": "MB66", 145 | "text": "Journalists treatment in Egypt", 146 | "epoch": "1.296865923E9", 147 | 
"querytweettime": "33684239400566784", 148 | "model": [ 149 | { 150 | "weight": 1.0, 151 | "feature": "Journalists" 152 | }, 153 | { 154 | "weight": 1.0, 155 | "feature": "treatment" 156 | }, 157 | { 158 | "weight": 1.0, 159 | "feature": "in" 160 | }, 161 | { 162 | "weight": 1.0, 163 | "feature": "Egypt" 164 | } 165 | ] 166 | }, 167 | { 168 | "title": "MB68", 169 | "text": "Charlie Sheen rehab", 170 | "epoch": "1.296591293E9", 171 | "querytweettime": "32532358276063232", 172 | "model": [ 173 | { 174 | "weight": 1.0, 175 | "feature": "Charlie" 176 | }, 177 | { 178 | "weight": 1.0, 179 | "feature": "Sheen" 180 | }, 181 | { 182 | "weight": 1.0, 183 | "feature": "rehab" 184 | } 185 | ] 186 | }, 187 | { 188 | "title": "MB88", 189 | "text": "Kings Speech awards", 190 | "epoch": "1.297126104E9", 191 | "querytweettime": "34775520600129536", 192 | "model": [ 193 | { 194 | "weight": 1.0, 195 | "feature": "Kings" 196 | }, 197 | { 198 | "weight": 1.0, 199 | "feature": "Speech" 200 | }, 201 | { 202 | "weight": 1.0, 203 | "feature": "awards" 204 | } 205 | ] 206 | } 207 | ] 208 | } --------------------------------------------------------------------------------