├── .gitignore ├── API-agreement.pdf ├── HISTORY.md ├── README.md ├── data ├── clusters.training.microblog2011-2012.json ├── qrels.microblog2011-2012.txt ├── qrels.microblog2011.txt.gz ├── qrels.microblog2012.txt.gz ├── qrels.microblog2013.txt.gz ├── qrels.microblog2014.txt.gz ├── queries.trec2005efficiency.txt.gz ├── run.lm.xml ├── topics.microblog2011.txt ├── topics.microblog2012.txt ├── topics.microblog2013.txt └── topics.microblog2014.txt ├── etc ├── trec_eval.9.0.tar.gz └── ttg_eval.py ├── twitter-tools-core ├── .settings │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.jdt.ui.prefs ├── pom.xml └── src │ ├── attic │ └── java │ │ └── cc │ │ └── twittertools │ │ ├── corpus │ │ └── data │ │ │ ├── TSVStatusBlockReader.java │ │ │ └── TSVStatusCorpusReader.java │ │ └── download │ │ ├── AsyncJsonStatusBlockCrawler.java │ │ └── VerifyJsonStatusBlockCrawl.java │ ├── main │ ├── java │ │ ├── cc │ │ │ └── twittertools │ │ │ │ ├── corpus │ │ │ │ ├── data │ │ │ │ │ ├── HTMLStatusExtractor.java │ │ │ │ │ ├── JsonStatusBlockReader.java │ │ │ │ │ ├── JsonStatusCorpusReader.java │ │ │ │ │ ├── Status.java │ │ │ │ │ └── StatusStream.java │ │ │ │ └── demo │ │ │ │ │ └── ReadStatuses.java │ │ │ │ ├── download │ │ │ │ ├── AsyncEmbeddedJsonStatusBlockCrawler.java │ │ │ │ └── AsyncHTMLStatusBlockCrawler.java │ │ │ │ ├── index │ │ │ │ ├── ExtractTermStatisticsFromIndex.java │ │ │ │ ├── ExtractTweetidsFromCollection.java │ │ │ │ ├── ExtractTweetidsFromIndex.java │ │ │ │ ├── IndexStatuses.java │ │ │ │ ├── LowerCaseEntityPreservingFilter.java │ │ │ │ └── TweetAnalyzer.java │ │ │ │ ├── search │ │ │ │ ├── TrecTopic.java │ │ │ │ ├── TrecTopicSet.java │ │ │ │ ├── api │ │ │ │ │ ├── RunQueriesBaselineThrift.java │ │ │ │ │ ├── RunQueriesThrift.java │ │ │ │ │ ├── SearchStatusesThrift.java │ │ │ │ │ ├── TResultComparable.java │ │ │ │ │ ├── TrecSearchHandler.java │ │ │ │ │ ├── TrecSearchThriftClient.java │ │ │ │ │ ├── TrecSearchThriftLoadGenerator.java │ │ │ │ │ └── TrecSearchThriftServer.java │ │ 
│ │ └── local │ │ │ │ │ ├── RunQueries.java │ │ │ │ │ └── SearchStatuses.java │ │ │ │ ├── stream │ │ │ │ └── GatherStatusStream.java │ │ │ │ ├── thrift │ │ │ │ └── gen │ │ │ │ │ ├── TQuery.java │ │ │ │ │ ├── TResult.java │ │ │ │ │ ├── TrecSearch.java │ │ │ │ │ └── TrecSearchException.java │ │ │ │ └── util │ │ │ │ ├── ExtractSubcollection.java │ │ │ │ └── VerifySubcollection.java │ │ └── log4j.properties │ ├── perl │ │ ├── extract_deletes.pl │ │ └── join_deletes_with_collection.pl │ ├── python │ │ ├── Search │ │ │ ├── TrecSearch-remote │ │ │ ├── TrecSearch.py │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ └── ttypes.py │ │ ├── TrecSearchThriftClientCli.py │ │ └── twittertools │ │ │ └── stream │ │ │ └── gather_status_stream.py │ ├── resources │ │ └── log4j.properties │ └── thrift │ │ ├── gen-py │ │ ├── __init__.py │ │ └── twittertools │ │ │ ├── TrecSearch-remote │ │ │ ├── TrecSearch.py │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ └── ttypes.py │ │ └── twittertools.thrift │ └── test │ └── java │ └── cc │ └── twittertools │ ├── download │ └── FetchStatusTest.java │ ├── index │ └── TokenizationTest.java │ └── search │ └── TrecTopicSetTest.java ├── twitter-tools-hadoop ├── .settings │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.jdt.ui.prefs ├── README.md ├── pom.xml ├── src │ └── main │ │ └── java │ │ └── cc │ │ └── twittertools │ │ ├── hadoop │ │ └── Example.java │ │ ├── hbase │ │ ├── LoadWordCount.java │ │ └── WordCountDAO.java │ │ ├── piggybank │ │ ├── ConvertCreatedAtToEpoch.java │ │ ├── GetLatitude.java │ │ ├── GetLongitude.java │ │ └── IsMap.java │ │ └── udf │ │ ├── GetDate.java │ │ ├── GetInterval.java │ │ └── LuceneTokenizer.java └── wordcountbytime.pig ├── twitter-tools-rm3 ├── README.md ├── build.sh ├── config │ └── run_params_sample.json ├── data │ ├── qrels.microblog │ ├── stoplist.twitter │ ├── topics.microblog2011.json │ ├── topics.microblog2012.json │ └── topics.microblog2013.json ├── pom.xml └── src │ └── main │ ├── java │ └── edu │ │ └── 
illinois │ │ └── lis │ │ ├── document │ │ └── FeatureVector.java │ │ ├── feedback │ │ ├── FeedbackModel.java │ │ └── FeedbackRelevanceModel.java │ │ ├── query │ │ ├── GQueries.java │ │ ├── GQueriesJsonImpl.java │ │ ├── GQuery.java │ │ ├── TrecTemporalTopic.java │ │ └── TrecTemporalTopicSet.java │ │ ├── rerank │ │ ├── SearchReranker.java │ │ └── TResultComparator.java │ │ ├── search │ │ └── RunQueries.java │ │ ├── searchsource │ │ └── IndexWrapperMicroblogApi.java │ │ └── utils │ │ ├── ExtractGqueriesFromTrecFormat.java │ │ ├── KeyValuePair.java │ │ ├── ListUtils.java │ │ ├── LuceneQuery.java │ │ ├── ParameterBroker.java │ │ ├── Qrels.java │ │ ├── Scorable.java │ │ ├── ScorableComparator.java │ │ └── Stopper.java │ └── resources │ └── log4j.properties └── twitter-tools-ttgbaseline ├── README.md ├── config └── run_params.json ├── pom.xml ├── src └── edu │ └── gslis │ └── ttg │ ├── clusters │ ├── Cluster.java │ ├── Clusters.java │ └── clusterers │ │ └── SimpleJaccardClusterer.java │ ├── jaccard │ └── JaccardStore.java │ ├── main │ └── RunTTGBaseline.java │ └── searchers │ └── SimpleSearcher.java └── topics ├── topics.microblog-2011.json ├── topics.microblog-2012.json ├── topics.microblog-2013.json └── topics.ttg-training.json /.gitignore: -------------------------------------------------------------------------------- 1 | twitter-tools-core/.classpath 2 | twitter-tools-core/.project 3 | twitter-tools-core/target/ 4 | twitter-tools-rm3/.classpath 5 | twitter-tools-rm3/.project 6 | twitter-tools-rm3/target/ 7 | twitter-tools-ttgbaseline/.classpath 8 | twitter-tools-ttgbaseline/.project 9 | twitter-tools-ttgbaseline/.settings/ 10 | twitter-tools-ttgbaseline/target/ 11 | twitter-tools-ttgbaseline/output.txt 12 | twitter-tools-hadoop/.classpath 13 | twitter-tools-hadoop/.project 14 | twitter-tools-hadoop/target/ 15 | etc/run.sh 16 | etc/trec_eval.9.0/ 17 | etc/trec_eval 18 | data/qrels.microblog2011.txt 19 | data/qrels.microblog2012.txt 20 | data/qrels.microblog2013.txt 21 
| data/qrels.microblog2014.txt 22 | data/queries.trec2005efficiency.txt 23 | *~ 24 | .DS_Store 25 | *.pyc 26 | -------------------------------------------------------------------------------- /API-agreement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/API-agreement.pdf -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Version 1.4.3 2 | ============= 3 | December 26, 2014 4 | 5 | + API serving the Tweets2013 collection for TREC 2014, includes minor code fixes during TREC evaluations that have been merged back to master 6 | 7 | Version 1.4.2 8 | ============= 9 | March 15, 2014 10 | 11 | + Added code to generate Thrift baseline runs 12 | + Added code to extract subcollection and term statistics 13 | + Added topics and qrels for TREC 2013 14 | 15 | Version 1.4.1 16 | ============= 17 | July 7, 2013 18 | 19 | + Cleaned up dependencies and eliminated direct dependency on Solr 20 | + Fixed unnecessary string -> int/long parsing in retrieval 21 | 22 | Version 1.4.0 23 | ============= 24 | July 3, 2013 25 | 26 | + Switched over from Ant to Maven for build management, with artifactId `twitter-tools-core` 27 | 28 | Version 1.3.0 29 | ============= 30 | June 12, 2013 31 | 32 | + Package refactoring/renaming and code cleanup 33 | + Upgraded to Lucene 4.3 34 | + Added initial Python client 35 | + Installed Tweet-specific Lucene analyzer 36 | + Added simple Perl scripts for processing deletes 37 | 38 | Version 1.2.0 39 | ============= 40 | June 6, 2013 41 | 42 | + Initial release of the API for TREC 2013 43 | 44 | Version 1.1.1 45 | ============= 46 | January 28, 2013 47 | 48 | + Noted that `AsyncEmbeddedJsonStatusBlockCrawler` is currently broken 49 | 50 | Version 1.1.0 51 | ============= 52 | January 23, 2013 53 | 
54 | + Added crawler for Twitter public stream 55 | 56 | Version 1.0.0 57 | ============= 58 | January 15, 2013 59 | 60 | + Cleaned up code 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Twitter Tools 2 | ============= 3 | 4 | This repo holds a collection of tools for the TREC Microblog tracks, which officially ended in 2015. The track mailing list can be found at [trec-microblog@googlegroups.com](http://groups.google.com/group/trec-microblog). 5 | 6 | Archival Documents 7 | ------------------ 8 | 9 | + [TREC 2013 API Specifications](https://github.com/lintool/twitter-tools/wiki/TREC-2013-API-Specifications) 10 | + [TREC 2013 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2013-Track-Guidelines) 11 | + [TREC 2014 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2014-Track-Guidelines) 12 | + [TREC 2015 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2015-Track-Guidelines) 13 | 14 | API Access 15 | ---------- 16 | 17 | The Microblog tracks in 2013 and 2014 used the "evaluation as a service" (EaaS) model, where teams interact with the official corpus via a common API. Although the evaluation has ended, the API is still available for researcher use. 18 | 19 | To request access to the API, follow these steps: 20 | 21 | 1. Fill out the [API usage agreement](http://lintool.github.io/twitter-tools/API-agreement.pdf). 2. Email the usage agreement to `microblog-request@nist.gov`. 3. After NIST receives your request, you will receive an access token from NIST. 4. The code for accessing the API can be found in this repository. The endpoint of API itself (i.e., hostname, port) will be provided by NIST. 25 | 26 | Getting Started 27 | -------------- 28 | 29 | The main Maven artifact for the TREC Microblog API is `twitter-tools-core`. 
The latest releases of Maven artifacts are available at [Maven Central](http://search.maven.org/#search%7Cga%7C1%7Ccc.twittertools). 30 | 31 | You can clone the repo with the following command: 32 | 33 | ``` 34 | $ git clone git://github.com/lintool/twitter-tools.git 35 | ``` 36 | 37 | Once you've cloned the repository, change directory into `twitter-tools-core` and build the package with Maven: 38 | 39 | ``` 40 | $ cd twitter-tools-core 41 | $ mvn clean package appassembler:assemble 42 | ``` 43 | 44 | For more information, see the [project wiki](https://github.com/lintool/twitter-tools/wiki). 45 | 46 | Replicating TREC Baselines 47 | -------------------------- 48 | 49 | One advantage of the TREC Microblog API is that it is possible to deploy a community baseline whose results are replicable by *anyone*. The `raw` results are simply the output of the API unmodified. The `baseline` results are the `raw` results that have been post-processed to remove retweets and break score ties by reverse chronological order (earliest first). 
50 | 51 | To run the `raw` results for TREC 2011, issue the following command: 52 | 53 | ``` 54 | sh target/appassembler/bin/RunQueriesThrift \ 55 | -host [host] -port [port] -group [group] -token [token] \ 56 | -queries ../data/topics.microblog2011.txt > run.microblog2011.raw.txt 57 | ``` 58 | 59 | And to run the `baseline` results for TREC 2011, issue the following command: 60 | 61 | ``` 62 | sh target/appassembler/bin/RunQueriesBaselineThrift \ 63 | -host [host] -port [port] -group [group] -token [token] \ 64 | -queries ../data/topics.microblog2011.txt > run.microblog2011.baseline.txt 65 | ``` 66 | 67 | Note that `trec_eval` is included in `twitter-tools/etc` (just needs to be compiled), and the qrels are stored in `twitter-tools/data` (just needs to be uncompressed), so you can evaluate as follows: 68 | 69 | ``` 70 | ../etc/trec_eval.9.0/trec_eval ../data/qrels.microblog2011.txt run.microblog2011.raw.txt 71 | ``` 72 | 73 | Similar commands will allow you to replicate runs for TREC 2012 and TREC 2013. With `trec_eval`, you should get *exactly* the following results: 74 | 75 | MAP | raw | baseline 76 | ----------|--------|--------- 77 | TREC 2011 | 0.3050 | 0.3576 78 | TREC 2012 | 0.1751 | 0.2091 79 | TREC 2013 | 0.2044 | 0.2532 80 | TREC 2014 | 0.3090 | 0.3924 81 | 82 | P30 | raw | baseline 83 | ----------|--------|--------- 84 | TREC 2011 | 0.3483 | 0.4000 85 | TREC 2012 | 0.2831 | 0.3311 86 | TREC 2013 | 0.3761 | 0.4450 87 | TREC 2014 | 0.5145 | 0.6182 88 | 89 | 90 | License 91 | ------- 92 | 93 | Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 94 | 95 | 96 | Acknowledgments 97 | --------------- 98 | 99 | This work is supported in part by the National Science Foundation under award [IIS-1218043](http://www.nsf.gov/awardsearch/showAward?AWD_ID=1218043). 
Any opinions, findings, and conclusions or recommendations expressed are those of the researchers and do not necessarily reflect the views of the National Science Foundation. 100 | -------------------------------------------------------------------------------- /data/qrels.microblog2011.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2011.txt.gz -------------------------------------------------------------------------------- /data/qrels.microblog2012.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2012.txt.gz -------------------------------------------------------------------------------- /data/qrels.microblog2013.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2013.txt.gz -------------------------------------------------------------------------------- /data/qrels.microblog2014.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2014.txt.gz -------------------------------------------------------------------------------- /data/queries.trec2005efficiency.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/queries.trec2005efficiency.txt.gz -------------------------------------------------------------------------------- /data/run.lm.xml: -------------------------------------------------------------------------------- 
1 | 2 | tweets2011-index 3 | true 4 | 1000 5 | lm 6 | 7 | -------------------------------------------------------------------------------- /etc/trec_eval.9.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/etc/trec_eval.9.0.tar.gz -------------------------------------------------------------------------------- /etc/ttg_eval.py: -------------------------------------------------------------------------------- 1 | #This file is to take run file (as an input argument) and ground truth non-redundant tweets 2 | #to compute the unweighted precision, recall and weighted precision per topic. 3 | import json 4 | from sets import Set 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='Tweet Timeline Generation (TTG) evaluation script (version 1.0)') 8 | parser.add_argument('-q', required=True, metavar='qrels', help='qrels file') 9 | parser.add_argument('-c', required=True, metavar='clusters', help='cluster anotations') 10 | parser.add_argument('-r', required=True, metavar='run', help='run file') 11 | 12 | args = parser.parse_args() 13 | file_qrels_path = vars(args)['q'] 14 | clusters_path = vars(args)['c'] 15 | run_path = vars(args)['r'] 16 | 17 | #Take qrels to generate dictionary of {topic number:{tweetid:weight}} 18 | #where weight is 0(non-relevant), 1(relevant), 2(highly relevant) 19 | qrels_dt = {} 20 | file_qrels = open(file_qrels_path, "r") 21 | lines = file_qrels.readlines() 22 | for line in lines: 23 | line = line.strip().split() 24 | topic_ind = line[0] 25 | if topic_ind not in qrels_dt: 26 | qrels_dt[topic_ind] = {} 27 | qrels_dt[topic_ind][line[2]] = line[3] 28 | 29 | #Take run file and generate dictionary of {topic number:Set of tweetids for that topic} 30 | runlength = len(run_path) - run_path.index("/") - 1 31 | clusters_run_dt = {} 32 | file_run = open(run_path, "r") 33 | lines = file_run.readlines() 
34 | for line in lines: 35 | line = line.strip().split() 36 | topic_ind = line[0][line[0].index("MB") + 2:] 37 | if topic_ind not in clusters_run_dt: 38 | clusters_run_dt[topic_ind] = Set() 39 | clusters_run_dt[topic_ind].add(line[2]) 40 | 41 | #Take ground truth, generate dictionary of {topic number:2D array of clusters of tweetids}, for each topic, 42 | #compare tweet from each cluster with that from run file and compute unweighted precision, recall and weighted recall. 43 | clusters_dt = {} 44 | precision_total = 0 45 | unweighted_recall_total = 0 46 | weighted_recall_total = 0 47 | file_clusters = open(clusters_path, "r") 48 | data = json.load(file_clusters) 49 | topics = data["topics"] 50 | print "runtag".ljust(runlength) + "\ttopic\tunweighted_recall weighted_recall precision" 51 | for topic in sorted(topics.keys()): 52 | total_weight = 0 53 | credits = 0 54 | hit_num = 0 55 | topic_ind = topic[line[0].index("MB") + 2:] 56 | topic_ind = topic_ind.encode("utf-8") 57 | clusters_json = topics[topic]["clusters"] 58 | for i in range(len(clusters_json)): 59 | clusters_json[i] = [s.encode("utf-8") for s in clusters_json[i]] 60 | clusters_dt[topic_ind] = clusters_json 61 | for cluster in clusters_dt[topic_ind]: 62 | weight = 0 63 | hit_flag = 0 64 | for tweet in cluster: 65 | weight = weight + int(qrels_dt[topic_ind][tweet]) 66 | if tweet in clusters_run_dt[topic_ind]: 67 | hit_flag = 1 68 | total_weight = total_weight + weight 69 | if hit_flag == 1: 70 | credits = credits + weight 71 | hit_num = hit_num + 1 72 | hit_flag = 0 73 | precision = float(hit_num) / len(clusters_run_dt[topic_ind]) 74 | unweighted_recall = float(hit_num) / len(clusters_dt[topic_ind]) 75 | weighted_recall = float(credits) / total_weight 76 | precision_total = precision_total + precision 77 | unweighted_recall_total = unweighted_recall_total + unweighted_recall 78 | weighted_recall_total = weighted_recall_total + weighted_recall 79 | print run_path[run_path.rindex("/") + 
1:].ljust(max(runlength, 6)) + "\t" + "MB" + str(topic_ind) + "\t" + "%12.4f" % unweighted_recall + "\t" + "%12.4f" % weighted_recall + "\t" + "%10.4f" % precision 80 | precision_mean = precision_total / len(clusters_dt) 81 | unweighted_recall_mean = unweighted_recall_total / len(clusters_dt) 82 | weighted_recall_mean = weighted_recall_total / len(clusters_dt) 83 | print run_path[run_path.rindex("/") + 1:].ljust(max(runlength, 6)) + "\t" + "all".ljust(5) + "\t" + "%12.4f" % unweighted_recall_mean + "\t" + "%12.4f" % weighted_recall_mean + "\t" + "%10.4f" % precision_mean 84 | file_run.close() 85 | file_clusters.close() 86 | -------------------------------------------------------------------------------- /twitter-tools-core/.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_twitter-tools 3 | formatter_settings_version=12 4 | org.eclipse.jdt.ui.exception.name=e 5 | org.eclipse.jdt.ui.gettersetter.use.is=true 6 | org.eclipse.jdt.ui.keywordthis=false 7 | org.eclipse.jdt.ui.overrideannotation=true 8 | -------------------------------------------------------------------------------- /twitter-tools-core/src/attic/java/cc/twittertools/corpus/data/TSVStatusBlockReader.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.corpus.data; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.zip.GZIPInputStream; 9 | 10 | 11 | /** 12 | * Abstraction for an stream of statuses, backed by an underlying gzipped file with JSON-encoded 13 | * tweets, one per line. 
14 | */ 15 | public class TSVStatusBlockReader implements StatusStream { 16 | private final BufferedReader br; 17 | 18 | public TSVStatusBlockReader(File file) throws IOException { 19 | 20 | if (!file.getName().endsWith(".gz")) { 21 | throw new IOException("Expecting .gz compressed file!"); 22 | } 23 | 24 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF-8")); 25 | } 26 | 27 | /** 28 | * Returns the next status, or null if no more statuses. 29 | */ 30 | public Status next() throws IOException { 31 | Status nxt = null; 32 | String raw = null; 33 | 34 | while (nxt == null) { 35 | raw = br.readLine(); 36 | 37 | // Check to see if we've reached end of file. 38 | if ( raw == null) { 39 | return null; 40 | } 41 | 42 | nxt = Status.fromTSV(raw); 43 | } 44 | return nxt; 45 | } 46 | 47 | public void close() throws IOException { 48 | br.close(); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /twitter-tools-core/src/attic/java/cc/twittertools/corpus/data/TSVStatusCorpusReader.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.corpus.data; 2 | 3 | import java.io.File; 4 | import java.io.FileFilter; 5 | import java.io.IOException; 6 | 7 | 8 | /** 9 | * Abstraction for a corpus of statuses. A corpus is assumed to consist of a number of blocks, each 10 | * represented by a gzipped file within a root directory. This object will allow to caller to read 11 | * through all blocks, in sorted lexicographic order of the files. 
12 | */ 13 | public class TSVStatusCorpusReader implements StatusStream { 14 | private final File[] files; 15 | private int nextFile = 0; 16 | private TSVStatusBlockReader currentBlock = null; 17 | 18 | public TSVStatusCorpusReader(File file) throws IOException { 19 | 20 | if (!file.isDirectory()) { 21 | throw new IOException("Expecting " + file + " to be a directory!"); 22 | } 23 | 24 | files = file.listFiles(new FileFilter() { 25 | public boolean accept(File path) { 26 | return path.getName().endsWith(".gz") ? true : false; 27 | } 28 | }); 29 | 30 | if (files.length == 0) { 31 | throw new IOException(file + " does not contain any .gz files!"); 32 | } 33 | } 34 | 35 | /** 36 | * Returns the next status, or null if no more statuses. 37 | */ 38 | public Status next() throws IOException { 39 | if (currentBlock == null) { 40 | currentBlock = new TSVStatusBlockReader(files[nextFile]); 41 | nextFile++; 42 | } 43 | 44 | Status status = null; 45 | while (true) { 46 | status = currentBlock.next(); 47 | if (status != null) { 48 | return status; 49 | } 50 | 51 | if (nextFile >= files.length) { 52 | // We're out of files to read. Must be the end of the corpus. 53 | return null; 54 | } 55 | 56 | currentBlock.close(); 57 | // Move to next file. 
58 | currentBlock = new TSVStatusBlockReader(files[nextFile]); 59 | nextFile++; 60 | } 61 | } 62 | 63 | public void close() throws IOException { 64 | currentBlock.close(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.corpus.data; 2 | 3 | import java.util.HashMap; 4 | import java.util.LinkedHashMap; 5 | import java.util.Map; 6 | import java.io.BufferedReader; 7 | import java.io.InputStreamReader; 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.net.URL; 11 | import java.net.URLDecoder; 12 | import java.text.SimpleDateFormat; 13 | import java.util.Date; 14 | import java.util.TimeZone; 15 | 16 | import org.jsoup.Jsoup; 17 | import org.jsoup.nodes.Element; 18 | import org.jsoup.nodes.Document; 19 | import org.jsoup.select.Elements; 20 | 21 | import com.google.gson.Gson; 22 | import com.google.gson.GsonBuilder; 23 | import com.google.gson.JsonObject; 24 | 25 | import org.apache.commons.cli.CommandLine; 26 | import org.apache.commons.cli.CommandLineParser; 27 | import org.apache.commons.cli.GnuParser; 28 | import org.apache.commons.cli.HelpFormatter; 29 | import org.apache.commons.cli.OptionBuilder; 30 | import org.apache.commons.cli.Options; 31 | import org.apache.commons.cli.ParseException; 32 | 33 | public class HTMLStatusExtractor { 34 | 35 | public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); 36 | 37 | public HTMLStatusExtractor() { 38 | date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); 39 | } 40 | 41 | public static Map splitQuery(URL url) 42 | throws java.io.UnsupportedEncodingException { 43 | Map query_pairs = new LinkedHashMap(); 44 | String query = url.getQuery(); 45 | String[] pairs = query.split("&"); 46 | for (String pair : pairs) { 47 | int 
idx = pair.indexOf("="); 48 | query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"), 49 | URLDecoder.decode(pair.substring(idx + 1), "UTF-8")); 50 | } 51 | return query_pairs; 52 | } 53 | 54 | public JsonObject extractTweet(String html) 55 | throws java.net.MalformedURLException, java.io.UnsupportedEncodingException { 56 | JsonObject status = new JsonObject(); 57 | 58 | Document doc = Jsoup.parse(html); 59 | Element tweet_div = doc.select("div.permalink-tweet").first(); 60 | 61 | String tweet_text = tweet_div.select("p.tweet-text").first().text(); 62 | status.addProperty("text", tweet_text); 63 | 64 | String tweet_id = tweet_div.attr("data-tweet-id"); 65 | status.addProperty("id_str", tweet_id); 66 | status.addProperty("id", Long.parseLong(tweet_id)); 67 | 68 | String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time"); 69 | Date created_at = new Date(); 70 | created_at.setTime(Long.parseLong(timestamp) * 1000); 71 | status.addProperty("created_at", date_fmt.format(created_at)); 72 | 73 | Elements js_stats_retweets = doc.select("li.js-stat-retweets"); 74 | if (!js_stats_retweets.isEmpty()) { 75 | status.addProperty("retweeted", true); 76 | String count = js_stats_retweets.select("strong").first().text(); 77 | status.addProperty("retweet_count", Long.parseLong(count)); 78 | } else { 79 | status.addProperty("retweeted", false); 80 | status.addProperty("retweet_count", 0); 81 | } 82 | Elements js_stats_favs = doc.select("li.js-stat-favorites"); 83 | status.addProperty("favorited", !js_stats_favs.isEmpty()); 84 | 85 | 86 | // User subfield 87 | JsonObject user = new JsonObject(); 88 | String user_id = tweet_div.attr("data-user-id"); 89 | user.addProperty("id_str", user_id); 90 | user.addProperty("id", Long.parseLong(user_id)); 91 | String screen_name = tweet_div.attr("data-screen-name"); 92 | user.addProperty("screen_name", screen_name); 93 | String user_name = tweet_div.attr("data-name"); 94 | user.addProperty("name", user_name); 
95 | 96 | status.add("user", user); 97 | 98 | // Geo information 99 | Elements tweet_loc = doc.select("a.tweet-geo-text"); 100 | if (!tweet_loc.isEmpty()) { 101 | JsonObject location = new JsonObject(); 102 | Element loc = tweet_loc.first(); 103 | // Adding http to avoid malformed URL exception 104 | URL url = new URL("http:" + loc.attr("href")); 105 | Map query_params = HTMLStatusExtractor.splitQuery(url); 106 | // Loop over possible query parameters 107 | // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html 108 | String lat_and_long = null; 109 | if ((lat_and_long = query_params.get("ll")) != null 110 | || (lat_and_long = query_params.get("sll")) != null 111 | || (lat_and_long = query_params.get("cbll")) != null 112 | || (lat_and_long = query_params.get("q")) != null) { 113 | String[] coordinates = lat_and_long.split(","); 114 | double latitude = Double.parseDouble(coordinates[0]); 115 | double longitude = Double.parseDouble(coordinates[1]); 116 | location.addProperty("latitude", latitude); 117 | location.addProperty("longitude", longitude); 118 | } 119 | location.addProperty("location_text", loc.text()); 120 | status.add("location", location); 121 | } 122 | 123 | return status; 124 | } 125 | 126 | private static final String HTML_OPTION = "html"; 127 | 128 | @SuppressWarnings("static-access") 129 | public static void main(String[] args) throws Exception { 130 | Options options = new Options(); 131 | options.addOption(OptionBuilder.withArgName("path").hasArg() 132 | .withDescription("HTML file from twitter.com").create(HTML_OPTION)); 133 | 134 | CommandLine cmdline = null; 135 | CommandLineParser parser = new GnuParser(); 136 | try { 137 | cmdline = parser.parse(options, args); 138 | } catch (ParseException exp) { 139 | System.err.println("Error parsing command line: " + exp.getMessage()); 140 | System.exit(-1); 141 | } 142 | 143 | if (!cmdline.hasOption(HTML_OPTION)) { 144 | HelpFormatter formatter = new HelpFormatter(); 145 | 
formatter.printHelp(HTMLStatusExtractor.class.getName(), options); 146 | System.exit(-1); 147 | } 148 | 149 | String html_filename = cmdline.getOptionValue(HTML_OPTION); 150 | BufferedReader html_file = null; 151 | StringBuffer buf = new StringBuffer(); 152 | try { 153 | html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename))); 154 | String line; 155 | while ((line = html_file.readLine()) != null) { 156 | buf.append(line); 157 | buf.append('\n'); 158 | } 159 | } catch (IOException e) { 160 | e.printStackTrace(); 161 | } finally { 162 | html_file.close(); 163 | } 164 | 165 | HTMLStatusExtractor hse = new HTMLStatusExtractor(); 166 | JsonObject json = hse.extractTweet(buf.toString()); 167 | Gson gson = new GsonBuilder().setPrettyPrinting().create(); 168 | System.out.println(gson.toJson(json)); 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/JsonStatusBlockReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.corpus.data; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.File; 21 | import java.io.FileInputStream; 22 | import java.io.IOException; 23 | import java.io.InputStreamReader; 24 | import java.util.zip.GZIPInputStream; 25 | 26 | import com.google.common.base.Preconditions; 27 | 28 | /** 29 | * Abstraction for an stream of statuses, backed by an underlying gzipped file with JSON-encoded 30 | * tweets, one per line. 31 | */ 32 | public class JsonStatusBlockReader implements StatusStream { 33 | private final BufferedReader br; 34 | 35 | public JsonStatusBlockReader(File file) throws IOException { 36 | Preconditions.checkNotNull(file); 37 | 38 | if (!file.getName().endsWith(".gz")) { 39 | throw new IOException("Expecting .gz compressed file!"); 40 | } 41 | 42 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF-8")); 43 | } 44 | 45 | /** 46 | * Returns the next status, or null if no more statuses. 47 | */ 48 | public Status next() throws IOException { 49 | Status nxt = null; 50 | String raw = null; 51 | 52 | while (nxt == null) { 53 | raw = br.readLine(); 54 | 55 | // Check to see if we've reached end of file. 56 | if (raw == null) { 57 | return null; 58 | } 59 | 60 | nxt = Status.fromJson(raw); 61 | } 62 | return nxt; 63 | } 64 | 65 | public void close() throws IOException { 66 | br.close(); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/JsonStatusCorpusReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.corpus.data; 18 | 19 | import java.io.File; 20 | import java.io.FileFilter; 21 | import java.io.IOException; 22 | 23 | import com.google.common.base.Preconditions; 24 | 25 | /** 26 | * Abstraction for a corpus of statuses. A corpus is assumed to consist of a number of blocks, each 27 | * represented by a gzipped file within a root directory. This object will allow to caller to read 28 | * through all blocks, in sorted lexicographic order of the files. 29 | */ 30 | public class JsonStatusCorpusReader implements StatusStream { 31 | private final File[] files; 32 | private int nextFile = 0; 33 | private JsonStatusBlockReader currentBlock = null; 34 | 35 | public JsonStatusCorpusReader(File file) throws IOException { 36 | Preconditions.checkNotNull(file); 37 | 38 | if (!file.isDirectory()) { 39 | throw new IOException("Expecting " + file + " to be a directory!"); 40 | } 41 | 42 | files = file.listFiles(new FileFilter() { 43 | public boolean accept(File path) { 44 | return path.getName().endsWith(".gz") ? true : false; 45 | } 46 | }); 47 | 48 | if (files.length == 0) { 49 | throw new IOException(file + " does not contain any .gz files!"); 50 | } 51 | } 52 | 53 | /** 54 | * Returns the next status, or null if no more statuses. 
55 | */ 56 | public Status next() throws IOException { 57 | if (currentBlock == null) { 58 | currentBlock = new JsonStatusBlockReader(files[nextFile]); 59 | nextFile++; 60 | } 61 | 62 | Status status = null; 63 | while (true) { 64 | status = currentBlock.next(); 65 | if (status != null) { 66 | return status; 67 | } 68 | 69 | if (nextFile >= files.length) { 70 | // We're out of files to read. Must be the end of the corpus. 71 | return null; 72 | } 73 | 74 | currentBlock.close(); 75 | // Move to next file. 76 | currentBlock = new JsonStatusBlockReader(files[nextFile]); 77 | nextFile++; 78 | } 79 | } 80 | 81 | public void close() throws IOException { 82 | currentBlock.close(); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/data/StatusStream.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.corpus.data; 18 | 19 | import java.io.IOException; 20 | 21 | /** 22 | * Abstraction for a stream of statuses. Ordering of the statuses is left to the implementation. 
23 | */ 24 | public interface StatusStream { 25 | public Status next() throws IOException; 26 | public void close() throws IOException; 27 | } 28 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/corpus/demo/ReadStatuses.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.corpus.demo; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.log4j.Logger; 30 | 31 | import cc.twittertools.corpus.data.JsonStatusBlockReader; 32 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 33 | import cc.twittertools.corpus.data.Status; 34 | import cc.twittertools.corpus.data.StatusStream; 35 | 36 | /** 37 | * Sample program to illustrate how to work with {@link StatusStream}. 
38 | */ 39 | public class ReadStatuses { 40 | private static final Logger LOG = Logger.getLogger(ReadStatuses.class); 41 | 42 | private ReadStatuses() {} 43 | 44 | private static final String INPUT_OPTION = "input"; 45 | private static final String VERBOSE_OPTION = "verbose"; 46 | private static final String DUMP_OPTION = "dump"; 47 | 48 | @SuppressWarnings("static-access") 49 | public static void main(String[] args) throws Exception { 50 | Options options = new Options(); 51 | options.addOption(OptionBuilder.withArgName("path").hasArg() 52 | .withDescription("input directory or file").create(INPUT_OPTION)); 53 | options.addOption(VERBOSE_OPTION, false, "print logging output every 10000 tweets"); 54 | options.addOption(DUMP_OPTION, false, "dump statuses"); 55 | 56 | CommandLine cmdline = null; 57 | CommandLineParser parser = new GnuParser(); 58 | try { 59 | cmdline = parser.parse(options, args); 60 | } catch (ParseException exp) { 61 | System.err.println("Error parsing command line: " + exp.getMessage()); 62 | System.exit(-1); 63 | } 64 | 65 | if (!cmdline.hasOption(INPUT_OPTION)) { 66 | HelpFormatter formatter = new HelpFormatter(); 67 | formatter.printHelp(ReadStatuses.class.getName(), options); 68 | System.exit(-1); 69 | } 70 | 71 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 72 | 73 | StatusStream stream; 74 | // Figure out if we're reading from HTML SequenceFiles or JSON. 
75 | File file = new File(cmdline.getOptionValue(INPUT_OPTION)); 76 | if (!file.exists()) { 77 | System.err.println("Error: " + file + " does not exist!"); 78 | System.exit(-1); 79 | } 80 | 81 | if (file.isDirectory()) { 82 | stream = new JsonStatusCorpusReader(file); 83 | } else { 84 | stream = new JsonStatusBlockReader(file); 85 | } 86 | 87 | int cnt = 0; 88 | Status status; 89 | while ((status = stream.next()) != null) { 90 | if (cmdline.hasOption(DUMP_OPTION)) { 91 | String text = status.getText(); 92 | if (text != null) { 93 | text = text.replaceAll("\\s+", " "); 94 | text = text.replaceAll("\0", ""); 95 | } 96 | out.println(String.format("%d\t%s\t%s\t%s", status.getId(), status.getScreenname(), 97 | status.getCreatedAt(), text)); 98 | } 99 | cnt++; 100 | if ( cnt % 10000 == 0 && cmdline.hasOption(VERBOSE_OPTION)) { 101 | LOG.info(cnt + " statuses read"); 102 | } 103 | } 104 | stream.close(); 105 | LOG.info(String.format("Total of %s statuses read.", cnt)); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTermStatisticsFromIndex.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.index; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.lucene.index.DirectoryReader; 30 | import org.apache.lucene.index.IndexReader; 31 | import org.apache.lucene.index.SlowCompositeReaderWrapper; 32 | import org.apache.lucene.index.Terms; 33 | import org.apache.lucene.index.TermsEnum; 34 | import org.apache.lucene.store.FSDirectory; 35 | import org.apache.lucene.util.BytesRef; 36 | 37 | import cc.twittertools.index.IndexStatuses.StatusField; 38 | 39 | public class ExtractTermStatisticsFromIndex { 40 | private static final String INDEX_OPTION = "index"; 41 | private static final String MIN_OPTION = "min"; 42 | 43 | @SuppressWarnings("static-access") 44 | public static void main(String[] args) throws Exception { 45 | Options options = new Options(); 46 | 47 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 48 | .withDescription("index").create(INDEX_OPTION)); 49 | options.addOption(OptionBuilder.withArgName("num").hasArg() 50 | .withDescription("min").create(MIN_OPTION)); 51 | 52 | CommandLine cmdline = null; 53 | CommandLineParser parser = new GnuParser(); 54 | try { 55 | cmdline = parser.parse(options, args); 56 | } catch (ParseException exp) { 57 | System.err.println("Error parsing command line: " + exp.getMessage()); 58 | System.exit(-1); 59 | } 60 | 61 | if (!cmdline.hasOption(INDEX_OPTION)) { 62 | HelpFormatter formatter = new HelpFormatter(); 63 | formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options); 64 | System.exit(-1); 65 | } 66 | 67 | String indexLocation = cmdline.getOptionValue(INDEX_OPTION); 68 
| int min = cmdline.hasOption(MIN_OPTION) ? 69 | Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1; 70 | 71 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 72 | 73 | IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); 74 | Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name); 75 | TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY); 76 | 77 | long missingCnt = 0; 78 | int skippedTerms = 0; 79 | BytesRef bytes = new BytesRef(); 80 | while ( (bytes = termsEnum.next()) != null) { 81 | byte[] buf = new byte[bytes.length]; 82 | System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length); 83 | String term = new String(buf, "UTF-8"); 84 | int df = termsEnum.docFreq(); 85 | long cf = termsEnum.totalTermFreq(); 86 | 87 | if ( df < min) { 88 | skippedTerms++; 89 | missingCnt += cf; 90 | continue; 91 | } 92 | 93 | out.println(term + "\t" + df + "\t" + cf); 94 | } 95 | 96 | reader.close(); 97 | out.close(); 98 | System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTweetidsFromCollection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.index; 18 | 19 | import java.io.File; 20 | 21 | import org.apache.commons.cli.CommandLine; 22 | import org.apache.commons.cli.CommandLineParser; 23 | import org.apache.commons.cli.GnuParser; 24 | import org.apache.commons.cli.HelpFormatter; 25 | import org.apache.commons.cli.OptionBuilder; 26 | import org.apache.commons.cli.Options; 27 | import org.apache.commons.cli.ParseException; 28 | 29 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 30 | import cc.twittertools.corpus.data.Status; 31 | import cc.twittertools.corpus.data.StatusStream; 32 | 33 | public class ExtractTweetidsFromCollection { 34 | private static final String COLLECTION_OPTION = "collection"; 35 | 36 | @SuppressWarnings("static-access") 37 | public static void main(String[] args) throws Exception { 38 | Options options = new Options(); 39 | 40 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 41 | .withDescription("source collection directory").create(COLLECTION_OPTION)); 42 | 43 | CommandLine cmdline = null; 44 | CommandLineParser parser = new GnuParser(); 45 | try { 46 | cmdline = parser.parse(options, args); 47 | } catch (ParseException exp) { 48 | System.err.println("Error parsing command line: " + exp.getMessage()); 49 | System.exit(-1); 50 | } 51 | 52 | if (!cmdline.hasOption(COLLECTION_OPTION)) { 53 | HelpFormatter formatter = new HelpFormatter(); 54 | formatter.printHelp(ExtractTweetidsFromCollection.class.getName(), options); 55 | System.exit(-1); 56 | } 57 | 58 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); 59 | 60 | File file = new File(collectionPath); 61 | if (!file.exists()) { 62 | System.err.println("Error: " + file + " does not exist!"); 63 | System.exit(-1); 64 | } 65 | 66 | StatusStream stream = new JsonStatusCorpusReader(file); 67 | 68 | Status status; 69 | while ((status = stream.next()) != null) { 70 | System.out.println(status.getId() + "\t" + status.getScreenname()); 71 | } 72 | } 73 | } 74 
| -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTweetidsFromIndex.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.index; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.lucene.document.Document; 30 | import org.apache.lucene.index.DirectoryReader; 31 | import org.apache.lucene.index.IndexReader; 32 | import org.apache.lucene.store.FSDirectory; 33 | 34 | import cc.twittertools.index.IndexStatuses.StatusField; 35 | 36 | /** 37 | * Reference implementation for indexing statuses. 
38 | */ 39 | public class ExtractTweetidsFromIndex { 40 | private ExtractTweetidsFromIndex() {} 41 | 42 | private static final String INDEX_OPTION = "index"; 43 | 44 | @SuppressWarnings("static-access") 45 | public static void main(String[] args) throws Exception { 46 | Options options = new Options(); 47 | 48 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 49 | .withDescription("index location").create(INDEX_OPTION)); 50 | 51 | CommandLine cmdline = null; 52 | CommandLineParser parser = new GnuParser(); 53 | try { 54 | cmdline = parser.parse(options, args); 55 | } catch (ParseException exp) { 56 | System.err.println("Error parsing command line: " + exp.getMessage()); 57 | System.exit(-1); 58 | } 59 | 60 | if (!cmdline.hasOption(INDEX_OPTION)) { 61 | HelpFormatter formatter = new HelpFormatter(); 62 | formatter.printHelp(ExtractTweetidsFromIndex.class.getName(), options); 63 | System.exit(-1); 64 | } 65 | 66 | File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); 67 | if (!indexLocation.exists()) { 68 | System.err.println("Error: " + indexLocation + " does not exist!"); 69 | System.exit(-1); 70 | } 71 | 72 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); 73 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 74 | for (int i=0; i 0); 30 | this.time = time; 31 | } 32 | 33 | public String getId() { 34 | return id; 35 | } 36 | 37 | public String getQuery() { 38 | return query; 39 | } 40 | 41 | public long getQueryTweetTime() { 42 | return time; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/TrecTopicSet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search; 18 | 19 | import java.io.File; 20 | import java.io.IOException; 21 | import java.util.Iterator; 22 | import java.util.List; 23 | import java.util.regex.Matcher; 24 | import java.util.regex.Pattern; 25 | 26 | import com.google.common.base.Charsets; 27 | import com.google.common.base.Joiner; 28 | import com.google.common.base.Preconditions; 29 | import com.google.common.collect.Lists; 30 | import com.google.common.io.Files; 31 | 32 | public class TrecTopicSet implements Iterable{ 33 | private List queries = Lists.newArrayList(); 34 | 35 | private TrecTopicSet() {} 36 | 37 | private void add(TrecTopic q) { 38 | queries.add(q); 39 | } 40 | 41 | @Override 42 | public Iterator iterator() { 43 | return queries.iterator(); 44 | } 45 | 46 | private static final Pattern TOP_PATTERN = Pattern.compile("", Pattern.DOTALL); 47 | private static final Pattern NUM_PATTERN = Pattern.compile(" Number: (MB\\d+) ", Pattern.DOTALL); 48 | 49 | // TREC 2011 topics uses tag 50 | private static final Pattern TITLE_PATTERN = Pattern.compile("<title>\\s*(.*?)\\s*", Pattern.DOTALL); 51 | // TREC 2012 topics use tag 52 | private static final Pattern TITLE_PATTERN2 = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL); 53 | 54 | private static final Pattern TWEETTIME_PATTERN = Pattern.compile("\\s*(\\d+)\\s*", Pattern.DOTALL); 55 | 56 | public static TrecTopicSet fromFile(File f) throws IOException { 57 | Preconditions.checkNotNull(f); 58 | Preconditions.checkArgument(f.exists()); 59 | 60 | String s = 
Joiner.on("\n").join(Files.readLines(f, Charsets.UTF_8)); 61 | TrecTopicSet queries = new TrecTopicSet(); 62 | 63 | Matcher matcher = TOP_PATTERN.matcher(s); 64 | while (matcher.find()) { 65 | String top = matcher.group(0); 66 | 67 | Matcher m = NUM_PATTERN.matcher(top); 68 | if (!m.find()) { 69 | throw new IOException("Error parsing " + f); 70 | } 71 | String id = m.group(1); 72 | // Topics from 2012 are inconsistently numbered, 73 | // e.g., MB051 should match the qrels, which has MB51 74 | if (id.matches("MB0\\d\\d")) { 75 | id = id.replace("MB0", "MB"); 76 | } 77 | 78 | m = TITLE_PATTERN.matcher(top); 79 | if (!m.find()) { 80 | m = TITLE_PATTERN2.matcher(top); 81 | if (!m.find()) { 82 | throw new IOException("Error parsing " + f); 83 | } 84 | } 85 | String text = m.group(1); 86 | 87 | m = TWEETTIME_PATTERN.matcher(top); 88 | if (!m.find()) { 89 | throw new IOException("Error parsing " + f); 90 | } 91 | long time = Long.parseLong(m.group(1)); 92 | queries.add(new TrecTopic(id, text, time)); 93 | } 94 | return queries; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/RunQueriesThrift.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | import java.util.List; 22 | import java.util.Set; 23 | import java.util.HashSet; 24 | 25 | import org.apache.commons.cli.CommandLine; 26 | import org.apache.commons.cli.CommandLineParser; 27 | import org.apache.commons.cli.GnuParser; 28 | import org.apache.commons.cli.HelpFormatter; 29 | import org.apache.commons.cli.Option; 30 | import org.apache.commons.cli.OptionBuilder; 31 | import org.apache.commons.cli.Options; 32 | import org.apache.commons.cli.ParseException; 33 | 34 | import cc.twittertools.search.TrecTopicSet; 35 | import cc.twittertools.thrift.gen.TResult; 36 | 37 | public class RunQueriesThrift { 38 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 39 | 40 | private static final String HOST_OPTION = "host"; 41 | private static final String PORT_OPTION = "port"; 42 | private static final String QUERIES_OPTION = "queries"; 43 | private static final String NUM_RESULTS_OPTION = "num_results"; 44 | private static final String GROUP_OPTION = "group"; 45 | private static final String TOKEN_OPTION = "token"; 46 | private static final String RUNTAG_OPTION = "runtag"; 47 | private static final String VERBOSE_OPTION = "verbose"; 48 | 49 | private RunQueriesThrift() {} 50 | 51 | @SuppressWarnings("static-access") 52 | public static void main(String[] args) throws Exception { 53 | Options options = new Options(); 54 | 55 | options.addOption(OptionBuilder.withArgName("string").hasArg() 56 | .withDescription("host").create(HOST_OPTION)); 57 | options.addOption(OptionBuilder.withArgName("port").hasArg() 58 | .withDescription("port").create(PORT_OPTION)); 59 | options.addOption(OptionBuilder.withArgName("file").hasArg() 60 | .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); 61 | options.addOption(OptionBuilder.withArgName("num").hasArg() 62 | .withDescription("number of results to 
return").create(NUM_RESULTS_OPTION)); 63 | options.addOption(OptionBuilder.withArgName("string").hasArg() 64 | .withDescription("group id").create(GROUP_OPTION)); 65 | options.addOption(OptionBuilder.withArgName("string").hasArg() 66 | .withDescription("access token").create(TOKEN_OPTION)); 67 | options.addOption(OptionBuilder.withArgName("string").hasArg() 68 | .withDescription("runtag").create(RUNTAG_OPTION)); 69 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); 70 | 71 | CommandLine cmdline = null; 72 | CommandLineParser parser = new GnuParser(); 73 | try { 74 | cmdline = parser.parse(options, args); 75 | } catch (ParseException exp) { 76 | System.err.println("Error parsing command line: " + exp.getMessage()); 77 | System.exit(-1); 78 | } 79 | 80 | if (!cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION) 81 | || !cmdline.hasOption(QUERIES_OPTION)) { 82 | HelpFormatter formatter = new HelpFormatter(); 83 | formatter.printHelp(RunQueriesThrift.class.getName(), options); 84 | System.exit(-1); 85 | } 86 | 87 | String queryFile = cmdline.getOptionValue(QUERIES_OPTION); 88 | if (!new File(queryFile).exists()) { 89 | System.err.println("Error: " + queryFile + " doesn't exist!"); 90 | System.exit(-1); 91 | } 92 | 93 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 94 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; 95 | 96 | TrecTopicSet topicsFile = TrecTopicSet.fromFile(new File(queryFile)); 97 | 98 | int numResults = 1000; 99 | try { 100 | if (cmdline.hasOption(NUM_RESULTS_OPTION)) { 101 | numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); 102 | } 103 | } catch (NumberFormatException e) { 104 | System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); 105 | System.exit(-1); 106 | } 107 | 108 | String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null; 109 | String token = cmdline.hasOption(TOKEN_OPTION) ? 
cmdline.getOptionValue(TOKEN_OPTION) : null; 110 | 111 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION); 112 | 113 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 114 | 115 | TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION), 116 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token); 117 | 118 | for (cc.twittertools.search.TrecTopic query : topicsFile) { 119 | List results = client.search(query.getQuery(), 120 | query.getQueryTweetTime(), numResults); 121 | int i = 1; 122 | Set tweetIds = new HashSet(); 123 | for (TResult result : results) { 124 | if (!tweetIds.contains(result.id)) { 125 | // The TREC official qrels don't have the "MB" prefix and trailing zeros, so we perform 126 | // this transformation so that trec_eval doesn't complain. 127 | String qid = query.getId().replaceFirst("^MB0*", ""); 128 | tweetIds.add(result.id); 129 | out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag)); 130 | if (verbose) { 131 | out.println("# " + result.toString().replaceAll("[\\n\\r]+", " ")); 132 | } 133 | i++; 134 | } 135 | } 136 | } 137 | out.close(); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/SearchStatusesThrift.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.PrintStream; 20 | import java.util.List; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.Option; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | 31 | import cc.twittertools.thrift.gen.TResult; 32 | 33 | public class SearchStatusesThrift { 34 | // Defaults: if user doesn't specify an actual query, run MB01 as a demo. 35 | private static final String DEFAULT_QID = "MB01"; 36 | private static final String DEFAULT_Q = "BBC World Service staff cuts"; 37 | private static final long DEFAULT_MAX_ID = 34952194402811905L; 38 | private static final int DEFAULT_NUM_RESULTS = 10; 39 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 40 | 41 | private static final String HELP_OPTION = "h"; 42 | private static final String HOST_OPTION = "host"; 43 | private static final String PORT_OPTION = "port"; 44 | private static final String QID_OPTION = "qid"; 45 | private static final String QUERY_OPTION = "q"; 46 | private static final String RUNTAG_OPTION = "runtag"; 47 | private static final String MAX_ID_OPTION = "max_id"; 48 | private static final String NUM_RESULTS_OPTION = "num_results"; 49 | private static final String GROUP_OPTION = "group"; 50 | private static final String TOKEN_OPTION = "token"; 51 | private static final String VERBOSE_OPTION = "verbose"; 52 | 53 | @SuppressWarnings("static-access") 54 | public static void main(String[] args) throws Exception { 55 | Options options = new Options(); 56 | 57 | options.addOption(new Option(HELP_OPTION, "show help")); 58 | 
options.addOption(OptionBuilder.withArgName("string").hasArg() 59 | .withDescription("host").create(HOST_OPTION)); 60 | options.addOption(OptionBuilder.withArgName("port").hasArg() 61 | .withDescription("port").create(PORT_OPTION)); 62 | options.addOption(OptionBuilder.withArgName("string").hasArg() 63 | .withDescription("query id").create(QID_OPTION)); 64 | options.addOption(OptionBuilder.withArgName("string").hasArg() 65 | .withDescription("query text").create(QUERY_OPTION)); 66 | options.addOption(OptionBuilder.withArgName("string").hasArg() 67 | .withDescription("runtag").create(RUNTAG_OPTION)); 68 | options.addOption(OptionBuilder.withArgName("num").hasArg() 69 | .withDescription("maxid").create(MAX_ID_OPTION)); 70 | options.addOption(OptionBuilder.withArgName("num").hasArg() 71 | .withDescription("number of results to return").create(NUM_RESULTS_OPTION)); 72 | options.addOption(OptionBuilder.withArgName("string").hasArg() 73 | .withDescription("group id").create(GROUP_OPTION)); 74 | options.addOption(OptionBuilder.withArgName("string").hasArg() 75 | .withDescription("access token").create(TOKEN_OPTION)); 76 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); 77 | 78 | CommandLine cmdline = null; 79 | CommandLineParser parser = new GnuParser(); 80 | try { 81 | cmdline = parser.parse(options, args); 82 | } catch (ParseException exp) { 83 | System.err.println("Error parsing command line: " + exp.getMessage()); 84 | System.exit(-1); 85 | } 86 | 87 | if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(HOST_OPTION) 88 | || !cmdline.hasOption(PORT_OPTION)) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(SearchStatusesThrift.class.getName(), options); 91 | System.exit(-1); 92 | } 93 | 94 | String qid = cmdline.hasOption(QID_OPTION) ? 95 | cmdline.getOptionValue(QID_OPTION) : DEFAULT_QID; 96 | String query = cmdline.hasOption(QUERY_OPTION) ? 
97 | cmdline.getOptionValue(QUERY_OPTION) : DEFAULT_Q; 98 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 99 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; 100 | long maxId = cmdline.hasOption(MAX_ID_OPTION) ? 101 | Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)) : DEFAULT_MAX_ID; 102 | int numResults = cmdline.hasOption(NUM_RESULTS_OPTION) ? 103 | Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)) : DEFAULT_NUM_RESULTS; 104 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION); 105 | 106 | String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null; 107 | String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null; 108 | TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION), 109 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token); 110 | 111 | System.err.println("qid: " + qid); 112 | System.err.println("q: " + query); 113 | System.err.println("max_id: " + maxId); 114 | System.err.println("num_results: " + numResults); 115 | 116 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 117 | 118 | List results = client.search(query, maxId, numResults); 119 | int i = 1; 120 | for (TResult result : results) { 121 | out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag)); 122 | if (verbose) { 123 | System.out.println("# " + result.toString().replaceAll("[\\n\\r]+", " ")); 124 | } 125 | i++; 126 | } 127 | out.close(); 128 | } 129 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TResultComparable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import cc.twittertools.thrift.gen.TResult; 20 | 21 | public class TResultComparable implements Comparable { 22 | private TResult tresult; 23 | 24 | public TResultComparable(TResult tresult) { 25 | this.tresult = tresult; 26 | } 27 | 28 | public TResult getTResult() { 29 | return tresult; 30 | } 31 | 32 | public int compareTo(TResultComparable other) { 33 | if (tresult.rsv > other.tresult.rsv) { 34 | return -1; 35 | } else if (tresult.rsv < other.tresult.rsv) { 36 | return 1; 37 | } else { 38 | if (tresult.id > other.tresult.id) { 39 | return -1; 40 | } else if (tresult.id < other.tresult.id) { 41 | return 1; 42 | } else { 43 | return 0; 44 | } 45 | } 46 | } 47 | 48 | public boolean equals(Object other) { 49 | if (other == null) { 50 | return false; 51 | } if (other.getClass() != this.getClass()) { 52 | return false; 53 | } 54 | 55 | return ((TResultComparable) other).tresult.id == this.tresult.id; 56 | } 57 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchHandler.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.File; 20 | import java.io.IOException; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | import javax.annotation.Nullable; 25 | 26 | import org.apache.log4j.Logger; 27 | import org.apache.lucene.document.Document; 28 | import org.apache.lucene.index.DirectoryReader; 29 | import org.apache.lucene.index.IndexReader; 30 | import org.apache.lucene.queryparser.classic.QueryParser; 31 | import org.apache.lucene.search.Filter; 32 | import org.apache.lucene.search.IndexSearcher; 33 | import org.apache.lucene.search.NumericRangeFilter; 34 | import org.apache.lucene.search.Query; 35 | import org.apache.lucene.search.ScoreDoc; 36 | import org.apache.lucene.search.TopDocs; 37 | import org.apache.lucene.search.similarities.LMDirichletSimilarity; 38 | import org.apache.lucene.store.FSDirectory; 39 | import org.apache.lucene.util.Version; 40 | 41 | import cc.twittertools.index.IndexStatuses; 42 | import cc.twittertools.index.IndexStatuses.StatusField; 43 | import cc.twittertools.thrift.gen.TQuery; 44 | import cc.twittertools.thrift.gen.TResult; 45 | import cc.twittertools.thrift.gen.TrecSearch; 46 | import cc.twittertools.thrift.gen.TrecSearchException; 47 | 48 | import com.google.common.base.Preconditions; 49 | import com.google.common.collect.Lists; 50 | 51 | public class TrecSearchHandler implements TrecSearch.Iface { 52 | private static final Logger LOG = Logger.getLogger(TrecSearchHandler.class); 53 | 54 | private static QueryParser 
QUERY_PARSER = 55 | new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER); 56 | 57 | private final IndexSearcher searcher; 58 | private final Map credentials; 59 | 60 | public TrecSearchHandler(File indexPath, @Nullable Map credentials) 61 | throws IOException { 62 | Preconditions.checkNotNull(indexPath); 63 | Preconditions.checkArgument(indexPath.exists()); 64 | 65 | // Can be null, in which case we don't check for credentials. 66 | this.credentials = credentials; 67 | 68 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); 69 | searcher = new IndexSearcher(reader); 70 | searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); 71 | } 72 | 73 | public List search(TQuery query) throws TrecSearchException { 74 | Preconditions.checkNotNull(query); 75 | 76 | LOG.info(String.format("Incoming request (%s, %s)", query.group, query.token)); 77 | 78 | // Verify credentials. 79 | if (credentials != null && (!credentials.containsKey(query.group) || 80 | !credentials.get(query.group).equals(query.token))) { 81 | LOG.info(String.format("Access denied for (%s, %s)", query.group, query.token)); 82 | throw new TrecSearchException("Invalid credentials: access denied."); 83 | } 84 | 85 | List results = Lists.newArrayList(); 86 | long startTime = System.currentTimeMillis(); 87 | 88 | try { 89 | Filter filter = 90 | NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, query.max_id, true, true); 91 | 92 | Query q = QUERY_PARSER.parse(query.text); 93 | int num = query.num_results > 10000 ? 
10000 : query.num_results; 94 | TopDocs rs = searcher.search(q, filter, num); 95 | for (ScoreDoc scoreDoc : rs.scoreDocs) { 96 | Document hit = searcher.doc(scoreDoc.doc); 97 | 98 | TResult p = new TResult(); 99 | p.id = (Long) hit.getField(StatusField.ID.name).numericValue(); 100 | p.screen_name = hit.get(StatusField.SCREEN_NAME.name); 101 | p.epoch = (Long) hit.getField(StatusField.EPOCH.name).numericValue(); 102 | p.text = hit.get(StatusField.TEXT.name); 103 | p.rsv = scoreDoc.score; 104 | 105 | p.followers_count = (Integer) hit.getField(StatusField.FOLLOWERS_COUNT.name).numericValue(); 106 | p.statuses_count = (Integer) hit.getField(StatusField.STATUSES_COUNT.name).numericValue(); 107 | 108 | if ( hit.get(StatusField.LANG.name) != null) { 109 | p.lang = hit.get(StatusField.LANG.name); 110 | } 111 | 112 | if ( hit.get(StatusField.IN_REPLY_TO_STATUS_ID.name) != null) { 113 | p.in_reply_to_status_id = (Long) hit.getField(StatusField.IN_REPLY_TO_STATUS_ID.name).numericValue(); 114 | } 115 | 116 | if ( hit.get(StatusField.IN_REPLY_TO_USER_ID.name) != null) { 117 | p.in_reply_to_user_id = (Long) hit.getField(StatusField.IN_REPLY_TO_USER_ID.name).numericValue(); 118 | } 119 | 120 | if ( hit.get(StatusField.RETWEETED_STATUS_ID.name) != null) { 121 | p.retweeted_status_id = (Long) hit.getField(StatusField.RETWEETED_STATUS_ID.name).numericValue(); 122 | } 123 | 124 | if ( hit.get(StatusField.RETWEETED_USER_ID.name) != null) { 125 | p.retweeted_user_id = (Long) hit.getField(StatusField.RETWEETED_USER_ID.name).numericValue(); 126 | } 127 | 128 | if ( hit.get(StatusField.RETWEET_COUNT.name) != null) { 129 | p.retweeted_count = (Integer) hit.getField(StatusField.RETWEET_COUNT.name).numericValue(); 130 | } 131 | 132 | results.add(p); 133 | } 134 | } catch (Exception e) { 135 | e.printStackTrace(); 136 | throw new TrecSearchException(e.getMessage()); 137 | } 138 | 139 | long endTime = System.currentTimeMillis(); 140 | LOG.info(String.format("%4dms %s", (endTime - startTime), 
query.toString())); 141 | 142 | return results; 143 | } 144 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchThriftClient.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.util.List; 20 | 21 | import javax.annotation.Nullable; 22 | 23 | import org.apache.thrift.TException; 24 | import org.apache.thrift.protocol.TBinaryProtocol; 25 | import org.apache.thrift.transport.TSocket; 26 | import org.apache.thrift.transport.TTransport; 27 | 28 | import cc.twittertools.thrift.gen.TQuery; 29 | import cc.twittertools.thrift.gen.TResult; 30 | import cc.twittertools.thrift.gen.TrecSearch; 31 | 32 | import com.google.common.base.Preconditions; 33 | 34 | public class TrecSearchThriftClient { 35 | private final String group; 36 | private final String token; 37 | private final String host; 38 | private final int port; 39 | 40 | public TrecSearchThriftClient(String host, int port, 41 | @Nullable String group, @Nullable String token) { 42 | Preconditions.checkNotNull(host); 43 | Preconditions.checkArgument(port > 0); 44 | this.group = group; 45 | this.token = token; 46 | this.host= host; 47 | this.port = port; 48 | } 49 | 50 | public List search(String 
query, long maxId, int numResults) throws TException { 51 | TTransport transport = new TSocket(host, port); 52 | transport.open(); 53 | 54 | TrecSearch.Client client = new TrecSearch.Client(new TBinaryProtocol(transport)); 55 | 56 | TQuery q = new TQuery(); 57 | q.text = query; 58 | q.max_id = maxId; 59 | q.num_results = numResults; 60 | 61 | q.group = group; 62 | q.token = token; 63 | 64 | List results = client.search(q); 65 | transport.close(); 66 | 67 | return results; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchThriftServer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.search.api; 18 | 19 | import java.io.File; 20 | import java.util.Map; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.Option; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.thrift.protocol.TBinaryProtocol; 31 | import org.apache.thrift.server.TServer; 32 | import org.apache.thrift.server.TThreadPoolServer; 33 | import org.apache.thrift.transport.TServerSocket; 34 | 35 | import cc.twittertools.thrift.gen.TrecSearch; 36 | 37 | import com.google.common.base.Charsets; 38 | import com.google.common.collect.Maps; 39 | import com.google.common.io.Files; 40 | 41 | public class TrecSearchThriftServer { 42 | private static final int DEFAULT_PORT = 9090; 43 | private static final int DEFAULT_MAX_THREADS = 8; 44 | 45 | private static final String HELP_OPTION = "h"; 46 | private static final String INDEX_OPTION = "index"; 47 | private static final String PORT_OPTION = "port"; 48 | private static final String MAX_THREADS_OPTION = "max_threads"; 49 | private static final String CREDENTIALS_OPTION = "credentials"; 50 | 51 | @SuppressWarnings("static-access") 52 | public static void main(String[] args) throws Exception { 53 | Options options = new Options(); 54 | 55 | options.addOption(new Option(HELP_OPTION, "show help")); 56 | options.addOption(OptionBuilder.withArgName("port").hasArg() 57 | .withDescription("port").create(PORT_OPTION)); 58 | options.addOption(OptionBuilder.withArgName("index").hasArg() 59 | .withDescription("index location").create(INDEX_OPTION)); 60 | options.addOption(OptionBuilder.withArgName("num").hasArg() 61 | .withDescription("max number of threads in thread pool").create(MAX_THREADS_OPTION)); 62 | 
options.addOption(OptionBuilder.withArgName("file").hasArg() 63 | .withDescription("file containing access tokens").create(CREDENTIALS_OPTION)); 64 | 65 | CommandLine cmdline = null; 66 | CommandLineParser parser = new GnuParser(); 67 | try { 68 | cmdline = parser.parse(options, args); 69 | } catch (ParseException exp) { 70 | System.err.println("Error parsing command line: " + exp.getMessage()); 71 | System.exit(-1); 72 | } 73 | 74 | if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { 75 | HelpFormatter formatter = new HelpFormatter(); 76 | formatter.printHelp(TrecSearchThriftServer.class.getName(), options); 77 | System.exit(-1); 78 | } 79 | 80 | int port = cmdline.hasOption(PORT_OPTION) ? 81 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)) : DEFAULT_PORT; 82 | int maxThreads = cmdline.hasOption(MAX_THREADS_OPTION) ? 83 | Integer.parseInt(cmdline.getOptionValue(MAX_THREADS_OPTION)) : DEFAULT_MAX_THREADS; 84 | File index = new File(cmdline.getOptionValue(INDEX_OPTION)); 85 | 86 | Map credentials = null; 87 | if (cmdline.hasOption(CREDENTIALS_OPTION)) { 88 | credentials = Maps.newHashMap(); 89 | File cfile = new File(cmdline.getOptionValue(CREDENTIALS_OPTION)); 90 | if (!cfile.exists()) { 91 | System.err.println("Error: " + cfile + " does not exist!"); 92 | System.exit(-1); 93 | } 94 | for (String s : Files.readLines(cfile, Charsets.UTF_8)) { 95 | try { 96 | String[] arr = s.split(":"); 97 | credentials.put(arr[0], arr[1]); 98 | } catch (Exception e){ 99 | // Catch any exceptions from parsing file contain access tokens 100 | System.err.println("Error reading access tokens from " + cfile + "!"); 101 | System.exit(-1); 102 | } 103 | } 104 | } 105 | 106 | if (!index.exists()) { 107 | System.err.println("Error: " + index + " does not exist!"); 108 | System.exit(-1); 109 | } 110 | 111 | TServerSocket serverSocket = new TServerSocket(port); 112 | TrecSearch.Processor searchProcessor = 113 | new TrecSearch.Processor(new TrecSearchHandler(index, 
credentials)); 114 | 115 | TThreadPoolServer.Args serverArgs = new TThreadPoolServer.Args(serverSocket); 116 | serverArgs.maxWorkerThreads(maxThreads); 117 | TServer thriftServer = new TThreadPoolServer(serverArgs.processor(searchProcessor) 118 | .protocolFactory(new TBinaryProtocol.Factory())); 119 | 120 | thriftServer.serve(); 121 | } 122 | } -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/search/local/RunQueries.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.search.local; 18 | 19 | import java.io.File; 20 | import java.io.PrintStream; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.Option; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.lucene.document.Document; 31 | import org.apache.lucene.index.DirectoryReader; 32 | import org.apache.lucene.index.IndexReader; 33 | import org.apache.lucene.queryparser.classic.QueryParser; 34 | import org.apache.lucene.search.Filter; 35 | import org.apache.lucene.search.IndexSearcher; 36 | import org.apache.lucene.search.NumericRangeFilter; 37 | import org.apache.lucene.search.Query; 38 | import org.apache.lucene.search.ScoreDoc; 39 | import org.apache.lucene.search.TopDocs; 40 | import org.apache.lucene.search.similarities.BM25Similarity; 41 | import org.apache.lucene.search.similarities.LMDirichletSimilarity; 42 | import org.apache.lucene.store.FSDirectory; 43 | import org.apache.lucene.util.Version; 44 | 45 | import cc.twittertools.index.IndexStatuses; 46 | import cc.twittertools.index.IndexStatuses.StatusField; 47 | import cc.twittertools.search.TrecTopic; 48 | import cc.twittertools.search.TrecTopicSet; 49 | 50 | public class RunQueries { 51 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 52 | 53 | private static final String INDEX_OPTION = "index"; 54 | private static final String QUERIES_OPTION = "queries"; 55 | private static final String NUM_RESULTS_OPTION = "num_results"; 56 | private static final String SIMILARITY_OPTION = "similarity"; 57 | private static final String RUNTAG_OPTION = "runtag"; 58 | private static final String VERBOSE_OPTION = "verbose"; 59 | 60 | private RunQueries() {} 61 | 62 | 
@SuppressWarnings("static-access") 63 | public static void main(String[] args) throws Exception { 64 | Options options = new Options(); 65 | 66 | options.addOption(OptionBuilder.withArgName("path").hasArg() 67 | .withDescription("index location").create(INDEX_OPTION)); 68 | options.addOption(OptionBuilder.withArgName("num").hasArg() 69 | .withDescription("number of results to return").create(NUM_RESULTS_OPTION)); 70 | options.addOption(OptionBuilder.withArgName("file").hasArg() 71 | .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); 72 | options.addOption(OptionBuilder.withArgName("similarity").hasArg() 73 | .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); 74 | options.addOption(OptionBuilder.withArgName("string").hasArg() 75 | .withDescription("runtag").create(RUNTAG_OPTION)); 76 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); 77 | 78 | CommandLine cmdline = null; 79 | CommandLineParser parser = new GnuParser(); 80 | try { 81 | cmdline = parser.parse(options, args); 82 | } catch (ParseException exp) { 83 | System.err.println("Error parsing command line: " + exp.getMessage()); 84 | System.exit(-1); 85 | } 86 | 87 | if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { 88 | HelpFormatter formatter = new HelpFormatter(); 89 | formatter.printHelp(RunQueries.class.getName(), options); 90 | System.exit(-1); 91 | } 92 | 93 | File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); 94 | if (!indexLocation.exists()) { 95 | System.err.println("Error: " + indexLocation + " does not exist!"); 96 | System.exit(-1); 97 | } 98 | 99 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 
100 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; 101 | 102 | String topicsFile = cmdline.getOptionValue(QUERIES_OPTION); 103 | 104 | int numResults = 1000; 105 | try { 106 | if (cmdline.hasOption(NUM_RESULTS_OPTION)) { 107 | numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); 108 | } 109 | } catch (NumberFormatException e) { 110 | System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); 111 | System.exit(-1); 112 | } 113 | 114 | String similarity = "LM"; 115 | if (cmdline.hasOption(SIMILARITY_OPTION)) { 116 | similarity = cmdline.getOptionValue(SIMILARITY_OPTION); 117 | } 118 | 119 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION); 120 | 121 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 122 | 123 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); 124 | IndexSearcher searcher = new IndexSearcher(reader); 125 | 126 | if (similarity.equalsIgnoreCase("BM25")) { 127 | searcher.setSimilarity(new BM25Similarity()); 128 | } else if (similarity.equalsIgnoreCase("LM")) { 129 | searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); 130 | } 131 | 132 | QueryParser p = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, 133 | IndexStatuses.ANALYZER); 134 | 135 | TrecTopicSet topics = TrecTopicSet.fromFile(new File(topicsFile)); 136 | for ( TrecTopic topic : topics ) { 137 | Query query = p.parse(topic.getQuery()); 138 | Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, 139 | topic.getQueryTweetTime(), true, true); 140 | 141 | TopDocs rs = searcher.search(query, filter, numResults); 142 | 143 | int i = 1; 144 | for (ScoreDoc scoreDoc : rs.scoreDocs) { 145 | Document hit = searcher.doc(scoreDoc.doc); 146 | out.println(String.format("%s Q0 %s %d %f %s", topic.getId(), 147 | hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag)); 148 | if ( verbose) { 149 | out.println("# " + 
hit.toString().replaceAll("[\\n\\r]+", " ")); 150 | } 151 | i++; 152 | } 153 | } 154 | reader.close(); 155 | out.close(); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/stream/GatherStatusStream.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package cc.twittertools.stream; 18 | 19 | import org.apache.log4j.ConsoleAppender; 20 | import org.apache.log4j.Level; 21 | import org.apache.log4j.Logger; 22 | import org.apache.log4j.PatternLayout; 23 | import org.apache.log4j.rolling.RollingFileAppender; 24 | import org.apache.log4j.rolling.TimeBasedRollingPolicy; 25 | import org.apache.log4j.varia.LevelRangeFilter; 26 | 27 | import twitter4j.RawStreamListener; 28 | import twitter4j.TwitterException; 29 | import twitter4j.TwitterStream; 30 | import twitter4j.TwitterStreamFactory; 31 | 32 | public final class GatherStatusStream { 33 | private static int cnt = 0; 34 | 35 | @SuppressWarnings("unused") 36 | private static final String MINUTE_ROLL = ".%d{yyyy-MM-dd-HH-mm}.gz"; 37 | private static final String HOUR_ROLL = ".%d{yyyy-MM-dd-HH}.gz"; 38 | 39 | public static void main(String[] args) throws TwitterException { 40 | PatternLayout layoutStandard = new PatternLayout(); 41 | layoutStandard.setConversionPattern("[%p] %d %c %M - %m%n"); 42 | 43 | PatternLayout layoutSimple = new PatternLayout(); 44 | layoutSimple.setConversionPattern("%m%n"); 45 | 46 | // Filter for the statuses: we only want INFO messages 47 | LevelRangeFilter filter = new LevelRangeFilter(); 48 | filter.setLevelMax(Level.INFO); 49 | filter.setLevelMin(Level.INFO); 50 | filter.setAcceptOnMatch(true); 51 | filter.activateOptions(); 52 | 53 | TimeBasedRollingPolicy statusesRollingPolicy = new TimeBasedRollingPolicy(); 54 | statusesRollingPolicy.setFileNamePattern("statuses.log" + HOUR_ROLL); 55 | statusesRollingPolicy.activateOptions(); 56 | 57 | RollingFileAppender statusesAppender = new RollingFileAppender(); 58 | statusesAppender.setRollingPolicy(statusesRollingPolicy); 59 | statusesAppender.addFilter(filter); 60 | statusesAppender.setLayout(layoutSimple); 61 | statusesAppender.activateOptions(); 62 | 63 | TimeBasedRollingPolicy warningsRollingPolicy = new TimeBasedRollingPolicy(); 64 | 
warningsRollingPolicy.setFileNamePattern("warnings.log" + HOUR_ROLL); 65 | warningsRollingPolicy.activateOptions(); 66 | 67 | RollingFileAppender warningsAppender = new RollingFileAppender(); 68 | warningsAppender.setRollingPolicy(statusesRollingPolicy); 69 | warningsAppender.setThreshold(Level.WARN); 70 | warningsAppender.setLayout(layoutStandard); 71 | warningsAppender.activateOptions(); 72 | 73 | ConsoleAppender consoleAppender = new ConsoleAppender(); 74 | consoleAppender.setThreshold(Level.WARN); 75 | consoleAppender.setLayout(layoutStandard); 76 | consoleAppender.activateOptions(); 77 | 78 | // configures the root logger 79 | Logger rootLogger = Logger.getRootLogger(); 80 | rootLogger.setLevel(Level.INFO); 81 | rootLogger.removeAllAppenders(); 82 | rootLogger.addAppender(consoleAppender); 83 | rootLogger.addAppender(statusesAppender); 84 | rootLogger.addAppender(warningsAppender); 85 | 86 | // creates a custom logger and log messages 87 | final Logger logger = Logger.getLogger(GatherStatusStream.class); 88 | 89 | TwitterStream twitterStream = new TwitterStreamFactory().getInstance(); 90 | RawStreamListener rawListener = new RawStreamListener() { 91 | 92 | @Override 93 | public void onMessage(String rawString) { 94 | cnt++; 95 | logger.info(rawString); 96 | if (cnt % 1000 == 0) { 97 | System.out.println(cnt + " messages received."); 98 | } 99 | } 100 | 101 | @Override 102 | public void onException(Exception ex) { 103 | logger.warn(ex); 104 | } 105 | 106 | }; 107 | 108 | twitterStream.addListener(rawListener); 109 | twitterStream.sample(); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/util/ExtractSubcollection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with 
the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.util; 18 | 19 | import it.unimi.dsi.fastutil.longs.LongIterator; 20 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.BufferedWriter; 24 | import java.io.File; 25 | import java.io.FileInputStream; 26 | import java.io.FileOutputStream; 27 | import java.io.InputStreamReader; 28 | import java.io.OutputStreamWriter; 29 | import java.io.Writer; 30 | 31 | import org.apache.commons.cli.CommandLine; 32 | import org.apache.commons.cli.CommandLineParser; 33 | import org.apache.commons.cli.GnuParser; 34 | import org.apache.commons.cli.HelpFormatter; 35 | import org.apache.commons.cli.OptionBuilder; 36 | import org.apache.commons.cli.Options; 37 | import org.apache.commons.cli.ParseException; 38 | import org.apache.log4j.Logger; 39 | 40 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 41 | import cc.twittertools.corpus.data.Status; 42 | import cc.twittertools.corpus.data.StatusStream; 43 | 44 | public class ExtractSubcollection { 45 | private static final Logger LOG = Logger.getLogger(ExtractSubcollection.class); 46 | 47 | private static final String COLLECTION_OPTION = "collection"; 48 | private static final String ID_OPTION = "tweetids"; 49 | private static final String OUTPUT_OPTION = "output"; 50 | private static final String MISSING_OPTION = "missing"; 51 | 52 | @SuppressWarnings("static-access") 53 | public static void main(String[] args) throws Exception { 54 | Options options = new Options(); 55 | 56 
| options.addOption(OptionBuilder.withArgName("dir").hasArg() 57 | .withDescription("source collection directory").create(COLLECTION_OPTION)); 58 | options.addOption(OptionBuilder.withArgName("file").hasArg() 59 | .withDescription("list of tweetids").create(ID_OPTION)); 60 | options.addOption(OptionBuilder.withArgName("file").hasArg() 61 | .withDescription("output JSON").create(OUTPUT_OPTION)); 62 | options.addOption(OptionBuilder.withArgName("file").hasArg() 63 | .withDescription("file to store missing tweeids").create(MISSING_OPTION)); 64 | 65 | CommandLine cmdline = null; 66 | CommandLineParser parser = new GnuParser(); 67 | try { 68 | cmdline = parser.parse(options, args); 69 | } catch (ParseException exp) { 70 | System.err.println("Error parsing command line: " + exp.getMessage()); 71 | System.exit(-1); 72 | } 73 | 74 | if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION) || 75 | !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MISSING_OPTION)) { 76 | HelpFormatter formatter = new HelpFormatter(); 77 | formatter.printHelp(ExtractSubcollection.class.getName(), options); 78 | System.exit(-1); 79 | } 80 | 81 | String outputFile = cmdline.getOptionValue(OUTPUT_OPTION); 82 | String missingFile = cmdline.getOptionValue(MISSING_OPTION); 83 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); 84 | 85 | LongOpenHashSet tweetids = new LongOpenHashSet(); 86 | File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION)); 87 | if (!tweetidsFile.exists()) { 88 | System.err.println("Error: " + tweetidsFile + " does not exist!"); 89 | System.exit(-1); 90 | } 91 | LOG.info("Reading tweetids from " + tweetidsFile); 92 | 93 | FileInputStream fin = new FileInputStream(tweetidsFile); 94 | BufferedReader br = new BufferedReader(new InputStreamReader(fin)); 95 | 96 | String s; 97 | while ((s = br.readLine()) != null) { 98 | tweetids.add(Long.parseLong(s)); 99 | } 100 | br.close(); 101 | fin.close(); 102 | LOG.info("Read " + 
tweetids.size() + " tweetids."); 103 | 104 | File file = new File(collectionPath); 105 | if (!file.exists()) { 106 | System.err.println("Error: " + file + " does not exist!"); 107 | System.exit(-1); 108 | } 109 | 110 | // Store tweet ids we've already seen to dedup. 111 | LongOpenHashSet seen = new LongOpenHashSet(); 112 | 113 | Writer out = new BufferedWriter(new OutputStreamWriter( 114 | new FileOutputStream(outputFile), "UTF-8")); 115 | 116 | StatusStream stream = new JsonStatusCorpusReader(file); 117 | Status status; 118 | while ((status = stream.next()) != null) { 119 | if (tweetids.contains(status.getId()) && !seen.contains(status.getId())) { 120 | out.write(status.getJsonObject().toString() + "\n"); 121 | seen.add(status.getId()); 122 | } 123 | } 124 | stream.close(); 125 | out.close(); 126 | 127 | LOG.info("Extracted " + seen.size() + " tweetids."); 128 | LOG.info("Storing missing tweetids..."); 129 | 130 | out = new BufferedWriter(new OutputStreamWriter( 131 | new FileOutputStream(missingFile), "UTF-8")); 132 | LongIterator iter = tweetids.iterator(); 133 | while (iter.hasNext()) { 134 | long t = iter.nextLong(); 135 | if (!seen.contains(t)) { 136 | out.write(t + "\n"); 137 | } 138 | } 139 | out.close(); 140 | 141 | LOG.info("Done!"); 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/cc/twittertools/util/VerifySubcollection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Twitter Tools 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cc.twittertools.util; 18 | 19 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet; 20 | 21 | import java.io.BufferedReader; 22 | import java.io.File; 23 | import java.io.FileInputStream; 24 | import java.io.InputStreamReader; 25 | import java.io.PrintStream; 26 | import java.util.Map; 27 | import java.util.TreeMap; 28 | 29 | import org.apache.commons.cli.CommandLine; 30 | import org.apache.commons.cli.CommandLineParser; 31 | import org.apache.commons.cli.GnuParser; 32 | import org.apache.commons.cli.HelpFormatter; 33 | import org.apache.commons.cli.OptionBuilder; 34 | import org.apache.commons.cli.Options; 35 | import org.apache.commons.cli.ParseException; 36 | import org.apache.log4j.Logger; 37 | 38 | import cc.twittertools.corpus.data.JsonStatusCorpusReader; 39 | import cc.twittertools.corpus.data.Status; 40 | import cc.twittertools.corpus.data.StatusStream; 41 | 42 | import com.google.common.collect.Maps; 43 | 44 | public class VerifySubcollection { 45 | private static final Logger LOG = Logger.getLogger(VerifySubcollection.class); 46 | 47 | private static final String COLLECTION_OPTION = "collection"; 48 | private static final String ID_OPTION = "tweetids"; 49 | 50 | @SuppressWarnings("static-access") 51 | public static void main(String[] args) throws Exception { 52 | Options options = new Options(); 53 | 54 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 55 | .withDescription("source collection directory").create(COLLECTION_OPTION)); 56 | 
options.addOption(OptionBuilder.withArgName("file").hasArg() 57 | .withDescription("list of tweetids").create(ID_OPTION)); 58 | 59 | CommandLine cmdline = null; 60 | CommandLineParser parser = new GnuParser(); 61 | try { 62 | cmdline = parser.parse(options, args); 63 | } catch (ParseException exp) { 64 | System.err.println("Error parsing command line: " + exp.getMessage()); 65 | System.exit(-1); 66 | } 67 | 68 | if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION)) { 69 | HelpFormatter formatter = new HelpFormatter(); 70 | formatter.printHelp(ExtractSubcollection.class.getName(), options); 71 | System.exit(-1); 72 | } 73 | 74 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); 75 | 76 | LongOpenHashSet tweetids = new LongOpenHashSet(); 77 | File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION)); 78 | if (!tweetidsFile.exists()) { 79 | System.err.println("Error: " + tweetidsFile + " does not exist!"); 80 | System.exit(-1); 81 | } 82 | LOG.info("Reading tweetids from " + tweetidsFile); 83 | 84 | FileInputStream fin = new FileInputStream(tweetidsFile); 85 | BufferedReader br = new BufferedReader(new InputStreamReader(fin)); 86 | 87 | String s; 88 | while ((s = br.readLine()) != null) { 89 | tweetids.add(Long.parseLong(s)); 90 | } 91 | br.close(); 92 | fin.close(); 93 | LOG.info("Read " + tweetids.size() + " tweetids."); 94 | 95 | File file = new File(collectionPath); 96 | if (!file.exists()) { 97 | System.err.println("Error: " + file + " does not exist!"); 98 | System.exit(-1); 99 | } 100 | 101 | LongOpenHashSet seen = new LongOpenHashSet(); 102 | TreeMap tweets = Maps.newTreeMap(); 103 | 104 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 105 | StatusStream stream = new JsonStatusCorpusReader(file); 106 | Status status; 107 | int cnt = 0; 108 | while ((status = stream.next()) != null) { 109 | if (!tweetids.contains(status.getId())) { 110 | LOG.error("tweetid " + status.getId() + " doesn't belong in 
collection"); 111 | continue; 112 | } 113 | if (seen.contains(status.getId())) { 114 | LOG.error("tweetid " + status.getId() + " already seen!"); 115 | continue; 116 | } 117 | 118 | tweets.put(status.getId(), status.getJsonObject().toString()); 119 | seen.add(status.getId()); 120 | cnt++; 121 | } 122 | LOG.info("total of " + cnt + " tweets in subcollection."); 123 | 124 | for ( Map.Entry entry : tweets.entrySet()){ 125 | out.println(entry.getValue()); 126 | } 127 | 128 | stream.close(); 129 | out.close(); 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/java/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/perl/extract_deletes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Scans a directory containing the output of the stream crawler and 4 | # extracts the deletes 5 | 6 | $directory = shift or die "$0 [directory]"; 7 | 8 | for $f ( `ls $directory` ) { 9 | chomp($f); 10 | my $path = "$directory/$f"; 11 | 12 | open(DATA, "gunzip -c $path | grep '{\"delete\"' | "); 13 | while ( my $line = ) { 14 | if ( $line =~ m/{"delete":{"status":{"id":(\d+),/ ) { 15 | print "$1\n"; 16 | } 17 | } 18 | close(DATA); 19 | } 20 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/perl/join_deletes_with_collection.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 
3 | # Joins together deletes and collection tweetids to identify the 4 | # deleted statuses. 5 | 6 | $USAGE = "$0 [deletes (bz2)] [collection (bz2)]"; 7 | 8 | $deletes = shift or die $USAGE; 9 | $collection = shift or die $USAGE; 10 | 11 | open(DATA, "bzcat $deletes | "); 12 | while ( my $line = ) { 13 | chomp($line); 14 | $H{$line} = 1; 15 | } 16 | close(DATA); 17 | 18 | open(DATA, "bzcat $collection | "); 19 | while ( my $line = ) { 20 | if ($line =~ /^(\d+)/ ) { 21 | print $line if exists($H{$1}); 22 | } 23 | } 24 | close(DATA); 25 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/Search/TrecSearch-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift Compiler (0.8.0) 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | # options string: py 8 | # 9 | 10 | import sys 11 | import pprint 12 | from urlparse import urlparse 13 | from thrift.transport import TTransport 14 | from thrift.transport import TSocket 15 | from thrift.transport import THttpClient 16 | from thrift.protocol import TBinaryProtocol 17 | 18 | import TrecSearch 19 | from ttypes import * 20 | 21 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 22 | print '' 23 | print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 24 | print '' 25 | print 'Functions:' 26 | print ' search(TQuery query)' 27 | print '' 28 | sys.exit(0) 29 | 30 | pp = pprint.PrettyPrinter(indent = 2) 31 | host = 'localhost' 32 | port = 9090 33 | uri = '' 34 | framed = False 35 | http = False 36 | argi = 1 37 | 38 | if sys.argv[argi] == '-h': 39 | parts = sys.argv[argi+1].split(':') 40 | host = parts[0] 41 | if len(parts) > 1: 42 | port = int(parts[1]) 43 | argi += 2 44 | 45 | if sys.argv[argi] == '-u': 46 | url = urlparse(sys.argv[argi+1]) 47 | parts = url[1].split(':') 48 | host = parts[0] 
49 | if len(parts) > 1: 50 | port = int(parts[1]) 51 | else: 52 | port = 80 53 | uri = url[2] 54 | if url[4]: 55 | uri += '?%s' % url[4] 56 | http = True 57 | argi += 2 58 | 59 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 60 | framed = True 61 | argi += 1 62 | 63 | cmd = sys.argv[argi] 64 | args = sys.argv[argi+1:] 65 | 66 | if http: 67 | transport = THttpClient.THttpClient(host, port, uri) 68 | else: 69 | socket = TSocket.TSocket(host, port) 70 | if framed: 71 | transport = TTransport.TFramedTransport(socket) 72 | else: 73 | transport = TTransport.TBufferedTransport(socket) 74 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 75 | client = TrecSearch.Client(protocol) 76 | transport.open() 77 | 78 | if cmd == 'search': 79 | if len(args) != 1: 80 | print 'search requires 1 args' 81 | sys.exit(1) 82 | pp.pprint(client.search(eval(args[0]),)) 83 | 84 | else: 85 | print 'Unrecognized method %s' % cmd 86 | sys.exit(1) 87 | 88 | transport.close() 89 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/Search/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'TrecSearch'] 2 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/Search/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.8.0) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/TrecSearchThriftClientCli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | """ 4 
| A demonstration of how to use the python thrift bindings to retrieve tweets from the TREC 2013 API. 5 | 6 | This script requires the python-thrift package, which can installed using 'pip install thrift'. 7 | 8 | To execute this script: 9 | python TrecSearchThriftClientCli.py -host='host' -port=port -group='team_name' -token='access_token' -qid='MB01' -q='BBC World Service staff cuts' -runtag='lucene4lm' -max_id=34952194402811905 10 | 11 | """ 12 | 13 | from Search import TrecSearch, ttypes 14 | 15 | from thrift import Thrift 16 | from thrift.transport import TSocket 17 | from thrift.transport import TTransport 18 | from thrift.protocol import TBinaryProtocol 19 | 20 | import argparse 21 | 22 | try: 23 | # Command line arguments 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument('-host', dest="host", help='server to connect to', required=True) 26 | parser.add_argument('-port',type=int, dest="port", help='port to use', required=True) 27 | parser.add_argument('-group', dest="group", help='group id', required=True) 28 | parser.add_argument('-token', dest="token", help='access token', required=True) 29 | parser.add_argument('-qid', dest="qid", help='query id', required=False, default='MB01') 30 | parser.add_argument('-q', dest="query", help='query text', required=False, default='BBC World Service staff cuts') 31 | parser.add_argument('-runtag', dest="run_tag", help='runtag', required=False, default='lucene4lm') 32 | parser.add_argument('-max_id', dest="max_id", help='maxid', required=False, default=34952194402811905) 33 | parser.add_argument('-num_results', dest="num_results", help='number of results', required=False, default=10) 34 | args = parser.parse_args() 35 | 36 | # Init thrift connection and protocol handlers 37 | transport = TSocket.TSocket(args.host, args.port) 38 | transport = TTransport.TBufferedTransport(transport) 39 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 40 | client = TrecSearch.Client(protocol) 41 | 42 | # Open the 
connection to the server 43 | transport.open() 44 | 45 | # Create a new query 46 | q = ttypes.TQuery() 47 | q.group = args.group 48 | q.token = args.token 49 | q.text = args.query 50 | q.max_id = long(args.max_id) 51 | q.num_results = int(args.num_results) 52 | 53 | # Performs the actual search 54 | results = client.search(q) 55 | 56 | for i, result in enumerate(results, 1): 57 | # TREC_eval formatted line 58 | print "%s Q0 %d %d %f %s" % (args.qid, result.id, i, result.rsv, args.run_tag) 59 | 60 | # Close connection 61 | transport.close() 62 | 63 | except Thrift.TException, tx: 64 | print 'Thrift TException: %s' % (tx.message) 65 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/python/twittertools/stream/gather_status_stream.py: -------------------------------------------------------------------------------- 1 | # Twitter Tools 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from tweepy.streaming import StreamListener 17 | from tweepy import OAuthHandler 18 | from tweepy import Stream 19 | import logging 20 | import logging.handlers 21 | 22 | 23 | consumer_key="" 24 | consumer_secret="" 25 | 26 | access_token="" 27 | access_token_secret="" 28 | 29 | class TweetListener(StreamListener): 30 | 31 | def __init__(self,api=None): 32 | super(TweetListener,self).__init__(api) 33 | self.logger = logging.getLogger('tweetlogger') 34 | 35 | 36 | statusHandler = logging.handlers.TimedRotatingFileHandler('status.log',when='H',encoding='bz2',utc=True) 37 | statusHandler.setLevel(logging.INFO) 38 | self.logger.addHandler(statusHandler) 39 | 40 | 41 | warningHandler = logging.handlers.TimedRotatingFileHandler('warning.log',when='H',encoding='bz2',utc=True) 42 | warningHandler.setLevel(logging.WARN) 43 | self.logger.addHandler(warningHandler) 44 | logging.captureWarnings(True); 45 | 46 | consoleHandler = logging.StreamHandler() 47 | consoleHandler.setLevel(logging.WARN) 48 | self.logger.addHandler(consoleHandler) 49 | 50 | 51 | self.logger.setLevel(logging.INFO) 52 | self.count = 0 53 | 54 | def on_data(self,data): 55 | self.count+=1 56 | self.logger.info(data) 57 | if self.count % 1000 == 0: 58 | print "%d statuses processed" % self.count 59 | return True 60 | 61 | def on_error(self,exception): 62 | self.logger.warn(str(exception)) 63 | 64 | if __name__ == '__main__': 65 | listener = TweetListener() 66 | auth = OAuthHandler(consumer_key,consumer_secret) 67 | auth.set_access_token(access_token,access_token_secret) 68 | 69 | stream = Stream(auth,listener) 70 | while True: 71 | try: 72 | stream.sample() 73 | except Exception as ex: 74 | print str(ex) 75 | pass 76 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | 
log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/twitter-tools-core/src/main/thrift/gen-py/__init__.py -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/twittertools/TrecSearch-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift Compiler (0.8.0) 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | # options string: py 8 | # 9 | 10 | import sys 11 | import pprint 12 | from urlparse import urlparse 13 | from thrift.transport import TTransport 14 | from thrift.transport import TSocket 15 | from thrift.transport import THttpClient 16 | from thrift.protocol import TBinaryProtocol 17 | 18 | import TrecSearch 19 | from ttypes import * 20 | 21 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 22 | print '' 23 | print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 24 | print '' 25 | print 'Functions:' 26 | print ' search(TQuery query)' 27 | print '' 28 | sys.exit(0) 29 | 30 | pp = pprint.PrettyPrinter(indent = 2) 31 | host = 'localhost' 32 | port = 9090 33 | uri = '' 34 | framed = False 35 | http = False 36 | argi = 1 37 | 38 | if sys.argv[argi] == '-h': 39 | parts = sys.argv[argi+1].split(':') 40 | host = parts[0] 41 | if len(parts) > 1: 42 | port = int(parts[1]) 43 | argi += 2 44 | 45 | if 
sys.argv[argi] == '-u': 46 | url = urlparse(sys.argv[argi+1]) 47 | parts = url[1].split(':') 48 | host = parts[0] 49 | if len(parts) > 1: 50 | port = int(parts[1]) 51 | else: 52 | port = 80 53 | uri = url[2] 54 | if url[4]: 55 | uri += '?%s' % url[4] 56 | http = True 57 | argi += 2 58 | 59 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 60 | framed = True 61 | argi += 1 62 | 63 | cmd = sys.argv[argi] 64 | args = sys.argv[argi+1:] 65 | 66 | if http: 67 | transport = THttpClient.THttpClient(host, port, uri) 68 | else: 69 | socket = TSocket.TSocket(host, port) 70 | if framed: 71 | transport = TTransport.TFramedTransport(socket) 72 | else: 73 | transport = TTransport.TBufferedTransport(socket) 74 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 75 | client = TrecSearch.Client(protocol) 76 | transport.open() 77 | 78 | if cmd == 'search': 79 | if len(args) != 1: 80 | print 'search requires 1 args' 81 | sys.exit(1) 82 | pp.pprint(client.search(eval(args[0]),)) 83 | 84 | else: 85 | print 'Unrecognized method %s' % cmd 86 | sys.exit(1) 87 | 88 | transport.close() 89 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/twittertools/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'TrecSearch'] 2 | -------------------------------------------------------------------------------- /twitter-tools-core/src/main/thrift/gen-py/twittertools/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.8.0) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- 
/twitter-tools-core/src/main/thrift/twittertools.thrift: -------------------------------------------------------------------------------- 1 | namespace java cc.twittertools.thrift.gen 2 | 3 | struct TResult { 4 | 1: i64 id, 5 | 2: double rsv, 6 | 3: string screen_name, 7 | 4: i64 epoch, 8 | 5: string text, 9 | 6: i32 followers_count, 10 | 7: i32 statuses_count, 11 | 8: string lang, 12 | 9: i64 in_reply_to_status_id, 13 | 10: i64 in_reply_to_user_id, 14 | 11: i64 retweeted_status_id, 15 | 12: i64 retweeted_user_id, 16 | 13: i32 retweeted_count 17 | } 18 | 19 | struct TQuery { 20 | 1: string group, 21 | 2: string token, 22 | 3: string text, 23 | 4: i64 max_id, 24 | 5: i32 num_results 25 | } 26 | 27 | exception TrecSearchException { 28 | 1: string message 29 | } 30 | 31 | service TrecSearch { 32 | list search(1: TQuery query) 33 | throws (1: TrecSearchException error) 34 | } 35 | -------------------------------------------------------------------------------- /twitter-tools-core/src/test/java/cc/twittertools/download/FetchStatusTest.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.download; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.util.concurrent.Future; 7 | 8 | import junit.framework.JUnit4TestAdapter; 9 | 10 | import org.apache.commons.lang.StringEscapeUtils; 11 | import org.junit.Test; 12 | 13 | import cc.twittertools.corpus.data.Status; 14 | 15 | import com.google.gson.JsonObject; 16 | import com.google.gson.JsonParser; 17 | import com.ning.http.client.AsyncHttpClient; 18 | import com.ning.http.client.Response; 19 | 20 | public class FetchStatusTest { 21 | private static final JsonParser JSON_PARSER = new JsonParser(); 22 | 23 | @Test 24 | public void basicHTML() throws Exception { 25 | String url = AsyncEmbeddedJsonStatusBlockCrawler.getUrl(1121915133L, "jkrums"); 26 | AsyncHttpClient asyncHttpClient = new 
AsyncHttpClient(); 27 | AsyncHttpClient.BoundRequestBuilder request = asyncHttpClient.prepareGet(url); 28 | Future f = request.execute(); 29 | Response response = f.get(); 30 | 31 | // Make sure status is OK. 32 | String html = response.getResponseBody("UTF-8"); 33 | assertTrue(html != null); 34 | } 35 | 36 | // The fetcher is broken, so disabling test. 37 | //@Test 38 | public void basicFamous() throws Exception { 39 | String url = AsyncEmbeddedJsonStatusBlockCrawler.getUrl(1121915133L, "jkrums"); 40 | AsyncHttpClient asyncHttpClient = new AsyncHttpClient(); 41 | AsyncHttpClient.BoundRequestBuilder request = asyncHttpClient.prepareGet(url); 42 | Future f = request.execute(); 43 | Response response = f.get(); 44 | 45 | // Make sure status is OK. 46 | assertEquals(200, response.getStatusCode()); 47 | String html = response.getResponseBody("UTF-8"); 48 | 49 | int jsonStart = html.indexOf(AsyncEmbeddedJsonStatusBlockCrawler.JSON_START); 50 | int jsonEnd = html.indexOf(AsyncEmbeddedJsonStatusBlockCrawler.JSON_END, 51 | jsonStart + AsyncEmbeddedJsonStatusBlockCrawler.JSON_START.length()); 52 | 53 | String json = html.substring(jsonStart + AsyncEmbeddedJsonStatusBlockCrawler.JSON_START.length(), jsonEnd); 54 | json = StringEscapeUtils.unescapeHtml(json); 55 | JsonObject page = (JsonObject) JSON_PARSER.parse(json); 56 | JsonObject statusJson = page.getAsJsonObject("embedData").getAsJsonObject("status"); 57 | 58 | Status status = Status.fromJson(statusJson.toString()); 59 | assertEquals(1121915133L, status.getId()); 60 | assertEquals("jkrums", status.getScreenname()); 61 | assertEquals("http://twitpic.com/135xa - There's a plane in the Hudson. I'm on the ferry going to pick up the people. 
Crazy.", status.getText()); 62 | 63 | asyncHttpClient.close(); 64 | } 65 | 66 | public static junit.framework.Test suite() { 67 | return new JUnit4TestAdapter(FetchStatusTest.class); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /twitter-tools-core/src/test/java/cc/twittertools/index/TokenizationTest.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.index; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.io.IOException; 6 | import java.io.StringReader; 7 | import java.util.List; 8 | 9 | import junit.framework.JUnit4TestAdapter; 10 | 11 | import org.apache.lucene.analysis.Analyzer; 12 | import org.apache.lucene.analysis.TokenStream; 13 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 14 | import org.apache.lucene.util.Version; 15 | import org.junit.Test; 16 | 17 | import cc.twittertools.index.TweetAnalyzer; 18 | 19 | import com.google.common.collect.Lists; 20 | 21 | public class TokenizationTest { 22 | 23 | Object[][] examples = new Object[][] { 24 | {"AT&T getting secret immunity from wiretapping laws for government surveillance http://vrge.co/ZP3Fx5", 25 | new String[] {"att", "get", "secret", "immun", "from", "wiretap", "law", "for", "govern", "surveil", "http://vrge.co/ZP3Fx5"}}, 26 | 27 | {"want to see the @verge aston martin GT4 racer tear up long beach? http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219 …", 28 | new String[] {"want", "to", "see", "the", "@verge", "aston", "martin", "gt4", "racer", "tear", "up", "long", "beach", "http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219"}}, 29 | 30 | {"Incredibly good news! 
#Drupal users rally http://bit.ly/Z8ZoFe to ensure blind accessibility contributor gets to @DrupalCon #Opensource", 31 | new String[] {"incred", "good", "new", "#drupal", "user", "ralli", "http://bit.ly/Z8ZoFe", "to", "ensur", "blind", "access", "contributor", "get", "to", "@drupalcon", "#opensource"}}, 32 | 33 | {"We're entering the quiet hours at #amznhack. #Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 34 | new String[] {"were", "enter", "the", "quiet", "hour", "at", "#amznhack", "#rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz"}}, 35 | 36 | {"The 2013 Social Event Detection Task (SED) at #mediaeval2013, http://bit.ly/16nITsf supported by @linkedtv @project_mmixer @socialsensor_ip", 37 | new String[] {"the", "2013", "social", "event", "detect", "task", "sed", "at", "#mediaeval2013", "http://bit.ly/16nITsf", "support", "by", "@linkedtv", "@project_mmixer", "@socialsensor_ip"}}, 38 | 39 | {"U.S.A. U.K. U.K USA UK #US #UK #U.S.A #U.K ...A.B.C...D..E..F..A.LONG WORD", 40 | new String[] {"usa", "uk", "uk", "usa", "uk", "#us", "#uk", "#u", "sa", "#u", "k", "abc", "d", "e", "f", "a", "long", "word"}}, 41 | 42 | {"this is @a_valid_mention and this_is_multiple_words", 43 | new String[] {"thi", "is", "@a_valid_mention", "and", "thi", "is", "multipl", "word"}}, 44 | 45 | {"PLEASE BE LOWER CASE WHEN YOU COME OUT THE OTHER SIDE - ALSO A @VALID_VALID-INVALID", 46 | new String[] {"pleas", "be", "lower", "case", "when", "you", "come", "out", "the", "other", "side", "also", "a", "@valid_valid", "invalid"}}, 47 | 48 | // Note: the at sign is not the normal (at) sign and the crazy hashtag is not the normal # 49 | {"@reply @with #crazy ~#at", 50 | new String[] {"@reply", "@with", "#crazy", "#at"}}, 51 | 52 | {":@valid testing(valid)#hashtags. 
RT:@meniton (the last @mention is #valid and so is this:@valid), however this is@invalid", 53 | new String[] {"@valid", "test", "valid", "#hashtags", "rt", "@meniton", "the", "last", "@mention", "is", "#valid", "and", "so", "is", "thi", "@valid", "howev", "thi", "is", "invalid"}}, 54 | 55 | {"this][is[lots[(of)words+with-lots=of-strange!characters?$in-fact=it&has&Every&Single:one;ofin_here_B&N_test_test?test\\test^testing`testing{testing}testing…testing¬testing·testing what?", 56 | new String[] {"thi", "is", "lot", "of", "word", "with", "lot", "of", "strang", "charact", "in", "fact", "it", "ha", "everi", "singl", "on", "of", "them", "in", "here", "bn", "test", "test", "test", "test", "test", "test", "test", "test", "test", "test", "test", "what"}}, 57 | }; 58 | 59 | @Test 60 | public void basic() throws Exception { 61 | Analyzer analyzer = new TweetAnalyzer(Version.LUCENE_43); 62 | 63 | for (int i = 0; i < examples.length; i++) { 64 | verify((String[]) examples[i][1], parseKeywords(analyzer, (String) examples[i][0])); 65 | } 66 | } 67 | 68 | public void verify(String[] truth, List tokens) { 69 | assertEquals(truth.length, tokens.size()); 70 | for ( int i=0; i parseKeywords(Analyzer analyzer, String keywords) throws IOException { 76 | List list = Lists.newArrayList(); 77 | 78 | TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords)); 79 | CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class); 80 | tokenStream.reset(); 81 | while (tokenStream.incrementToken()) { 82 | if (cattr.toString().length() == 0) { 83 | continue; 84 | } 85 | list.add(cattr.toString()); 86 | } 87 | tokenStream.end(); 88 | tokenStream.close(); 89 | 90 | return list; 91 | } 92 | 93 | public static junit.framework.Test suite() { 94 | return new JUnit4TestAdapter(TokenizationTest.class); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- 
/twitter-tools-core/src/test/java/cc/twittertools/search/TrecTopicSetTest.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.search; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.io.File; 7 | import java.util.List; 8 | 9 | import junit.framework.JUnit4TestAdapter; 10 | 11 | import org.junit.Test; 12 | 13 | import com.google.common.collect.Lists; 14 | 15 | public class TrecTopicSetTest { 16 | 17 | @Test 18 | public void topics2011() throws Exception { 19 | File f = new File("../data/topics.microblog2011.txt"); 20 | assertTrue(f.exists()); 21 | 22 | TrecTopicSet topics = TrecTopicSet.fromFile(f); 23 | List t = Lists.newArrayList(topics.iterator()); 24 | 25 | assertEquals(50, t.size()); 26 | assertEquals("MB01", t.get(0).getId()); 27 | assertEquals("MB50", t.get(t.size()-1).getId()); 28 | } 29 | 30 | @Test 31 | public void topics2012() throws Exception { 32 | File f = new File("../data/topics.microblog2012.txt"); 33 | assertTrue(f.exists()); 34 | 35 | TrecTopicSet topics = TrecTopicSet.fromFile(f); 36 | List t = Lists.newArrayList(topics.iterator()); 37 | 38 | assertEquals(60, t.size()); 39 | assertEquals("MB51", t.get(0).getId()); 40 | assertEquals("MB110", t.get(t.size()-1).getId()); 41 | } 42 | 43 | public static junit.framework.Test suite() { 44 | return new JUnit4TestAdapter(TrecTopicSetTest.class); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_twitter-tools 3 | formatter_settings_version=12 4 | org.eclipse.jdt.ui.exception.name=e 5 | org.eclipse.jdt.ui.gettersetter.use.is=true 6 | org.eclipse.jdt.ui.keywordthis=false 7 | org.eclipse.jdt.ui.overrideannotation=true 8 | 
-------------------------------------------------------------------------------- /twitter-tools-hadoop/README.md: -------------------------------------------------------------------------------- 1 | # Analyzing Tweets with Pig: Getting Started 2 | 3 | Since tweets are encoded in JSON, and Pig offers poor native JSON support, it's more convenient to use JSON loaders in Twitter's [Elephant Bird](https://github.com/kevinweil/elephant-bird/) library. Easiest just to fetch the relevant jars directly: 4 | 5 | ``` 6 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-core/4.5/elephant-bird-core-4.5.jar 7 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-pig/4.5/elephant-bird-pig-4.5.jar 8 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-hadoop-compat/4.5/elephant-bird-hadoop-compat-4.5.jar 9 | wget http://repo1.maven.org/maven2/com/googlecode/json-simple/json-simple/1.1.1/json-simple-1.1.1.jar 10 | ``` 11 | 12 | You're ready to start analyzing tweets with Pig! 
Here's the obligatory word count example in Pig: 13 | 14 | ``` 15 | register 'elephant-bird-core-4.5.jar'; 16 | register 'elephant-bird-pig-4.5.jar'; 17 | register 'elephant-bird-hadoop-compat-4.5.jar'; 18 | register 'json-simple-1.1.1.jar'; 19 | 20 | raw = load '/path/to/tweets' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); 21 | 22 | a = foreach raw generate (chararray) $0#'text' as text; 23 | b = foreach a generate flatten(TOKENIZE(text)) as word; 24 | c = group b by word; 25 | d = foreach c generate COUNT(b), group; 26 | 27 | store d into 'wordcount'; 28 | ``` 29 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | cc.twittertools.hadoop 6 | twitter-tools-hadoop 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | twitter-tools-hadoop 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | The Apache Software License, Version 2.0 20 | http://www.apache.org/licenses/LICENSE-2.0.txt 21 | repo 22 | 23 | 24 | 25 | 26 | 27 | JeffyRao 28 | Jinfeng Rao 29 | jinfeng@cs.umd.edu 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.codehaus.mojo 37 | appassembler-maven-plugin 38 | 1.3.1 39 | 40 | 41 | 42 | cc.twittertools.hbase.LoadWordCount 43 | LoadWordCount 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | junit 53 | junit 54 | 3.8.1 55 | test 56 | 57 | 58 | cc.twittertools 59 | twitter-tools-core 60 | 1.4.2 61 | 62 | 63 | org.apache.pig 64 | pig 65 | 0.12.1 66 | 67 | 68 | org.apache.hadoop 69 | hadoop-core 70 | 1.2.1 71 | 72 | 73 | org.apache.hbase 74 | hbase 75 | 0.92.1 76 | 77 | 78 | maven-release-plugin 79 | org.apache.maven.plugins 80 | 81 | 82 | 83 | 85 | 86 | commons-io 87 | commons-io 88 | 2.1 89 | 90 | 91 | org.apache.lucene 92 | lucene-core 93 | 4.8.0 94 | 95 | 96 | com.google.guava 97 | guava 98 | 17.0 99 | 100 | 101 | 102 | 
-------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/hadoop/Example.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.hadoop; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.Tokenizer; 8 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 10 | import org.apache.lucene.util.Version; 11 | 12 | import cc.twittertools.index.LowerCaseEntityPreservingFilter; 13 | 14 | public class Example { 15 | 16 | public static void main(String[] args) throws IOException{ 17 | //Test GetInterval Correctness 18 | try{ 19 | String str = "Tue Oct 01 00:07:43 +0000 2011"; 20 | String[] groups = str.split("\\s+"); 21 | String time = groups[3]; 22 | String[] timeGroups= time.split(":"); 23 | int interval = (Integer.valueOf(timeGroups[0]))*12 + (Integer.valueOf(timeGroups[1])/5) + 1; 24 | System.out.println(interval); 25 | }catch(Exception e){ 26 | throw new IOException("caught exception",e); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/hbase/LoadWordCount.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.hbase; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.Iterator; 11 | import java.util.Map; 12 | import java.util.Set; 13 | 14 | import org.apache.hadoop.hbase.client.HTablePool; 15 | import org.apache.hadoop.hbase.util.Bytes; 16 | 17 | import 
com.google.common.collect.HashBasedTable; 18 | import com.google.common.collect.Table; 19 | 20 | public class LoadWordCount { 21 | 22 | public static void main(String[] args) throws IOException { 23 | // TODO Auto-generated method stub 24 | if(args.length!=1){ 25 | System.out.println("invalid argument"); 26 | } 27 | Table wordCountMap = HashBasedTable.create(); 28 | File folder = new File(args[0]); 29 | if(folder.isDirectory()){ 30 | for (File file : folder.listFiles()) { 31 | if(!file.getName().startsWith("part")) 32 | continue; 33 | System.out.println("Processing "+args[0]+file.getName()); 34 | BufferedReader bf = new BufferedReader(new FileReader(args[0]+file.getName())); 35 | // each line in wordcount file is like : 1 twitter 100 36 | String line; 37 | while((line=bf.readLine())!=null){ 38 | String[] groups = line.split("\\t"); 39 | if(groups.length != 4) 40 | continue; 41 | String day = groups[0]; // each day is viewed as a column in underlying HBase 42 | String interval = groups[1]; 43 | String word = groups[2]; 44 | String count = groups[3]; 45 | if(!wordCountMap.contains(word, day)){ 46 | WordCountDAO.WordCount w = new WordCountDAO.WordCount(word, day); 47 | wordCountMap.put(word, day, w); 48 | } 49 | WordCountDAO.WordCount w = wordCountMap.get(word, day); 50 | w.setCount(Integer.valueOf(interval), Integer.valueOf(count)); 51 | wordCountMap.put(word, day, w); 52 | 53 | } 54 | } 55 | } 56 | 57 | System.out.println("Total "+wordCountMap.size()+" words"); 58 | HTablePool pool = new HTablePool(); 59 | WordCountDAO DAO = new WordCountDAO(pool); 60 | DAO.CreateTable(); 61 | int count = 0; 62 | for(WordCountDAO.WordCount w: wordCountMap.values()){ 63 | DAO.addWordCount(w); 64 | if(++count % 50000==0){ 65 | System.out.println("Loading "+count+" words"); 66 | } 67 | } 68 | pool.closeTablePool(DAO.TABLE_NAME); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- 
/twitter-tools-hadoop/src/main/java/cc/twittertools/hbase/WordCountDAO.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.hbase; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Comparator; 6 | import java.util.List; 7 | import java.util.NavigableMap; 8 | import java.util.Set; 9 | import java.util.TreeMap; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.hbase.HBaseConfiguration; 13 | import org.apache.hadoop.hbase.HColumnDescriptor; 14 | import org.apache.hadoop.hbase.HTableDescriptor; 15 | import org.apache.hadoop.hbase.MasterNotRunningException; 16 | import org.apache.hadoop.hbase.ZooKeeperConnectionException; 17 | import org.apache.hadoop.hbase.client.Delete; 18 | import org.apache.hadoop.hbase.client.Get; 19 | import org.apache.hadoop.hbase.client.HBaseAdmin; 20 | import org.apache.hadoop.hbase.client.HTableInterface; 21 | import org.apache.hadoop.hbase.client.HTablePool; 22 | import org.apache.hadoop.hbase.client.Put; 23 | import org.apache.hadoop.hbase.client.Result; 24 | import org.apache.hadoop.hbase.client.Scan; 25 | import org.apache.hadoop.hbase.util.Bytes; 26 | import org.apache.log4j.Logger; 27 | 28 | 29 | public class WordCountDAO { 30 | private final static int DAY = 60*24; 31 | private final static int INTERVAL = 5; 32 | public static int NUM_INTERVALS = DAY/INTERVAL; 33 | public static final byte[] TABLE_NAME = Bytes.toBytes("wordcount"); 34 | public static final byte[] COLUMN_FAMILY = Bytes.toBytes("count"); 35 | 36 | private static final Logger log = Logger.getLogger(WordCountDAO.class); 37 | 38 | private HTablePool pool; 39 | 40 | public WordCountDAO(HTablePool pool) { 41 | this.pool = pool; 42 | } 43 | 44 | public void CreateTable() throws IOException, ZooKeeperConnectionException{ 45 | Configuration conf = HBaseConfiguration.create(); 46 | 47 | HBaseAdmin hbase = new HBaseAdmin(conf); 48 | 
HTableDescriptor[] wordcounts = hbase.listTables("wordcount"); 49 | 50 | if(wordcounts.length != 0){ //Drop Table if Exists 51 | hbase.disableTable(TABLE_NAME); 52 | hbase.deleteTable(TABLE_NAME); 53 | } 54 | 55 | HTableDescriptor wordcount = new HTableDescriptor(TABLE_NAME); 56 | hbase.createTable(wordcount); 57 | // Cannot edit a stucture on an active table. 58 | hbase.disableTable(TABLE_NAME); 59 | HColumnDescriptor columnFamily = new HColumnDescriptor(COLUMN_FAMILY); 60 | hbase.addColumn(TABLE_NAME, columnFamily); 61 | hbase.enableTable(TABLE_NAME); 62 | 63 | hbase.close(); 64 | } 65 | 66 | private static Get mkGet(String word) throws IOException { 67 | log.debug(String.format("Creating Get for %s", word)); 68 | 69 | Get g = new Get(Bytes.toBytes(word)); 70 | g.addFamily(COLUMN_FAMILY); 71 | return g; 72 | } 73 | 74 | private static Put mkPut(WordCount w){ 75 | log.debug(String.format("Creating Put for %s", w.word)); 76 | 77 | Put p = new Put(w.word); 78 | // add integer compression here 79 | // convert 2-d byte array to 1-d byte array 80 | byte[] storage = new byte[NUM_INTERVALS*Integer.SIZE/Byte.SIZE]; 81 | for(int i=0; i< NUM_INTERVALS; i++){ 82 | for(int j=0; j getWordCount(String word) throws IOException { 112 | HTableInterface words = pool.getTable(TABLE_NAME); 113 | Get g = mkGet(word); 114 | Result result = words.get(g); 115 | if (result.isEmpty()) { 116 | log.info(String.format("word %s not found.", word)); 117 | return null; 118 | } 119 | 120 | List wordCounts = WordCount.GetWordCountFromResults(result); 121 | words.close(); 122 | return wordCounts; 123 | } 124 | 125 | public void deleteUser(String word) throws IOException { 126 | HTableInterface words = pool.getTable(TABLE_NAME); 127 | 128 | Delete d = mkDel(word); 129 | words.delete(d); 130 | 131 | words.close(); 132 | } 133 | 134 | public static class WordCount{ 135 | public byte[] word; 136 | public byte[] column_id; 137 | public byte[][] count; 138 | 139 | public WordCount(byte[] word, byte[] 
column_id){ 140 | this.word = word; 141 | this.column_id = column_id; 142 | this.count = new byte[NUM_INTERVALS][]; 143 | for(int i=0; i < NUM_INTERVALS; i++){ 144 | this.count[i] = Bytes.toBytes(0); 145 | } 146 | } 147 | 148 | public WordCount(String word, String column_id){ 149 | this.word = Bytes.toBytes(word); 150 | this.column_id = Bytes.toBytes(column_id); 151 | this.count = new byte[NUM_INTERVALS][]; 152 | for(int i=0; i < NUM_INTERVALS; i++){ 153 | this.count[i] = Bytes.toBytes(0); 154 | } 155 | } 156 | 157 | private WordCount(byte[] word, byte[] column_id, byte[][] count){ 158 | this.word = word; 159 | this.column_id = column_id; 160 | this.count = count; 161 | } 162 | 163 | public static List GetWordCountFromResults(Result r){ 164 | List wordCounts = new ArrayList(); 165 | byte[] word = r.getRow(); 166 | // Map from column qualifiers to values 167 | NavigableMap familyMap = r.getFamilyMap(COLUMN_FAMILY); 168 | for(byte[] column: familyMap.keySet()){ 169 | byte[] value = familyMap.get(column); 170 | // decompression 171 | byte[][] count = new byte[NUM_INTERVALS][Integer.SIZE/Byte.SIZE]; 172 | for(int i=0; i { 11 | private static final String DATE_FORMAT = "EEE MMM d k:m:s ZZZZZ yyyy"; // "Fri Mar 29 11:03:41 +0000 2013"; 12 | private static final SimpleDateFormat DATE_PARSER = new SimpleDateFormat(DATE_FORMAT); 13 | 14 | public Long exec(Tuple input) throws IOException { 15 | if (input == null || input.size() == 0) { 16 | return -1L; 17 | } 18 | 19 | String createdAt = (String) input.get(0); 20 | long epoch; 21 | try { 22 | epoch = DATE_PARSER.parse(createdAt).getTime() / 1000; 23 | } catch (ParseException e) { 24 | epoch = -1L; 25 | } 26 | 27 | return epoch; 28 | } 29 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/GetLatitude.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.piggybank; 2 | 3 | 
import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.pig.EvalFunc; 7 | import org.apache.pig.data.DataBag; 8 | import org.apache.pig.data.Tuple; 9 | 10 | // Sample usage: cc.twittertools.piggybank.GetLatitude($0#'geo'#'coordinates') 11 | public class GetLatitude extends EvalFunc { 12 | public String exec(Tuple input) throws IOException { 13 | DataBag bag = (DataBag) input.get(0); 14 | Iterator it = bag.iterator(); 15 | if (!it.hasNext()) { 16 | return null; 17 | } 18 | Tuple tup = it.next(); 19 | 20 | return (String) tup.get(0); 21 | } 22 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/GetLongitude.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.piggybank; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.pig.EvalFunc; 7 | import org.apache.pig.data.DataBag; 8 | import org.apache.pig.data.Tuple; 9 | 10 | // Sample usage: cc.twittertools.piggybank.GetLongitude($0#'geo'#'coordinates'); 11 | public class GetLongitude extends EvalFunc { 12 | public String exec(Tuple input) throws IOException { 13 | DataBag bag = (DataBag) input.get(0); 14 | Iterator it = bag.iterator(); 15 | if (!it.hasNext()) { 16 | return null; 17 | } 18 | it.next(); 19 | if (!it.hasNext()) { 20 | return null; 21 | } 22 | 23 | Tuple tup = it.next(); 24 | 25 | return (String) tup.get(0); 26 | } 27 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/IsMap.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.piggybank; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import org.apache.pig.FilterFunc; 7 | import org.apache.pig.data.Tuple; 8 | 9 | public class IsMap extends FilterFunc { 10 | 11 
| @Override 12 | public Boolean exec(Tuple input) throws IOException { 13 | if (input == null || input.size() == 0) { 14 | return false; 15 | } 16 | 17 | return (input.get(0) instanceof Map); 18 | } 19 | } -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/udf/GetDate.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.udf; 2 | 3 | import java.io.IOException; 4 | import java.util.regex.Matcher; 5 | import java.util.regex.Pattern; 6 | import org.apache.lucene.analysis.Tokenizer; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.pig.EvalFunc; 9 | import org.apache.pig.data.Tuple; 10 | import cc.twittertools.index.LowerCaseEntityPreservingFilter; 11 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 12 | 13 | public class GetDate extends EvalFunc{ 14 | 15 | public String exec(Tuple input) throws IOException { 16 | if(input == null || input.size() == 0){ 17 | return null; 18 | } 19 | //Standard Time Format: Tue Feb 08 23:59:59 +0000 2011 20 | try{ 21 | String str = (String) input.get(0); 22 | String[] groups = str.split("\\s+"); 23 | String year = groups[5]; 24 | String month = groups[1]; 25 | String day= groups[2]; 26 | return year+" "+month+" "+day; 27 | }catch(Exception e){ 28 | throw new IOException("caught exception",e); 29 | } 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/udf/GetInterval.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.udf; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.pig.EvalFunc; 6 | import org.apache.pig.data.Tuple; 7 | 8 | public class GetInterval extends EvalFunc{ 9 | public String exec(Tuple input) throws IOException { 10 | if(input == null || input.size() == 0){ 11 | return 
null; 12 | } 13 | //Standard Time Format: Tue Feb 08 23:59:59 +0000 2011 14 | try{ 15 | String str = (String) input.get(0); 16 | String[] groups = str.split("\\s+"); 17 | String time = groups[3]; 18 | String[] timeGroups= time.split(":"); 19 | int interval = (Integer.valueOf(timeGroups[0]))*12 + (Integer.valueOf(timeGroups[1])/5); 20 | return String.valueOf(interval); 21 | }catch(Exception e){ 22 | throw new IOException("caught exception",e); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/src/main/java/cc/twittertools/udf/LuceneTokenizer.java: -------------------------------------------------------------------------------- 1 | package cc.twittertools.udf; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.StringTokenizer; 6 | 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 11 | import org.apache.lucene.analysis.en.PorterStemFilter; 12 | import org.apache.lucene.analysis.standard.StandardTokenizer; 13 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 14 | import org.apache.lucene.util.Version; 15 | import org.apache.pig.EvalFunc; 16 | import org.apache.pig.data.BagFactory; 17 | import org.apache.pig.data.DataBag; 18 | import org.apache.pig.data.Tuple; 19 | import org.apache.pig.data.TupleFactory; 20 | 21 | import cc.twittertools.index.LowerCaseEntityPreservingFilter; 22 | 23 | public class LuceneTokenizer extends EvalFunc{ 24 | TupleFactory mTupleFactory = TupleFactory.getInstance(); 25 | BagFactory mBagFactory = BagFactory.getInstance(); 26 | 27 | public DataBag exec(Tuple input) throws IOException{ 28 | try { 29 | DataBag output = mBagFactory.newDefaultBag(); 30 | Object o = input.get(0); 31 | if (!(o instanceof String)) { 32 | throw new IOException("Expected 
input to be chararray, but got " + o.getClass().getName()); 33 | } 34 | Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader((String)o)); 35 | TokenStream tokenstream = new LowerCaseEntityPreservingFilter(source); 36 | tokenstream.reset(); 37 | while (tokenstream.incrementToken()){ 38 | String token = tokenstream.getAttribute(CharTermAttribute.class).toString(); 39 | output.add(mTupleFactory.newTuple(token)); 40 | } 41 | return output; 42 | } catch (Exception e) { 43 | // error handling goes here 44 | throw new IOException("caught exception",e); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /twitter-tools-hadoop/wordcountbytime.pig: -------------------------------------------------------------------------------- 1 | register 'jar/elephant-bird-core-4.5.jar'; 2 | register 'jar/elephant-bird-pig-4.5.jar'; 3 | register 'jar/elephant-bird-hadoop-compat-4.5.jar'; 4 | register 'jar/json-simple-1.1.1.jar'; 5 | register 'jar/twitter-tools-hadoop-1.0-SNAPSHOT.jar'; 6 | register 'jar/twitter-tools-core-1.4.3-SNAPSHOT.jar'; 7 | register 'jar/lucene-core-4.8.0.jar'; 8 | register 'jar/lucene-analyzers-common-4.8.0.jar'; 9 | register 'jar/twitter-text-1.9.0.jar'; 10 | 11 | raw = load '/shared/collections/Tweets2011/20110208-099.json.gz' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); 12 | 13 | a = foreach raw generate $0#'created_at',$0#'text'; 14 | b = foreach a generate cc.twittertools.udf.GetDate($0), cc.twittertools.udf.GetInterval($0), flatten(cc.twittertools.udf.LuceneTokenizer($1)); 15 | c = group b by ($0,$1,$2); 16 | d = foreach c generate flatten(group),COUNT(b); 17 | 18 | store d into 'wordcount'; 19 | -------------------------------------------------------------------------------- /twitter-tools-rm3/README.md: -------------------------------------------------------------------------------- 1 | microblog-demos 2 | =============== 3 | 4 | Examples of using the 
[2013 TREC microblog API](http://twittertools.cc/). Basically clones IndriRunQuery. 5 | 6 | Getting Started 7 | -------------- 8 | 9 | Once you've cloned the repository, build the package with Maven: 10 | 11 | ``` 12 | $ mvn clean package appassembler:assemble 13 | ``` 14 | 15 | Appassembler will automatically generate a launch scripts for: 16 | 17 | + `target/appassembler/bin/RunQueries`: baseline run. with or without RM3 feedback 18 | 19 | To automatically generate project files for Eclipse: 20 | 21 | ``` 22 | $ mvn eclipse:clean 23 | $ mvn eclipse:eclipse 24 | ``` 25 | 26 | You can then use Eclipse's Import "Existing Projects into Workspace" functionality to import the project. 27 | 28 | 29 | Invoking Sample Runs 30 | -------------------- 31 | After building, you can run the sample programs via somthing like this: 32 | 33 | ``` 34 | $ sh ./target/appassembler/bin/RunQueries ./config/params_run.json 35 | ``` 36 | 37 | which will run a simple baseline query likelihood retrieval. All runnable programs are in ./target/appassembler/bin/ . Also, all programs take a single argument: a JSON-formatted file that will look something like this: 38 | ``` 39 | { 40 | "queries" : "./data/topics.microblog2012.txt", 41 | "host" : "", 42 | "port" : 9090, 43 | "num_results" : 1000, 44 | "fb_docs" : 0, 45 | "fb_terms" : 0, 46 | "group" : "", 47 | "token" : "", 48 | "runtag" : "" 49 | } 50 | ``` 51 | 52 | Hopefully these variables are self-explanatory. Setting either `fb_docs` or `fb_terms` to 0 gives a run with no feedback. If both of these 53 | are set >0, pseudo-feedback using RM3 is used. 
54 | 55 | License 56 | ------- 57 | 58 | Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 59 | -------------------------------------------------------------------------------- /twitter-tools-rm3/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | mvn clean package appassembler:assemble 3 | rm target/appassembler/bin/*bat 4 | chmod +x ./target/appassembler/bin/* 5 | -------------------------------------------------------------------------------- /twitter-tools-rm3/config/run_params_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries" : "./data/topics.microblog2011.json", 3 | "stopper" : "./data/stoplist.twitter", 4 | "fb_docs" : 50, 5 | "fb_terms" : 20, 6 | "host" : "", 7 | "port" : 9090, 8 | "num_results" : 1000, 9 | "group" : "", 10 | "token" : "", 11 | "runtag" : "" 12 | } 13 | -------------------------------------------------------------------------------- /twitter-tools-rm3/data/stoplist.twitter: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | new 4 | y 5 | i'm 6 | el 7 | que 8 | tinyurl.com 9 | en 10 | t.co 11 | rt 12 | ow.ly 13 | bit.ly 14 | twitpic 15 | http 16 | html 17 | www 18 | https 19 | com 20 | php 21 | htm 22 | free 23 | cfm 24 | asp 25 | jsp 26 | a 27 | about 28 | above 29 | according 30 | across 31 | after 32 | afterwards 33 | again 34 | against 35 | albeit 36 | all 37 | almost 38 | alone 39 | along 40 | already 41 | also 42 | although 43 | always 44 | am 45 | among 46 | amongst 47 | an 48 | and 49 | another 50 | any 51 | anybody 52 | anyhow 53 | anyone 54 | anything 55 | anyway 56 | anywhere 57 | apart 58 | are 59 | around 60 | as 61 | at 62 | av 63 | be 64 | became 65 | because 66 | become 67 | becomes 68 | becoming 69 | been 70 | before 71 | beforehand 72 | behind 73 | being 74 | below 75 | beside 76 | besides 77 | between 78 | beyond 79 
| both 80 | but 81 | by 82 | can 83 | cannot 84 | canst 85 | certain 86 | cf 87 | choose 88 | contrariwise 89 | cos 90 | could 91 | cu 92 | day 93 | do 94 | does 95 | doesn't 96 | doing 97 | dost 98 | doth 99 | double 100 | down 101 | dual 102 | during 103 | each 104 | either 105 | else 106 | elsewhere 107 | enough 108 | et 109 | etc 110 | even 111 | ever 112 | every 113 | everybody 114 | everyone 115 | everything 116 | everywhere 117 | except 118 | excepted 119 | excepting 120 | exception 121 | exclude 122 | excluding 123 | exclusive 124 | far 125 | farther 126 | farthest 127 | few 128 | ff 129 | first 130 | for 131 | formerly 132 | forth 133 | forward 134 | from 135 | front 136 | further 137 | furthermore 138 | furthest 139 | get 140 | go 141 | had 142 | halves 143 | hardly 144 | has 145 | hast 146 | hath 147 | have 148 | he 149 | hence 150 | henceforth 151 | her 152 | here 153 | hereabouts 154 | hereafter 155 | hereby 156 | herein 157 | hereto 158 | hereupon 159 | hers 160 | herself 161 | him 162 | himself 163 | hindmost 164 | his 165 | hither 166 | hitherto 167 | how 168 | however 169 | howsoever 170 | i 171 | ie 172 | if 173 | in 174 | inasmuch 175 | inc 176 | include 177 | included 178 | including 179 | indeed 180 | indoors 181 | inside 182 | insomuch 183 | instead 184 | into 185 | inward 186 | inwards 187 | is 188 | it 189 | its 190 | itself 191 | just 192 | kind 193 | kg 194 | km 195 | last 196 | latter 197 | latterly 198 | less 199 | lest 200 | let 201 | like 202 | little 203 | ltd 204 | many 205 | may 206 | maybe 207 | me 208 | meantime 209 | meanwhile 210 | might 211 | moreover 212 | most 213 | mostly 214 | more 215 | mr 216 | mrs 217 | ms 218 | much 219 | must 220 | my 221 | myself 222 | namely 223 | need 224 | neither 225 | never 226 | nevertheless 227 | next 228 | no 229 | nobody 230 | none 231 | nonetheless 232 | noone 233 | nope 234 | nor 235 | not 236 | nothing 237 | notwithstanding 238 | now 239 | nowadays 240 | nowhere 241 | of 242 | off 243 | 
often 244 | ok 245 | on 246 | once 247 | one 248 | only 249 | onto 250 | or 251 | other 252 | others 253 | otherwise 254 | ought 255 | our 256 | ours 257 | ourselves 258 | out 259 | outside 260 | over 261 | own 262 | per 263 | perhaps 264 | plenty 265 | provide 266 | quite 267 | rather 268 | really 269 | round 270 | said 271 | sake 272 | same 273 | sang 274 | save 275 | saw 276 | see 277 | seeing 278 | seem 279 | seemed 280 | seeming 281 | seems 282 | seen 283 | seldom 284 | selves 285 | sent 286 | several 287 | shalt 288 | she 289 | should 290 | shown 291 | sideways 292 | since 293 | slept 294 | slew 295 | slung 296 | slunk 297 | smote 298 | so 299 | some 300 | somebody 301 | somehow 302 | someone 303 | something 304 | sometime 305 | sometimes 306 | somewhat 307 | somewhere 308 | spake 309 | spat 310 | spoke 311 | spoken 312 | sprang 313 | sprung 314 | stave 315 | staves 316 | still 317 | such 318 | supposing 319 | than 320 | that 321 | the 322 | thee 323 | their 324 | them 325 | themselves 326 | then 327 | thence 328 | thenceforth 329 | there 330 | thereabout 331 | thereabouts 332 | thereafter 333 | thereby 334 | therefore 335 | therein 336 | thereof 337 | thereon 338 | thereto 339 | thereupon 340 | these 341 | they 342 | this 343 | those 344 | thou 345 | though 346 | thrice 347 | through 348 | throughout 349 | thru 350 | thus 351 | thy 352 | thyself 353 | till 354 | to 355 | together 356 | too 357 | toward 358 | towards 359 | ugh 360 | unable 361 | under 362 | underneath 363 | unless 364 | unlike 365 | until 366 | up 367 | upon 368 | upward 369 | upwards 370 | us 371 | use 372 | used 373 | using 374 | very 375 | via 376 | vs 377 | want 378 | was 379 | we 380 | week 381 | well 382 | were 383 | what 384 | whatever 385 | whatsoever 386 | when 387 | whence 388 | whenever 389 | whensoever 390 | where 391 | whereabouts 392 | whereafter 393 | whereas 394 | whereat 395 | whereby 396 | wherefore 397 | wherefrom 398 | wherein 399 | whereinto 400 | whereof 401 | whereon 
402 | wheresoever 403 | whereto 404 | whereunto 405 | whereupon 406 | wherever 407 | wherewith 408 | whether 409 | whew 410 | which 411 | whichever 412 | whichsoever 413 | while 414 | whilst 415 | whither 416 | who 417 | whoa 418 | whoever 419 | whole 420 | whom 421 | whomever 422 | whomsoever 423 | whose 424 | whosoever 425 | why 426 | will 427 | wilt 428 | with 429 | within 430 | without 431 | worse 432 | worst 433 | would 434 | wow 435 | ye 436 | yet 437 | year 438 | yippee 439 | you 440 | your 441 | yours 442 | yourself 443 | yourselves 444 | -------------------------------------------------------------------------------- /twitter-tools-rm3/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | edu.illinois.lis 4 | twitter-tools-rm3 5 | jar 6 | 0.1-SNAPSHOT 7 | twitter-tools-rm3 8 | demo classes for using the TREC 2013 Microblog API 9 | http://people.lis.illinois.edu/~mefron/ 10 | 11 | 12 | 13 | The Apache Software License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | repo 16 | 17 | 18 | 19 | 20 | scm:git:git@github.com:milesefron/microblog-demos.git 21 | scm:git:git@github.com:milesefron/microblog-demos.git 22 | git@github.com:milesefron/microblog-demos.git 23 | 24 | 25 | 26 | 27 | milesefron 28 | Miles Efron 29 | mefron@illinois.edu 30 | 31 | 32 | 33 | 34 | org.sonatype.oss 35 | oss-parent 36 | 7 37 | 38 | 39 | 40 | 41 | 42 | org.codehaus.mojo 43 | appassembler-maven-plugin 44 | 1.3.1 45 | 46 | 47 | 48 | edu.illinois.lis.search.RunQueries 49 | RunQueries 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | UTF-8 59 | UTF-8 60 | 61 | 62 | 63 | 64 | junit 65 | junit 66 | 4.11 67 | test 68 | 69 | 70 | commons-cli 71 | commons-cli 72 | 1.2 73 | 74 | 75 | commons-io 76 | commons-io 77 | 2.4 78 | 79 | 80 | org.apache.commons 81 | commons-math3 82 | 3.2 83 | 84 | 85 | cc.twittertools 86 | twitter-tools-core 87 | 1.4.2 88 | 89 | 90 | 91 | 
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/feedback/FeedbackModel.java:
--------------------------------------------------------------------------------
package edu.illinois.lis.feedback;

import java.text.DecimalFormat;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import cc.twittertools.thrift.gen.TResult;
import edu.illinois.lis.document.FeatureVector;
import edu.illinois.lis.query.GQuery;
import edu.illinois.lis.utils.KeyValuePair;
import edu.illinois.lis.utils.ScorableComparator;
import edu.illinois.lis.utils.Stopper;

/**
 * Base class for pseudo-relevance feedback models.  Subclasses estimate
 * {@link #features} (term/weight pairs) from the pseudo-relevant hits in
 * {@link #relDocs} inside {@link #build(Stopper)}; this class provides the
 * common export paths (as a GQuery, a FeatureVector, a Map, or text).
 *
 * NOTE(review): the generic type parameters below were reconstructed from
 * usage; the source this was recovered from had angle-bracket content
 * stripped -- confirm against the original repository.
 */
public abstract class FeedbackModel {
	protected List<TResult> relDocs;          // pseudo-relevant hits the model is estimated from
	protected GQuery originalQuery;
	protected int fbDocCount = 20;            // NOTE(review): never read in this class or the visible subclass
	protected int fbTermCount = 20;           // max terms emitted by asGquery()
	protected List<KeyValuePair> features;    // term/weight pairs, populated by build()
	protected Stopper stopper;

	/**
	 * Estimates the feedback model.  The base implementation only records the
	 * stopper; subclasses override this and fill {@link #features}.
	 */
	public void build(Stopper stopper) {
		this.stopper = stopper;
	}

	/**
	 * Exports the top {@link #fbTermCount} feedback terms as a new GQuery
	 * that carries the original query's title and text.
	 */
	public GQuery asGquery() {
		GQuery newQuery = new GQuery();
		newQuery.setTitle(originalQuery.getTitle());
		newQuery.setText(originalQuery.getText());

		FeatureVector finalVector = new FeatureVector(stopper);

		// sort by decreasing score so the strongest terms are kept
		Collections.sort(features, new ScorableComparator(true));

		int i = 0;
		Iterator<KeyValuePair> it = features.iterator();
		while (it.hasNext() && i++ < fbTermCount) {
			KeyValuePair tuple = it.next();
			finalVector.addTerm(tuple.getKey(), tuple.getScore());
		}

		newQuery.setFeatureVector(finalVector);
		return newQuery;
	}

	/** Exports ALL feedback terms (no truncation) as a FeatureVector. */
	public FeatureVector asFeatureVector() {
		FeatureVector f = new FeatureVector(stopper);
		for (KeyValuePair tuple : features) {
			f.addTerm(tuple.getKey(), tuple.getScore());
		}
		return f;
	}

	/** Exports all feedback terms as a term -&gt; score map. */
	public Map<String, Double> asMap() {
		Map<String, Double> map = new HashMap<String, Double>(features.size());
		for (KeyValuePair tuple : features) {
			map.put(tuple.getKey(), tuple.getScore());
		}
		return map;
	}

	@Override
	public String toString() {
		return toString(features.size());
	}

	/**
	 * Pretty-prints the top k terms, one "score term" pair per line, with
	 * scores normalized to sum to one over those k terms.
	 */
	public String toString(int k) {
		DecimalFormat format = new DecimalFormat("#.#####################");

		Collections.sort(features, new ScorableComparator(true));

		// normalizing constant over the top k terms
		double sum = 0.0;
		int i = 0;
		Iterator<KeyValuePair> it = features.iterator();
		while (it.hasNext() && i++ < k) {
			sum += it.next().getScore();
		}
		// BUG FIX: an empty model (or all-zero scores) previously emitted
		// NaN/Infinity from the division below
		if (sum == 0.0) {
			sum = 1.0;
		}

		StringBuilder b = new StringBuilder();
		it = features.iterator();
		i = 0;
		while (it.hasNext() && i++ < k) {
			KeyValuePair tuple = it.next();
			b.append(format.format(tuple.getScore() / sum) + " " + tuple.getKey() + "\n");
		}
		return b.toString();
	}

	/** Sets the pseudo-relevant result list the model is estimated from. */
	public void setRes(List<TResult> relDocs) {
		this.relDocs = relDocs;
	}

	public void setOriginalQuery(GQuery originalQuery) {
		this.originalQuery = originalQuery;
	}

	public void setFbTermCount(int fbTermCount) {
		this.fbTermCount = fbTermCount;
	}
}
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/feedback/FeedbackRelevanceModel.java:
--------------------------------------------------------------------------------
package edu.illinois.lis.feedback;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import cc.twittertools.thrift.gen.TResult;

import edu.illinois.lis.document.FeatureVector;
import edu.illinois.lis.utils.Stopper;
import edu.illinois.lis.utils.KeyValuePair;

/**
 * Relevance-model style feedback: each vocabulary term observed in the
 * pseudo-relevant documents is weighted by its document-level probability,
 * scaled by each document's retrieval score (and an optional per-document
 * weight), averaged over the feedback documents.
 */
public class FeedbackRelevanceModel extends FeedbackModel {
	private boolean stripNumbers = false;   // NOTE(review): declared but never consulted in this class
	private double[] docWeights = null;     // optional per-document weights, parallel to relDocs

	@Override
	public void build(Stopper stopper) {
		this.stopper = stopper;
		try {
			Set<String> vocab = new HashSet<String>();
			List<FeatureVector> fbDocVectors = new LinkedList<FeatureVector>();

			// retrieval status values, one per feedback doc, in iteration order
			double[] rsvs = new double[relDocs.size()];
			int k = 0;
			for (TResult hit : relDocs) {
				rsvs[k++] = hit.getRsv();
			}

			// one term vector per doc; accumulate the combined vocabulary
			for (TResult hit : relDocs) {
				String text = hit.getText().toLowerCase();
				FeatureVector docVector = new FeatureVector(text, stopper);
				vocab.addAll(docVector.getFeatures());
				fbDocVectors.add(docVector);
			}

			features = new LinkedList<KeyValuePair>();

			for (String term : vocab) {
				double fbWeight = 0.0;

				k = 0;
				for (FeatureVector docVector : fbDocVectors) {
					double docWeight = 1.0;
					if (docWeights != null)
						docWeight = docWeights[k];
					// p(term|doc), scaled by the doc's retrieval score and optional weight
					double docProb = docVector.getFeaturetWeight(term) / docVector.getLength();
					docProb *= rsvs[k++] * docWeight;

					fbWeight += docProb;
				}

				fbWeight /= (double) fbDocVectors.size();

				features.add(new KeyValuePair(term, fbWeight));
			}
		} catch (Exception e) {
			// best-effort: a malformed hit leaves the model partially built
			e.printStackTrace();
		}
	}

	/** Optional per-document weights; indexed parallel to the result list. */
	public void setDocWeights(double[] docWeights) {
		this.docWeights = docWeights;
	}
}
-------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQueries.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import java.util.Iterator; 4 | 5 | /** 6 | * A container for holding a bunch of GQuery objects, with various types of convenience functionality added in 7 | * instantiating classes. 8 | * 9 | * @author Miles Efron 10 | * 11 | */ 12 | public interface GQueries { 13 | public void read(String pathToQueries); 14 | 15 | public Iterator iterator(); 16 | 17 | public GQuery getIthQuery(int i); 18 | 19 | public GQuery getNamedQuery(String queryName); 20 | 21 | public int numQueries(); 22 | } 23 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQueriesJsonImpl.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import org.apache.log4j.Logger; 12 | 13 | 14 | import com.google.gson.JsonArray; 15 | import com.google.gson.JsonElement; 16 | import com.google.gson.JsonObject; 17 | import com.google.gson.JsonParser; 18 | 19 | import edu.illinois.lis.document.FeatureVector; 20 | 21 | 22 | /** 23 | * reads and holds GQueries stored as a serialized JSON file on disk. 
24 | * 25 | * @author Miles Efron 26 | * 27 | */ 28 | public class GQueriesJsonImpl implements GQueries { 29 | private static final Logger LOG = Logger.getLogger(GQueriesJsonImpl.class); 30 | 31 | private static final JsonParser JSON_PARSER = new JsonParser(); 32 | private List queryList; 33 | private Map nameToIndex; 34 | 35 | public void read(String pathToQueries) { 36 | JsonObject obj = null; 37 | try { 38 | obj = (JsonObject) JSON_PARSER.parse(new BufferedReader(new FileReader(pathToQueries))); 39 | } catch (Exception e) { 40 | LOG.fatal("died reading queries from json file", e); 41 | System.exit(-1); 42 | } 43 | 44 | 45 | JsonArray queryObjectArray = obj.getAsJsonArray("queries"); 46 | queryList = new ArrayList(queryObjectArray.size()); 47 | nameToIndex = new HashMap(queryList.size()); 48 | Iterator queryObjectIterator = queryObjectArray.iterator(); 49 | int k=0; 50 | while(queryObjectIterator.hasNext()) { 51 | JsonObject queryObject = (JsonObject) queryObjectIterator.next(); 52 | String title = queryObject.get("title").getAsString(); 53 | String text = queryObject.get("text").getAsString(); 54 | double epoch = queryObject.get("epoch").getAsDouble(); 55 | long querytweettime = queryObject.get("querytweettime").getAsLong(); 56 | nameToIndex.put(title, k++); 57 | FeatureVector featureVector = new FeatureVector(null); 58 | JsonArray modelObjectArray = queryObject.getAsJsonArray("model"); 59 | Iterator featureIterator = modelObjectArray.iterator(); 60 | while(featureIterator.hasNext()) { 61 | JsonObject featureObject = (JsonObject)featureIterator.next(); 62 | double weight = featureObject.get("weight").getAsDouble(); 63 | String feature = featureObject.get("feature").getAsString(); 64 | featureVector.addTerm(feature, weight); 65 | } 66 | 67 | 68 | GQuery gQuery = new GQuery(); 69 | gQuery.setTitle(title); 70 | gQuery.setText(text); 71 | gQuery.setEpoch(epoch); 72 | gQuery.setQuerytweettime(querytweettime); 73 | gQuery.setFeatureVector(featureVector); 74 | 75 | 
queryList.add(gQuery); 76 | 77 | } 78 | } 79 | 80 | public GQuery getIthQuery(int i) { 81 | if(queryList == null || i >= queryList.size()) { 82 | LOG.fatal("died trying to get query number " + i + " when we have only " + queryList.size() + " queries."); 83 | System.exit(-1); 84 | } 85 | return queryList.get(i); 86 | } 87 | 88 | public GQuery getNamedQuery(String queryName) { 89 | if(queryList == null || ! nameToIndex.containsKey(queryName)) { 90 | LOG.fatal("died trying to get query " + queryName + "."); 91 | System.exit(-1); } 92 | return queryList.get(nameToIndex.get(queryName)); 93 | } 94 | 95 | 96 | public Iterator iterator() { 97 | return queryList.iterator(); 98 | } 99 | 100 | public int numQueries() { 101 | return queryList.size(); 102 | } 103 | 104 | @Override 105 | public String toString() { 106 | StringBuilder b = new StringBuilder(); 107 | 108 | Iterator it = queryList.iterator(); 109 | while(it.hasNext()) { 110 | b.append(it.next()); 111 | } 112 | 113 | return b.toString(); 114 | } 115 | 116 | 117 | 118 | } 119 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQuery.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import edu.illinois.lis.document.FeatureVector; 8 | 9 | 10 | /** 11 | * a fairly rich representation of a query (or query-like) object. at a minimum, it will typically contain a 12 | * name some text. 
13 | * 14 | * @author Miles Efron 15 | * 16 | */ 17 | public class GQuery { 18 | private String name; 19 | private String text; 20 | private double epoch = -1.0; 21 | private long querytweettime = -1L; 22 | private FeatureVector featureVector; 23 | 24 | 25 | public String getTitle() { 26 | return name; 27 | } 28 | public String getText() { 29 | return text; 30 | } 31 | public void setTitle(String name) { 32 | this.name = name; 33 | } 34 | public void setText(String text) { 35 | this.text = text; 36 | } 37 | public void setEpoch(double epoch) { 38 | this.epoch = epoch; 39 | } 40 | public void setQuerytweettime(long querytweettime) { 41 | this.querytweettime = querytweettime; 42 | } 43 | public double getEpoch() { 44 | return epoch; 45 | } 46 | public long getQuerytweettime() { 47 | return querytweettime; 48 | } 49 | 50 | 51 | public FeatureVector getFeatureVector() { 52 | return featureVector; 53 | } 54 | public void setFeatureVector(FeatureVector featureVector) { 55 | this.featureVector = featureVector; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/TrecTemporalTopic.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import com.google.common.base.Preconditions; 4 | 5 | public class TrecTemporalTopic { 6 | private String query; 7 | private String id; 8 | private long time; 9 | private double epoch; 10 | 11 | public TrecTemporalTopic(String id, String query, long time, double epoch) { 12 | this.id = Preconditions.checkNotNull(id); 13 | this.query = Preconditions.checkNotNull(query); 14 | Preconditions.checkArgument(time > 0); 15 | this.time = time; 16 | Preconditions.checkArgument(epoch > 0); 17 | this.epoch = epoch; 18 | } 19 | 20 | public String getId() { 21 | return id; 22 | } 23 | 24 | public String getQuery() { 25 | return query; 26 | } 27 | 28 | public long 
getQueryTweetTime() { 29 | return time; 30 | } 31 | 32 | public double getEpoch() { 33 | return epoch; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/query/TrecTemporalTopicSet.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.query; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.text.ParseException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.regex.Matcher; 10 | import java.util.regex.Pattern; 11 | 12 | import com.google.common.base.Charsets; 13 | import com.google.common.base.Joiner; 14 | import com.google.common.base.Preconditions; 15 | import com.google.common.collect.Lists; 16 | import com.google.common.io.Files; 17 | 18 | public class TrecTemporalTopicSet implements Iterable{ 19 | private List queries = Lists.newArrayList(); 20 | 21 | private TrecTemporalTopicSet() {} 22 | 23 | private void add(TrecTemporalTopic q) { 24 | queries.add(q); 25 | } 26 | 27 | public Iterator iterator() { 28 | return queries.iterator(); 29 | } 30 | 31 | private static final String DATE_FORMAT = "EEE MMM d k:m:s ZZZZZ yyyy"; //"Fri Mar 29 11:03:41 +0000 2013"; 32 | 33 | private static final Pattern TOP_PATTERN = Pattern.compile("", Pattern.DOTALL); 34 | private static final Pattern NUM_PATTERN = Pattern.compile(" Number: (MB\\d+) ", Pattern.DOTALL); 35 | 36 | // TREC 2011 topics uses tag 37 | private static final Pattern TITLE_PATTERN = Pattern.compile("<title>\\s*(.*?)\\s*", Pattern.DOTALL); 38 | // TREC 2012 topics use tag 39 | private static final Pattern TITLE_PATTERN2 = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL); 40 | 41 | private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL); 42 | 43 | private static final Pattern TWEETTIME_PATTERN = 
Pattern.compile("\\s*(\\d+)\\s*", Pattern.DOTALL); 44 | 45 | 46 | public static TrecTemporalTopicSet fromFile(File f) throws IOException { 47 | Preconditions.checkNotNull(f); 48 | Preconditions.checkArgument(f.exists()); 49 | 50 | String s = Joiner.on("\n").join(Files.readLines(f, Charsets.UTF_8)); 51 | TrecTemporalTopicSet queries = new TrecTemporalTopicSet(); 52 | 53 | Matcher matcher = TOP_PATTERN.matcher(s); 54 | while (matcher.find()) { 55 | String top = matcher.group(0); 56 | 57 | 58 | Matcher m = NUM_PATTERN.matcher(top); 59 | if (!m.find()) { 60 | throw new IOException("Error parsing " + f); 61 | } 62 | String id = m.group(1); 63 | // Topics from 2012 are inconsistently numbered, 64 | // e.g., MB051 should match the qrels, which has MB51 65 | if (id.matches("MB0\\d\\d")) { 66 | id = id.replace("MB0", "MB"); 67 | } 68 | 69 | m = TITLE_PATTERN.matcher(top); 70 | if (!m.find()) { 71 | m = TITLE_PATTERN2.matcher(top); 72 | if (!m.find()) { 73 | throw new IOException("Error parsing " + f); 74 | } 75 | } 76 | String text = m.group(1); 77 | 78 | m = TIMESTAMP_PATTERN.matcher(top); 79 | if (!m.find()) { 80 | throw new IOException("Error parsing " + f); 81 | } 82 | double epoch = -1.0; 83 | try { 84 | epoch = (new SimpleDateFormat(DATE_FORMAT)).parse(m.group(1)).getTime() / 1000; 85 | } catch (ParseException e) { 86 | epoch = -1.0; 87 | } 88 | 89 | m = TWEETTIME_PATTERN.matcher(top); 90 | if (!m.find()) { 91 | throw new IOException("Error parsing " + f); 92 | } 93 | long time = Long.parseLong(m.group(1)); 94 | 95 | 96 | 97 | queries.add(new TrecTemporalTopic(id, text, time, epoch)); 98 | } 99 | return queries; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/rerank/SearchReranker.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.rerank; 2 | 3 | import java.util.Collections; 4 | import 
java.util.List; 5 | 6 | 7 | import cc.twittertools.thrift.gen.TResult; 8 | 9 | 10 | public abstract class SearchReranker { 11 | protected List results; 12 | 13 | protected abstract void score(); 14 | 15 | public List getReranked() { 16 | TResultComparator comparator = new TResultComparator(true); 17 | Collections.sort(results, comparator); 18 | return results; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/rerank/TResultComparator.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.rerank; 2 | 3 | import java.util.Comparator; 4 | 5 | import cc.twittertools.thrift.gen.TResult; 6 | 7 | 8 | public class TResultComparator implements Comparator{ 9 | private boolean decreasing = true; 10 | 11 | public TResultComparator(boolean decreasing) { 12 | this.decreasing = decreasing; 13 | } 14 | public int compare(TResult x, TResult y) { 15 | double xVal = x.getRsv(); 16 | double yVal = y.getRsv(); 17 | 18 | if(decreasing) { 19 | return (xVal > yVal ? -1 : (xVal == yVal ? 0 : 1)); 20 | } else { 21 | return (xVal < yVal ? -1 : (xVal == yVal ? 
0 : 1)); 22 | } 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/search/RunQueries.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.search; 2 | 3 | import java.io.PrintStream; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | 7 | 8 | 9 | 10 | 11 | 12 | import cc.twittertools.search.api.TrecSearchThriftClient; 13 | import cc.twittertools.thrift.gen.TResult; 14 | import edu.illinois.lis.document.FeatureVector; 15 | import edu.illinois.lis.feedback.FeedbackRelevanceModel; 16 | import edu.illinois.lis.query.GQueries; 17 | import edu.illinois.lis.query.GQueriesJsonImpl; 18 | import edu.illinois.lis.query.GQuery; 19 | import edu.illinois.lis.utils.ParameterBroker; 20 | import edu.illinois.lis.utils.Stopper; 21 | 22 | public class RunQueries { 23 | private static final String DEFAULT_RUNTAG = "lucene4lm"; 24 | 25 | private static final String HOST_OPTION = "host"; 26 | private static final String PORT_OPTION = "port"; 27 | private static final String QUERIES_OPTION = "queries"; 28 | private static final String STOPPER_OPTION = "stopper"; 29 | private static final String FB_DOCS_OPTION = "fb_docs"; 30 | private static final String FB_TERMS_OPTION = "fb_terms"; 31 | private static final String NUM_RESULTS_OPTION = "num_results"; 32 | private static final String GROUP_OPTION = "group"; 33 | private static final String TOKEN_OPTION = "token"; 34 | private static final String RUNTAG_OPTION = "runtag"; 35 | 36 | private static final double ORIG_QUERY_WEIGHT = 0.5; 37 | 38 | private RunQueries() {} 39 | 40 | public static void main(String[] args) throws Exception { 41 | ParameterBroker params = new ParameterBroker(args[0]); 42 | 43 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 44 | PrintStream err = new PrintStream(System.err, true, "UTF-8"); 45 | 46 | GQueries queries = new 
GQueriesJsonImpl(); 47 | queries.read(params.getParamValue(QUERIES_OPTION)); 48 | 49 | Stopper stopper = null; 50 | if(params.getParamValue(STOPPER_OPTION) != null) 51 | stopper = new Stopper(params.getParamValue(STOPPER_OPTION)); 52 | 53 | // max number of docs to send to output 54 | int numResults = 1000; 55 | try { 56 | if (params.getParamValue(NUM_RESULTS_OPTION) != null) { 57 | numResults = Integer.parseInt(params.getParamValue(NUM_RESULTS_OPTION)); 58 | } 59 | } catch (NumberFormatException e) { 60 | err.println("Invalid " + NUM_RESULTS_OPTION + ": " + params.getParamValue(NUM_RESULTS_OPTION)); 61 | System.exit(-1); 62 | } 63 | 64 | int fbDocs = 0; 65 | try { 66 | if (params.getParamValue(FB_DOCS_OPTION) != null) { 67 | fbDocs = Integer.parseInt(params.getParamValue(FB_DOCS_OPTION)); 68 | } 69 | } catch (NumberFormatException e) { 70 | err.println("Invalid " + FB_DOCS_OPTION + ": " + params.getParamValue(FB_DOCS_OPTION)); 71 | System.exit(-1); 72 | } 73 | 74 | int fbTerms = 0; 75 | try { 76 | if (params.getParamValue(FB_TERMS_OPTION) != null) { 77 | fbTerms = Integer.parseInt(params.getParamValue(FB_TERMS_OPTION)); 78 | } 79 | } catch (NumberFormatException e) { 80 | err.println("Invalid " + FB_TERMS_OPTION + ": " + params.getParamValue(FB_TERMS_OPTION)); 81 | System.exit(-1); 82 | } 83 | 84 | // authentication credentials 85 | String group = params.getParamValue(GROUP_OPTION); 86 | if(group==null) { 87 | err.println("Invalid " + GROUP_OPTION + ": must set a valid group ID"); 88 | System.exit(-1); 89 | } 90 | String token = params.getParamValue(TOKEN_OPTION); 91 | if(group==null) { 92 | err.println("Invalid " + TOKEN_OPTION + ": must set a valid authentication token"); 93 | System.exit(-1); 94 | } 95 | 96 | TrecSearchThriftClient client = new TrecSearchThriftClient(params.getParamValue(HOST_OPTION), 97 | Integer.parseInt(params.getParamValue(PORT_OPTION)), group, token); 98 | 99 | Iterator queryIterator = queries.iterator(); 100 | 
while(queryIterator.hasNext()) { 101 | GQuery query = queryIterator.next(); 102 | System.err.println(query.getTitle()); 103 | String queryText = query.getText(); 104 | 105 | // stupid hack. need to lowercase the query vector 106 | FeatureVector temp = new FeatureVector(null); 107 | Iterator qTerms = query.getFeatureVector().iterator(); 108 | while(qTerms.hasNext()) { 109 | String term = qTerms.next(); 110 | temp.addTerm(term.toLowerCase(), query.getFeatureVector().getFeaturetWeight(term)); 111 | } 112 | temp.normalizeToOne(); 113 | query.setFeatureVector(temp); 114 | 115 | 116 | // if we're doing feedback 117 | if(fbDocs > 0 && fbTerms > 0) { 118 | List results = client.search(queryText, query.getQuerytweettime(), fbDocs); 119 | FeedbackRelevanceModel fb = new FeedbackRelevanceModel(); 120 | fb.setOriginalQuery(query); 121 | fb.setRes(results); 122 | fb.build(stopper); 123 | 124 | FeatureVector fbVector = fb.asFeatureVector(); 125 | fbVector.pruneToSize(fbTerms); 126 | fbVector.normalizeToOne(); 127 | fbVector = FeatureVector.interpolate(query.getFeatureVector(), fbVector, ORIG_QUERY_WEIGHT); 128 | 129 | System.err.println(fbVector); 130 | 131 | StringBuilder builder = new StringBuilder(); 132 | Iterator terms = fbVector.iterator(); 133 | while(terms.hasNext()) { 134 | String term = terms.next(); 135 | if(term.length() < 2) 136 | continue; 137 | double prob = fbVector.getFeaturetWeight(term); 138 | builder.append(term + "^" + prob + " "); 139 | } 140 | queryText = builder.toString().trim(); 141 | 142 | } 143 | 144 | List results = client.search(queryText, query.getQuerytweettime(), numResults); 145 | String runTag = params.getParamValue(RUNTAG_OPTION); 146 | if(runTag==null) 147 | runTag = DEFAULT_RUNTAG; 148 | 149 | int i = 1; 150 | Iterator hitIterator = results.iterator(); 151 | while(hitIterator.hasNext()) { 152 | TResult hit = hitIterator.next(); 153 | out.println(String.format("%s Q0 %s %d %f %s", query.getTitle(), hit.getId(), i, 154 | hit.getRsv(), 
runTag)); 155 | 156 | if(i++ >= numResults) 157 | break; 158 | } 159 | 160 | } 161 | out.close(); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/searchsource/IndexWrapperMicroblogApi.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.searchsource; 2 | 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | 9 | 10 | import cc.twittertools.search.api.TrecSearchThriftClient; 11 | import cc.twittertools.thrift.gen.TResult; 12 | import edu.illinois.lis.document.FeatureVector; 13 | 14 | 15 | 16 | public class IndexWrapperMicroblogApi { 17 | // API-specific variables 18 | private String hostname; 19 | private int port; 20 | private String groupId; 21 | private String authToken; 22 | 23 | private Map seenDocs; // we store the text of any docs we've harvested. e.g. for FB. 
24 | 25 | private TrecSearchThriftClient client; 26 | 27 | 28 | public IndexWrapperMicroblogApi(String hostname, int port, String groupId, String authToken) { 29 | this.hostname = hostname; 30 | this.port = port; 31 | this.groupId = groupId; 32 | this.authToken = authToken; 33 | 34 | seenDocs = new HashMap(); 35 | 36 | try { 37 | client = new TrecSearchThriftClient(hostname, port, groupId, authToken); 38 | } catch (Exception e) { 39 | 40 | } 41 | } 42 | 43 | public double docCount() { 44 | return 0; 45 | } 46 | 47 | 48 | public double docFreq(String arg0) { 49 | return 0; 50 | } 51 | 52 | public double termFreq(String arg0) { 53 | return 0; 54 | } 55 | 56 | public double termTokenCount() { 57 | return 0; 58 | } 59 | 60 | public double termTypeCount() { 61 | return 0; 62 | } 63 | 64 | public Object getActualIndex() { 65 | return null; 66 | } 67 | 68 | public FeatureVector getDocVector(String docId) { 69 | if(seenDocs.containsKey(docId)) 70 | return new FeatureVector(seenDocs.get(docId), null); 71 | 72 | // we should also be able to ping the API to get docs we haven't already seen 73 | return null; 74 | } 75 | 76 | public List runQuery(String query, long upperBoundTime, int count) { 77 | List results = null; 78 | try { 79 | results = client.search(query,upperBoundTime, count); 80 | 81 | // store our text for future reference 82 | Iterator resultIterator = results.iterator(); 83 | while(resultIterator.hasNext()) { 84 | TResult result = resultIterator.next(); 85 | seenDocs.put(Long.toString(result.getId()), result.getText()); 86 | } 87 | } catch (Exception e) { 88 | 89 | } 90 | return results; 91 | } 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | } 100 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ExtractGqueriesFromTrecFormat.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | 4 | 5 | 6 | import 
java.io.File; 7 | 8 | import com.google.gson.Gson; 9 | import com.google.gson.GsonBuilder; 10 | import com.google.gson.JsonArray; 11 | import com.google.gson.JsonObject; 12 | 13 | import edu.illinois.lis.query.TrecTemporalTopicSet; 14 | 15 | 16 | /** 17 | * creates a simple set of gQueries from the official TREC MB topic file 18 | * 19 | * @author Miles Efron 20 | * 21 | */ 22 | public class ExtractGqueriesFromTrecFormat { 23 | 24 | private JsonObject outputObjects = null; 25 | private String pathToTrecTopics; 26 | 27 | public ExtractGqueriesFromTrecFormat(String pathToTrecTopics) { 28 | this.pathToTrecTopics = pathToTrecTopics; 29 | outputObjects = new JsonObject(); 30 | } 31 | 32 | public void harvest() { 33 | TrecTemporalTopicSet topicsFile = null; 34 | try { 35 | topicsFile = TrecTemporalTopicSet.fromFile(new File(pathToTrecTopics)); 36 | } catch (Exception e) { 37 | e.printStackTrace(); 38 | } 39 | 40 | JsonArray outputJsonArray = new JsonArray(); 41 | for(edu.illinois.lis.query.TrecTemporalTopic query : topicsFile) { 42 | 43 | 44 | JsonObject outputQueryObject = new JsonObject(); 45 | outputQueryObject.addProperty("title", query.getId()); 46 | outputQueryObject.addProperty("text", query.getQuery()); 47 | outputQueryObject.addProperty("epoch", Double.toString(query.getEpoch())); 48 | outputQueryObject.addProperty("querytweettime", Long.toString(query.getQueryTweetTime())); 49 | 50 | String text = query.getQuery(); 51 | String[] toks = text.split(" "); 52 | 53 | JsonArray modelArray = new JsonArray(); 54 | for(String tok : toks) { 55 | JsonObject tupleObject = new JsonObject(); 56 | tupleObject.addProperty("weight", 1.0); 57 | tupleObject.addProperty("feature", tok); 58 | modelArray.add(tupleObject); 59 | } 60 | outputQueryObject.add("model", modelArray); 61 | 62 | 63 | outputJsonArray.add(outputQueryObject); 64 | } 65 | outputObjects.add("queries", outputJsonArray); 66 | } 67 | 68 | 69 | public String toString() { 70 | Gson gson = new 
GsonBuilder().setPrettyPrinting().create(); 71 | String json = gson.toJson(outputObjects); 72 | return json; 73 | } 74 | 75 | 76 | 77 | 78 | public static void main(String[] args) throws Exception { 79 | String trecQueryPath = args[0]; 80 | 81 | ExtractGqueriesFromTrecFormat harvester = new ExtractGqueriesFromTrecFormat(trecQueryPath); 82 | harvester.harvest(); 83 | 84 | System.out.println(harvester); 85 | } 86 | 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/KeyValuePair.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | public class KeyValuePair implements Scorable { 4 | private String key; 5 | private double value; 6 | 7 | public KeyValuePair(String key, double value) { 8 | this.key = key; 9 | this.value = value; 10 | } 11 | 12 | public String getKey() { 13 | return key; 14 | } 15 | 16 | @Override 17 | public String toString() { 18 | StringBuilder b = new StringBuilder(value + "\t" + key); 19 | return b.toString(); 20 | } 21 | 22 | public void setScore(double score) { 23 | this.value = score; 24 | } 25 | 26 | public double getScore() { 27 | return value; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ListUtils.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.util.Iterator; 4 | import java.util.List; 5 | 6 | public class ListUtils { 7 | 8 | public static double[] listToArray(List x) { 9 | double[] a = new double[x.size()]; 10 | Iterator it = x.iterator(); 11 | int i=0; 12 | while(it.hasNext()) { 13 | a[i++] = it.next(); 14 | } 15 | return a; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- 
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/LuceneQuery.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.util.Iterator; 4 | 5 | import edu.illinois.lis.document.FeatureVector; 6 | import edu.illinois.lis.query.GQuery; 7 | 8 | public class LuceneQuery { 9 | public static String gQueryToLucene(GQuery gQuery, int k) { 10 | FeatureVector mainVector = new FeatureVector(gQuery.getText(), null); 11 | mainVector.normalizeToOne(); 12 | FeatureVector fbVector = gQuery.getFeatureVector(); 13 | fbVector.pruneToSize(k); 14 | fbVector.normalizeToOne(); 15 | FeatureVector finalVector = FeatureVector.interpolate(mainVector, fbVector, 0.5); 16 | StringBuilder b = new StringBuilder(); 17 | Iterator terms = finalVector.iterator(); 18 | while(terms.hasNext()) { 19 | String term = terms.next(); 20 | double weight = finalVector.getFeaturetWeight(term); 21 | b.append(term + "^" + weight + " "); 22 | } 23 | return b.toString().trim(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ParameterBroker.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.util.HashMap; 6 | import java.util.Iterator; 7 | import java.util.Map; 8 | import java.util.Map.Entry; 9 | import java.util.Set; 10 | 11 | 12 | import com.google.gson.JsonElement; 13 | import com.google.gson.JsonObject; 14 | import com.google.gson.JsonParser; 15 | 16 | /** 17 | * N.B. All params are stored as strings. It is the responsibility of calling classes to transform into 18 | * appropriate data types. 19 | * e.g. 
mu = Double.parseDouble(paramBroker.getParamValue("mu") 20 | * 21 | * @author Miles Efron 22 | * 23 | */ 24 | public class ParameterBroker { 25 | 26 | private static final JsonParser JSON_PARSER = new JsonParser(); 27 | private Map params; 28 | 29 | 30 | 31 | /** 32 | * constructor where we initialize from a json file of structure: 33 | * { 34 | * "param1":"value1", 35 | * "param2":"value2" 36 | * } 37 | * 38 | * @param pathToJson 39 | */ 40 | public ParameterBroker(String pathToJson) { 41 | params = new HashMap(); 42 | JsonObject json = null; 43 | try { 44 | json = (JsonObject) JSON_PARSER.parse(new BufferedReader(new FileReader(pathToJson))); 45 | } catch (Exception e) { 46 | System.err.println("died trying to parse json file: " + pathToJson); 47 | System.exit(-1); 48 | } 49 | 50 | Set> jsonEntries = json.entrySet(); 51 | Iterator> entryIterator = jsonEntries.iterator(); 52 | while(entryIterator.hasNext()) { 53 | Entry entry = entryIterator.next(); 54 | params.put(entry.getKey(), entry.getValue().getAsString()); 55 | System.setProperty(entry.getKey(), entry.getValue().getAsString()); 56 | } 57 | } 58 | 59 | 60 | public String getParamValue(String paramName) { 61 | if(!params.containsKey(paramName)) 62 | return null; 63 | return params.get(paramName); 64 | } 65 | 66 | public void setParam(String name, String value) { 67 | params.put(name, value); 68 | } 69 | 70 | 71 | } 72 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Qrels.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.io.File; 4 | import java.io.FileReader; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Set; 11 | import java.util.regex.Pattern; 12 | 13 | import org.apache.commons.io.IOUtils; 14 | 15 | public 
class Qrels { 16 | 17 | public static final Pattern SPACE_PATTERN = Pattern.compile(" ", Pattern.DOTALL); 18 | 19 | private static final int QUERY_COLUMN = 0; 20 | private static final int DOCNO_COLUMN = 2; 21 | private static final int REL_COLUMN = 3; 22 | 23 | private Map> rel; 24 | private int minRel = 1; 25 | 26 | public Qrels(String pathToQrelsFile) { 27 | try { 28 | 29 | rel = new HashMap>(); 30 | 31 | List lines = IOUtils.readLines(new FileReader(new File(pathToQrelsFile))); 32 | Iterator linesIt = lines.iterator(); 33 | while(linesIt.hasNext()) { 34 | String[] toks = SPACE_PATTERN.split(linesIt.next()); 35 | if(toks==null || toks.length != 4) { 36 | System.err.println("bad qrels line"); 37 | continue; 38 | } 39 | String query = toks[QUERY_COLUMN]; 40 | String docno = toks[DOCNO_COLUMN]; 41 | int r = Integer.parseInt(toks[REL_COLUMN]); 42 | if(r >= minRel) { 43 | Set relDocs = null; 44 | if(!rel.containsKey(query)) { 45 | relDocs = new HashSet(); 46 | } else { 47 | relDocs = rel.get(query); 48 | } 49 | relDocs.add(docno); 50 | rel.put(query, relDocs); 51 | } else { 52 | } 53 | } 54 | } catch (Exception e) { 55 | System.err.println("died trying to read qrel file: " + pathToQrelsFile); 56 | System.exit(-1); 57 | } 58 | } 59 | 60 | public boolean isRel(String query, String docno) { 61 | if(!rel.containsKey(query)) { 62 | System.err.println("no relevant documents found for query " + query); 63 | return false; 64 | } 65 | return rel.get(query).contains(docno); 66 | } 67 | 68 | public Set getRelDocs(String query) { 69 | if(!rel.containsKey(query)) { 70 | System.err.println("no relevant documents found for query " + query); 71 | return null; 72 | } 73 | return rel.get(query); 74 | } 75 | 76 | public double numRel(String query) { 77 | if(!rel.containsKey(query)) { 78 | System.err.println("no relevant documents found for query " + query); 79 | return 0.0; 80 | } 81 | return (double)rel.get(query).size(); 82 | } 83 | } 84 | 
-------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Scorable.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | public interface Scorable { 4 | 5 | public void setScore(double score); 6 | 7 | public double getScore(); 8 | } 9 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ScorableComparator.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.util.Comparator; 4 | 5 | 6 | public class ScorableComparator implements Comparator{ 7 | private boolean decreasing = true; 8 | 9 | public ScorableComparator(boolean decreasing) { 10 | this.decreasing = decreasing; 11 | } 12 | public int compare(Scorable x, Scorable y) { 13 | double xVal = x.getScore(); 14 | double yVal = y.getScore(); 15 | 16 | if(decreasing) { 17 | return (xVal > yVal ? -1 : (xVal == yVal ? 0 : 1)); 18 | } else { 19 | return (xVal < yVal ? -1 : (xVal == yVal ? 
0 : 1)); 20 | } 21 | 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Stopper.java: -------------------------------------------------------------------------------- 1 | package edu.illinois.lis.utils; 2 | 3 | import java.io.FileInputStream; 4 | import java.util.HashSet; 5 | import java.util.Iterator; 6 | import java.util.List; 7 | import java.util.Set; 8 | import java.util.regex.Pattern; 9 | 10 | import org.apache.commons.io.IOUtils; 11 | 12 | public class Stopper { 13 | public static final Pattern SPACE_PATTERN = Pattern.compile(" ", Pattern.DOTALL); 14 | private Set stopwords; 15 | 16 | 17 | public Stopper() { 18 | stopwords = new HashSet(); 19 | } 20 | 21 | public Stopper(String pathToStoplist) { 22 | try { 23 | stopwords = new HashSet(); 24 | 25 | // assume our stoplist has one stopword per line 26 | List lines = IOUtils.readLines(new FileInputStream(pathToStoplist)); 27 | Iterator it = lines.iterator(); 28 | while(it.hasNext()) { 29 | stopwords.add(it.next()); 30 | } 31 | } catch (Exception e) { 32 | e.printStackTrace(); 33 | } 34 | } 35 | 36 | public String apply(String text) { 37 | StringBuilder b = new StringBuilder(); 38 | String[] toks = SPACE_PATTERN.split(text); 39 | for(String tok : toks) { 40 | if(! isStopWord(tok)) 41 | b.append(tok + " "); 42 | } 43 | return b.toString().trim(); 44 | } 45 | public void addStopword(String term) { 46 | stopwords.add(term); 47 | } 48 | public boolean isStopWord(String term) { 49 | return (stopwords.contains(term)) ? 
true : false; 50 | } 51 | 52 | public Set asSet() { 53 | return stopwords; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /twitter-tools-rm3/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/README.md: -------------------------------------------------------------------------------- 1 | microblogTTGBaseline 2 | ==================== 3 | 4 | A baseline run using an (empirically determined) Jaccard similarity score to cluster tweets. 5 | 6 | 1. Build with `mvn package` 7 | 2. Set your `host`, `group`, and `package` parameters in `config/run_params.json`. Change any other parameters you want. 8 | 3. Run with `java -cp target/microblogTTGBaseline-0.0.1-SNAPSHOT-jar-with-dependencies.jar edu.gslis.ttg.main.RunTTGBaseline` 9 | 10 | Note: Weighted scoring does not work properly, yet. 
11 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/config/run_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries" : "./topics/topics.microblog-2013.json", 3 | "host" : HOST_NAME_HERE, 4 | "training_port" : 9090, 5 | "testing_port" : 9091, 6 | "num_results" : 1000, 7 | "group" : YOUR_GROUP_HERE, 8 | "token" : YOUR_TOKEN_HERE, 9 | "runtag" : "baseline", 10 | "jaccard_step" : 0.1, 11 | "training_queries" : "./topics/topics.ttg-training.json", 12 | "training_clusters" : 13 | "../data/clusters.training.microblog2011-2012.json", 14 | "qrels" : 15 | "../data/qrels.microblog2011-2012.txt", 16 | "evaluation_type" : "unweighted" 17 | } 18 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | edu.gslis 4 | microblogTTGBaseline 5 | 0.0.1-SNAPSHOT 6 | 7 | microblog TTG baseline 8 | http://maven.apache.org 9 | 10 | 11 | UTF-8 12 | 13 | 14 | 15 | nema-dev.lis.illinois.edu 16 | nema-dev.lis.illinois.edu-releases 17 | http://nema-dev.lis.illinois.edu/artifactory//ir-libs 18 | 19 | 20 | nema-dev.lis.illinois.edu 21 | nema-dev.lis.illinois.edu-snapshots 22 | http://nema-dev.lis.illinois.edu/artifactory//ir-libs 23 | 24 | 25 | 26 | 27 | ir-libs 28 | ir-libs 29 | http://nema-dev.lis.illinois.edu/artifactory/ir-libs/ 30 | 31 | true 32 | never 33 | 34 | 35 | true 36 | never 37 | 38 | 39 | 40 | 41 | src 42 | 43 | 44 | maven-compiler-plugin 45 | 3.1 46 | 47 | 1.6 48 | 1.6 49 | 50 | 51 | 52 | maven-assembly-plugin 53 | 54 | 55 | jar-with-dependencies 56 | 57 | 58 | 59 | 60 | simple-command 61 | package 62 | 63 | attached 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | indri 73 | indri 74 | 0.1 75 | 76 | 77 | edu.gslis 78 | ir-utils 79 | 0.0.1-SNAPSHOT 80 | 81 | 82 | cc.twittertools 83 | twitter-tools-core 84 | 
1.4.1 85 | 86 | 87 | cc.twittertools 88 | twitter-tools 89 | 1.3.0 90 | 91 | 92 | com.googlecode.json-simple 93 | json-simple 94 | 1.1 95 | 96 | 97 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/Cluster.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.clusters; 2 | 3 | import java.util.Arrays; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import edu.gslis.eval.Qrels; 8 | import edu.gslis.queries.GQuery; 9 | 10 | public class Cluster { 11 | private Set members; 12 | 13 | public Cluster() { 14 | members = new HashSet(); 15 | } 16 | 17 | public Cluster(long member) { 18 | members = new HashSet(); 19 | members.add(member); 20 | } 21 | 22 | public void add(long member) { 23 | members.add(member); 24 | } 25 | 26 | public void add(Set newMembers) { 27 | members.addAll(newMembers); 28 | } 29 | 30 | public Set getMembers() { 31 | return members; 32 | } 33 | 34 | public long getFirstMember() { 35 | return members.iterator().next(); 36 | } 37 | 38 | public boolean hasMember(long member) { 39 | return members.contains(member); 40 | } 41 | 42 | public int getWeight(GQuery query, Qrels qrels) { 43 | // hack to change e.g. 
MB01 to 01 44 | String q = String.valueOf(Integer.parseInt(query.getTitle().substring(2, query.getTitle().length()))); 45 | 46 | int weight = 0; 47 | for (long member : members) { 48 | if (qrels.isRel(q, String.valueOf(member))) { 49 | int level = qrels.getRelLevel(q, String.valueOf(member)); 50 | weight += level; 51 | } 52 | } 53 | return weight; 54 | } 55 | 56 | @Override 57 | public String toString() { 58 | return Arrays.deepToString(members.toArray()); 59 | } 60 | 61 | public int size() { 62 | return members.size(); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/Clusters.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.clusters; 2 | 3 | import java.util.HashMap; 4 | import java.util.HashSet; 5 | import java.util.Iterator; 6 | import java.util.Map; 7 | import java.util.Set; 8 | 9 | public class Clusters implements Iterable { 10 | private Set clusters; 11 | private Map clusterMemberLookup; 12 | 13 | public Clusters() { 14 | clusters = new HashSet(); 15 | clusterMemberLookup = new HashMap(); 16 | } 17 | 18 | public void add(Cluster cluster) { 19 | clusters.add(cluster); 20 | for (long member : cluster.getMembers()) { 21 | clusterMemberLookup.put(member, cluster); 22 | } 23 | } 24 | 25 | public Set getClusters() { 26 | return clusters; 27 | } 28 | 29 | public boolean hasCluster(Cluster cluster) { 30 | return clusters.contains(cluster); 31 | } 32 | 33 | public Cluster findCluster(long member) { 34 | try { 35 | return clusterMemberLookup.get(member); 36 | } catch (NullPointerException e) { 37 | return null; 38 | } 39 | } 40 | 41 | public Set getAllClusteredResults() { 42 | return clusterMemberLookup.keySet(); 43 | } 44 | 45 | // Merge cluster 2 into cluster 1 and update the clusterMemberLookup 46 | // Note: only call this function if cluster 1 is already in the clusters set 47 | // (cluster 2 
can be new or existing) 48 | public void mergeExistingClusters(Cluster c1, Cluster c2) { 49 | c1.add(c2.getMembers()); 50 | clusters.remove(c1); 51 | try { 52 | clusters.remove(c2); 53 | } catch (Exception e) { 54 | System.err.println("Unable to remove cluster 2 from clusters. Might be a new cluster."); 55 | } 56 | clusters.add(c1); 57 | 58 | updateClusterMembership(c1); 59 | } 60 | 61 | // Merge two new clusters into the clusters set 62 | public void mergeNewClusters(Cluster c1, Cluster c2) { 63 | c1.add(c2.getMembers()); 64 | clusters.add(c1); 65 | 66 | updateClusterMembership(c1); 67 | } 68 | 69 | public void mergeMembers(long m1, long m2) { 70 | Cluster c1 = findCluster(m1); 71 | Cluster c2 = findCluster(m2); 72 | if (c1 == null && c2 == null) { 73 | c1 = new Cluster(m1); 74 | c2 = new Cluster(m2); 75 | mergeNewClusters(c1, c2); 76 | } else if (c1 == null) { // c2 exists 77 | c1 = new Cluster(m1); 78 | mergeExistingClusters(c2, c1); 79 | } else { // c1 exists 80 | if (c2 == null) { 81 | c2 = new Cluster(m2); 82 | } 83 | mergeExistingClusters(c1, c2); 84 | } 85 | } 86 | 87 | public int size() { 88 | return clusters.size(); 89 | } 90 | 91 | @Override 92 | public Iterator iterator() { 93 | return clusters.iterator(); 94 | } 95 | 96 | @Override 97 | public String toString() { 98 | String output = ""; 99 | output += "["; 100 | Iterator it = clusters.iterator(); 101 | while (it.hasNext()) { 102 | Cluster cluster = it.next(); 103 | output += cluster.toString(); 104 | if (it.hasNext()) { 105 | output += ", "; 106 | } 107 | } 108 | output += "]"; 109 | return output; 110 | } 111 | 112 | private void updateClusterMembership(Cluster cluster) { 113 | for (long member : cluster.getMembers()) { 114 | clusterMemberLookup.put(member, cluster); 115 | } 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/clusterers/SimpleJaccardClusterer.java: 
package edu.gslis.ttg.clusters.clusterers;

import java.util.Iterator;
import java.util.List;
import java.util.NavigableMap;

import cc.twittertools.thrift.gen.TResult;
import edu.gslis.ttg.clusters.Clusters;
import edu.gslis.ttg.jaccard.JaccardStore;

/**
 * Clusters a list of search results by the pairwise Jaccard similarity of
 * their tweet text. All pairwise scores are precomputed once in the
 * constructor; {@link #cluster(double)} can then be called repeatedly with
 * different thresholds (e.g. while sweeping thresholds on training data).
 *
 * NOTE(review): generic type parameters in this file were stripped by an
 * HTML/text extraction; they have been reconstructed here from JaccardStore's
 * API (score -> List of long[2] doc-id pairs) — confirm against the original
 * sources.
 */
public class SimpleJaccardClusterer {

	private List<TResult> results;
	private JaccardStore jaccardScores;

	/**
	 * @param results search hits to cluster; pairwise similarities are
	 *                computed eagerly, so construction costs O(n^2) text
	 *                comparisons over the result list
	 */
	public SimpleJaccardClusterer(List<TResult> results) {
		this.results = results;
		this.jaccardScores = computeJaccardSimilarity();
	}

	/**
	 * Single-link clustering: every pair of documents whose Jaccard score is
	 * at or above the threshold is merged into the same cluster
	 * (transitively, via Clusters.mergeMembers).
	 *
	 * @param threshold minimum Jaccard similarity (inclusive) for two
	 *                  documents to be linked
	 * @return the resulting cluster set
	 */
	public Clusters cluster(double threshold) {
		Clusters clusters = new Clusters();

		// JaccardStore keys its lookup by score; tailMap (inclusive) yields
		// every doc-id pair whose similarity meets the threshold.
		NavigableMap<Double, List<long[]>> thresholdPairs = jaccardScores.getDocsGreaterThanScore(threshold);
		Iterator<Double> pairsIt = thresholdPairs.keySet().iterator();
		while (pairsIt.hasNext()) { // for each qualifying similarity score
			List<long[]> docPairs = thresholdPairs.get(pairsIt.next());
			Iterator<long[]> docPairIt = docPairs.iterator();
			while (docPairIt.hasNext()) { // for each pair of documents at this score
				long[] docs = docPairIt.next();
				clusters.mergeMembers(docs[0], docs[1]);
			}
		}

		return clusters;
	}

	public List<TResult> getResults() {
		return results;
	}

	/**
	 * Replaces the result list.
	 *
	 * NOTE(review): this does NOT recompute jaccardScores — cluster() will
	 * still use the scores computed from the constructor argument. Confirm
	 * whether callers rely on that behavior.
	 */
	public void setResults(List<TResult> results) {
		this.results = results;
	}

	/**
	 * Computes the Jaccard similarity of every unordered pair of results
	 * (j &lt; k) over their tweet text and records it in a JaccardStore.
	 */
	private JaccardStore computeJaccardSimilarity() {
		JaccardStore scores = new JaccardStore();
		for (int j = 0; j < results.size(); j++) {
			TResult doc1 = results.get(j);
			for (int k = j + 1; k < results.size(); k++) {
				TResult doc2 = results.get(k);

				double jaccardSim = JaccardStore.computeJaccardSimilarity(doc1.getText(), doc2.getText());
				scores.setScore(doc1.getId(), doc2.getId(), jaccardSim);
			}
		}

		return scores;
	}

}
-------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/jaccard/JaccardStore.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.jaccard; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.NavigableMap; 10 | import java.util.Set; 11 | import java.util.TreeMap; 12 | 13 | public class JaccardStore { 14 | 15 | private Map scores; // 16 | private TreeMap> scoreLookup; // 17 | 18 | public JaccardStore() { 19 | scores = new HashMap(); 20 | scoreLookup = new TreeMap>(); 21 | } 22 | 23 | public double getScore(long doc1, long doc2) { 24 | return scores.get(ordered(doc1, doc2)); 25 | } 26 | 27 | public void setScore(long doc1, long doc2, double score) { 28 | scores.put(ordered(doc1, doc2), score); 29 | if (scoreLookup.get(score) == null) { 30 | scoreLookup.put(score, new ArrayList()); 31 | } 32 | scoreLookup.get(score).add(ordered(doc1, doc2)); 33 | } 34 | 35 | public List getDocsForScore(double score) { 36 | return scoreLookup.get(score); 37 | } 38 | 39 | public NavigableMap> getDocsGreaterThanScore(double score) { 40 | return scoreLookup.tailMap(score, true); 41 | } 42 | 43 | public int size() { 44 | return scores.keySet().size(); 45 | } 46 | 47 | private long[] ordered(long doc1, long doc2) { 48 | long[] ordered = new long[2]; 49 | if (doc1 < doc2) { 50 | ordered[0] = doc1; 51 | ordered[1] = doc2; 52 | } else { 53 | ordered[0] = doc2; 54 | ordered[1] = doc1; 55 | } 56 | return ordered; 57 | } 58 | 59 | public static double computeJaccardSimilarity(Set doc1, Set doc2) { 60 | Set intersection = new HashSet(doc1); 61 | Set union = new HashSet(doc1); 62 | 63 | intersection.retainAll(doc2); 64 | union.addAll(doc2); 65 | 66 | return intersection.size() / (double) union.size(); 67 | } 68 | 69 | public 
static double computeJaccardSimilarity(String doc1, String doc2) { 70 | String[] docOneTerms = doc1.toLowerCase().split("[^A-Za-z0-9]"); 71 | List termList = new ArrayList(Arrays.asList(docOneTerms)); 72 | termList.removeAll(Arrays.asList("", null)); 73 | Set docOneBag = new HashSet(termList); 74 | 75 | String[] docTwoTerms = doc2.toLowerCase().split("[^A-Za-z0-9]"); 76 | termList = new ArrayList(Arrays.asList(docTwoTerms)); 77 | termList.removeAll(Arrays.asList("", null)); 78 | Set docTwoBag = new HashSet(termList); 79 | 80 | return computeJaccardSimilarity(docOneBag, docTwoBag); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/src/edu/gslis/ttg/searchers/SimpleSearcher.java: -------------------------------------------------------------------------------- 1 | package edu.gslis.ttg.searchers; 2 | 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import cc.twittertools.search.api.TrecSearchThriftClient; 9 | import cc.twittertools.thrift.gen.TResult; 10 | import edu.gslis.queries.GQuery; 11 | import edu.gslis.textrepresentation.FeatureVector; 12 | 13 | public class SimpleSearcher { 14 | 15 | private TrecSearchThriftClient client; 16 | private int maxResults; 17 | 18 | public SimpleSearcher(TrecSearchThriftClient client, int maxResults) { 19 | this.client = client; 20 | this.maxResults = maxResults; 21 | } 22 | 23 | 24 | public Map search(GQuery query) { 25 | // clean up query 26 | String queryText = query.getText(); 27 | queryText = queryText.replaceAll("[,'\\.\\?]", " "); 28 | queryText = queryText.replaceAll(" ", " ").trim(); 29 | 30 | // need to lowercase the query vector 31 | FeatureVector temp = new FeatureVector(null); 32 | Iterator qTerms = query.getFeatureVector().iterator(); 33 | while(qTerms.hasNext()) { 34 | String term = qTerms.next(); 35 | temp.addTerm(term.toLowerCase(), 
query.getFeatureVector().getFeatureWeight(term)); 36 | } 37 | temp.normalize();; 38 | query.setFeatureVector(temp); 39 | 40 | System.err.println(query.getTitle()+": "+queryText); 41 | 42 | // perform search 43 | List results = null; 44 | try { 45 | results = client.search(queryText, Long.parseLong(query.getMetadata("querytweettime")), maxResults); 46 | } catch (Exception e) { 47 | System.err.println("Error searching."); 48 | System.exit(-1); 49 | } 50 | 51 | // set cutoff score heuristically 52 | double topScore = results.get(0).getRsv(); 53 | double cutOffScore = topScore / 2; 54 | 55 | // record hits, removing duplicates 56 | int i = 1; 57 | Map seenMap = new HashMap(); 58 | Iterator hitIterator = results.iterator(); 59 | while(hitIterator.hasNext()) { 60 | TResult hit = hitIterator.next(); 61 | if (hit.getRsv() < cutOffScore) { 62 | break; 63 | } 64 | 65 | long docId = hit.id; 66 | if (seenMap.containsKey(docId)) 67 | continue; 68 | seenMap.put(docId, hit); 69 | 70 | if(i++ >= maxResults) 71 | break; 72 | } 73 | 74 | return seenMap; 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /twitter-tools-ttgbaseline/topics/topics.ttg-training.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": [ 3 | { 4 | "title": "MB03", 5 | "text": "Haiti Aristide return", 6 | "epoch": "1.297200733E9", 7 | "querytweettime": "35088534306033665", 8 | "model": [ 9 | { 10 | "weight": 1.0, 11 | "feature": "Haiti" 12 | }, 13 | { 14 | "weight": 1.0, 15 | "feature": "Aristide" 16 | }, 17 | { 18 | "weight": 1.0, 19 | "feature": "return" 20 | } 21 | ] 22 | }, 23 | { 24 | "title": "MB21", 25 | "text": "Emanuel residency court rulings", 26 | "epoch": "1.29627021E9", 27 | "querytweettime": "31185639047172097", 28 | "model": [ 29 | { 30 | "weight": 1.0, 31 | "feature": "Emanuel" 32 | }, 33 | { 34 | "weight": 1.0, 35 | "feature": "residency" 36 | }, 37 | { 38 | "weight": 1.0, 39 | 
"feature": "court" 40 | }, 41 | { 42 | "weight": 1.0, 43 | "feature": "rulings" 44 | } 45 | ] 46 | }, 47 | { 48 | "title": "MB22", 49 | "text": "healthcare law unconstitutional", 50 | "epoch": "1.296598654E9", 51 | "querytweettime": "32563233118224385", 52 | "model": [ 53 | { 54 | "weight": 1.0, 55 | "feature": "healthcare" 56 | }, 57 | { 58 | "weight": 1.0, 59 | "feature": "law" 60 | }, 61 | { 62 | "weight": 1.0, 63 | "feature": "unconstitutional" 64 | } 65 | ] 66 | }, 67 | { 68 | "title": "MB26", 69 | "text": "US unemployment", 70 | "epoch": "1.296828651E9", 71 | "querytweettime": "33527910379814912", 72 | "model": [ 73 | { 74 | "weight": 1.0, 75 | "feature": "US" 76 | }, 77 | { 78 | "weight": 1.0, 79 | "feature": "unemployment" 80 | } 81 | ] 82 | }, 83 | { 84 | "title": "MB42", 85 | "text": "Holland Iran envoy recall", 86 | "epoch": "1.297111633E9", 87 | "querytweettime": "34714824982134784", 88 | "model": [ 89 | { 90 | "weight": 1.0, 91 | "feature": "Holland" 92 | }, 93 | { 94 | "weight": 1.0, 95 | "feature": "Iran" 96 | }, 97 | { 98 | "weight": 1.0, 99 | "feature": "envoy" 100 | }, 101 | { 102 | "weight": 1.0, 103 | "feature": "recall" 104 | } 105 | ] 106 | }, 107 | { 108 | "title": "MB51", 109 | "text": "British Government cuts", 110 | "epoch": "1.297209406E9", 111 | "querytweettime": "35124912364457984", 112 | "model": [ 113 | { 114 | "weight": 1.0, 115 | "feature": "British" 116 | }, 117 | { 118 | "weight": 1.0, 119 | "feature": "Government" 120 | }, 121 | { 122 | "weight": 1.0, 123 | "feature": "cuts" 124 | } 125 | ] 126 | }, 127 | { 128 | "title": "MB57", 129 | "text": "Chicago blizzard", 130 | "epoch": "1.296683586E9", 131 | "querytweettime": "32919462151720960", 132 | "model": [ 133 | { 134 | "weight": 1.0, 135 | "feature": "Chicago" 136 | }, 137 | { 138 | "weight": 1.0, 139 | "feature": "blizzard" 140 | } 141 | ] 142 | }, 143 | { 144 | "title": "MB66", 145 | "text": "Journalists treatment in Egypt", 146 | "epoch": "1.296865923E9", 147 | 
"querytweettime": "33684239400566784", 148 | "model": [ 149 | { 150 | "weight": 1.0, 151 | "feature": "Journalists" 152 | }, 153 | { 154 | "weight": 1.0, 155 | "feature": "treatment" 156 | }, 157 | { 158 | "weight": 1.0, 159 | "feature": "in" 160 | }, 161 | { 162 | "weight": 1.0, 163 | "feature": "Egypt" 164 | } 165 | ] 166 | }, 167 | { 168 | "title": "MB68", 169 | "text": "Charlie Sheen rehab", 170 | "epoch": "1.296591293E9", 171 | "querytweettime": "32532358276063232", 172 | "model": [ 173 | { 174 | "weight": 1.0, 175 | "feature": "Charlie" 176 | }, 177 | { 178 | "weight": 1.0, 179 | "feature": "Sheen" 180 | }, 181 | { 182 | "weight": 1.0, 183 | "feature": "rehab" 184 | } 185 | ] 186 | }, 187 | { 188 | "title": "MB88", 189 | "text": "Kings Speech awards", 190 | "epoch": "1.297126104E9", 191 | "querytweettime": "34775520600129536", 192 | "model": [ 193 | { 194 | "weight": 1.0, 195 | "feature": "Kings" 196 | }, 197 | { 198 | "weight": 1.0, 199 | "feature": "Speech" 200 | }, 201 | { 202 | "weight": 1.0, 203 | "feature": "awards" 204 | } 205 | ] 206 | } 207 | ] 208 | } --------------------------------------------------------------------------------