├── .gitignore
├── API-agreement.pdf
├── HISTORY.md
├── README.md
├── data
├── clusters.training.microblog2011-2012.json
├── qrels.microblog2011-2012.txt
├── qrels.microblog2011.txt.gz
├── qrels.microblog2012.txt.gz
├── qrels.microblog2013.txt.gz
├── qrels.microblog2014.txt.gz
├── queries.trec2005efficiency.txt.gz
├── run.lm.xml
├── topics.microblog2011.txt
├── topics.microblog2012.txt
├── topics.microblog2013.txt
└── topics.microblog2014.txt
├── etc
├── trec_eval.9.0.tar.gz
└── ttg_eval.py
├── twitter-tools-core
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.jdt.ui.prefs
├── pom.xml
└── src
│ ├── attic
│ └── java
│ │ └── cc
│ │ └── twittertools
│ │ ├── corpus
│ │ └── data
│ │ │ ├── TSVStatusBlockReader.java
│ │ │ └── TSVStatusCorpusReader.java
│ │ └── download
│ │ ├── AsyncJsonStatusBlockCrawler.java
│ │ └── VerifyJsonStatusBlockCrawl.java
│ ├── main
│ ├── java
│ │ ├── cc
│ │ │ └── twittertools
│ │ │ │ ├── corpus
│ │ │ │ ├── data
│ │ │ │ │ ├── HTMLStatusExtractor.java
│ │ │ │ │ ├── JsonStatusBlockReader.java
│ │ │ │ │ ├── JsonStatusCorpusReader.java
│ │ │ │ │ ├── Status.java
│ │ │ │ │ └── StatusStream.java
│ │ │ │ └── demo
│ │ │ │ │ └── ReadStatuses.java
│ │ │ │ ├── download
│ │ │ │ ├── AsyncEmbeddedJsonStatusBlockCrawler.java
│ │ │ │ └── AsyncHTMLStatusBlockCrawler.java
│ │ │ │ ├── index
│ │ │ │ ├── ExtractTermStatisticsFromIndex.java
│ │ │ │ ├── ExtractTweetidsFromCollection.java
│ │ │ │ ├── ExtractTweetidsFromIndex.java
│ │ │ │ ├── IndexStatuses.java
│ │ │ │ ├── LowerCaseEntityPreservingFilter.java
│ │ │ │ └── TweetAnalyzer.java
│ │ │ │ ├── search
│ │ │ │ ├── TrecTopic.java
│ │ │ │ ├── TrecTopicSet.java
│ │ │ │ ├── api
│ │ │ │ │ ├── RunQueriesBaselineThrift.java
│ │ │ │ │ ├── RunQueriesThrift.java
│ │ │ │ │ ├── SearchStatusesThrift.java
│ │ │ │ │ ├── TResultComparable.java
│ │ │ │ │ ├── TrecSearchHandler.java
│ │ │ │ │ ├── TrecSearchThriftClient.java
│ │ │ │ │ ├── TrecSearchThriftLoadGenerator.java
│ │ │ │ │ └── TrecSearchThriftServer.java
│ │ │ │ └── local
│ │ │ │ │ ├── RunQueries.java
│ │ │ │ │ └── SearchStatuses.java
│ │ │ │ ├── stream
│ │ │ │ └── GatherStatusStream.java
│ │ │ │ ├── thrift
│ │ │ │ └── gen
│ │ │ │ │ ├── TQuery.java
│ │ │ │ │ ├── TResult.java
│ │ │ │ │ ├── TrecSearch.java
│ │ │ │ │ └── TrecSearchException.java
│ │ │ │ └── util
│ │ │ │ ├── ExtractSubcollection.java
│ │ │ │ └── VerifySubcollection.java
│ │ └── log4j.properties
│ ├── perl
│ │ ├── extract_deletes.pl
│ │ └── join_deletes_with_collection.pl
│ ├── python
│ │ ├── Search
│ │ │ ├── TrecSearch-remote
│ │ │ ├── TrecSearch.py
│ │ │ ├── __init__.py
│ │ │ ├── constants.py
│ │ │ └── ttypes.py
│ │ ├── TrecSearchThriftClientCli.py
│ │ └── twittertools
│ │ │ └── stream
│ │ │ └── gather_status_stream.py
│ ├── resources
│ │ └── log4j.properties
│ └── thrift
│ │ ├── gen-py
│ │ ├── __init__.py
│ │ └── twittertools
│ │ │ ├── TrecSearch-remote
│ │ │ ├── TrecSearch.py
│ │ │ ├── __init__.py
│ │ │ ├── constants.py
│ │ │ └── ttypes.py
│ │ └── twittertools.thrift
│ └── test
│ └── java
│ └── cc
│ └── twittertools
│ ├── download
│ └── FetchStatusTest.java
│ ├── index
│ └── TokenizationTest.java
│ └── search
│ └── TrecTopicSetTest.java
├── twitter-tools-hadoop
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.jdt.ui.prefs
├── README.md
├── pom.xml
├── src
│ └── main
│ │ └── java
│ │ └── cc
│ │ └── twittertools
│ │ ├── hadoop
│ │ └── Example.java
│ │ ├── hbase
│ │ ├── LoadWordCount.java
│ │ └── WordCountDAO.java
│ │ ├── piggybank
│ │ ├── ConvertCreatedAtToEpoch.java
│ │ ├── GetLatitude.java
│ │ ├── GetLongitude.java
│ │ └── IsMap.java
│ │ └── udf
│ │ ├── GetDate.java
│ │ ├── GetInterval.java
│ │ └── LuceneTokenizer.java
└── wordcountbytime.pig
├── twitter-tools-rm3
├── README.md
├── build.sh
├── config
│ └── run_params_sample.json
├── data
│ ├── qrels.microblog
│ ├── stoplist.twitter
│ ├── topics.microblog2011.json
│ ├── topics.microblog2012.json
│ └── topics.microblog2013.json
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── edu
│ │ └── illinois
│ │ └── lis
│ │ ├── document
│ │ └── FeatureVector.java
│ │ ├── feedback
│ │ ├── FeedbackModel.java
│ │ └── FeedbackRelevanceModel.java
│ │ ├── query
│ │ ├── GQueries.java
│ │ ├── GQueriesJsonImpl.java
│ │ ├── GQuery.java
│ │ ├── TrecTemporalTopic.java
│ │ └── TrecTemporalTopicSet.java
│ │ ├── rerank
│ │ ├── SearchReranker.java
│ │ └── TResultComparator.java
│ │ ├── search
│ │ └── RunQueries.java
│ │ ├── searchsource
│ │ └── IndexWrapperMicroblogApi.java
│ │ └── utils
│ │ ├── ExtractGqueriesFromTrecFormat.java
│ │ ├── KeyValuePair.java
│ │ ├── ListUtils.java
│ │ ├── LuceneQuery.java
│ │ ├── ParameterBroker.java
│ │ ├── Qrels.java
│ │ ├── Scorable.java
│ │ ├── ScorableComparator.java
│ │ └── Stopper.java
│ └── resources
│ └── log4j.properties
└── twitter-tools-ttgbaseline
├── README.md
├── config
└── run_params.json
├── pom.xml
├── src
└── edu
│ └── gslis
│ └── ttg
│ ├── clusters
│ ├── Cluster.java
│ ├── Clusters.java
│ └── clusterers
│ │ └── SimpleJaccardClusterer.java
│ ├── jaccard
│ └── JaccardStore.java
│ ├── main
│ └── RunTTGBaseline.java
│ └── searchers
│ └── SimpleSearcher.java
└── topics
├── topics.microblog-2011.json
├── topics.microblog-2012.json
├── topics.microblog-2013.json
└── topics.ttg-training.json
/.gitignore:
--------------------------------------------------------------------------------
1 | twitter-tools-core/.classpath
2 | twitter-tools-core/.project
3 | twitter-tools-core/target/
4 | twitter-tools-rm3/.classpath
5 | twitter-tools-rm3/.project
6 | twitter-tools-rm3/target/
7 | twitter-tools-ttgbaseline/.classpath
8 | twitter-tools-ttgbaseline/.project
9 | twitter-tools-ttgbaseline/.settings/
10 | twitter-tools-ttgbaseline/target/
11 | twitter-tools-ttgbaseline/output.txt
12 | twitter-tools-hadoop/.classpath
13 | twitter-tools-hadoop/.project
14 | twitter-tools-hadoop/target/
15 | etc/run.sh
16 | etc/trec_eval.9.0/
17 | etc/trec_eval
18 | data/qrels.microblog2011.txt
19 | data/qrels.microblog2012.txt
20 | data/qrels.microblog2013.txt
21 | data/qrels.microblog2014.txt
22 | data/queries.trec2005efficiency.txt
23 | *~
24 | .DS_Store
25 | *.pyc
26 |
--------------------------------------------------------------------------------
/API-agreement.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/API-agreement.pdf
--------------------------------------------------------------------------------
/HISTORY.md:
--------------------------------------------------------------------------------
1 | Version 1.4.3
2 | =============
3 | December 26, 2014
4 |
5 | + API serving the Tweets2013 collection for TREC 2014, includes minor code fixes during TREC evaluations that have been merged back to master
6 |
7 | Version 1.4.2
8 | =============
9 | March 15, 2014
10 |
11 | + Added code to generate Thrift baseline runs
12 | + Added code to extract subcollection and term statistics
13 | + Added topics and qrels for TREC 2013
14 |
15 | Version 1.4.1
16 | =============
17 | July 7, 2013
18 |
19 | + Cleaned up dependencies and eliminated direct dependency on Solr
20 | + Fixed unnecessary string -> int/long parsing in retrieval
21 |
22 | Version 1.4.0
23 | =============
24 | July 3, 2013
25 |
26 | + Switched over from Ant to Maven for build management, with artifactId `twitter-tools-core`
27 |
28 | Version 1.3.0
29 | =============
30 | June 12, 2013
31 |
32 | + Package refactoring/renaming and code cleanup
33 | + Upgraded to Lucene 4.3
34 | + Added initial Python client
35 | + Installed Tweet-specific Lucene analyzer
36 | + Added simple Perl scripts for processing deletes
37 |
38 | Version 1.2.0
39 | =============
40 | June 6, 2013
41 |
42 | + Initial release of the API for TREC 2013
43 |
44 | Version 1.1.1
45 | =============
46 | January 28, 2013
47 |
48 | + Noted that `AsyncEmbeddedJsonStatusBlockCrawler` is currently broken
49 |
50 | Version 1.1.0
51 | =============
52 | January 23, 2013
53 |
54 | + Added crawler for Twitter public stream
55 |
56 | Version 1.0.0
57 | =============
58 | January 15, 2013
59 |
60 | + Cleaned up code
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Twitter Tools
2 | =============
3 |
4 | This repo holds a collection of tools for the TREC Microblog tracks, which officially ended in 2015. The track mailing list can be found at [trec-microblog@googlegroups.com](http://groups.google.com/group/trec-microblog).
5 |
6 | Archival Documents
7 | ------------------
8 |
9 | + [TREC 2013 API Specifications](https://github.com/lintool/twitter-tools/wiki/TREC-2013-API-Specifications)
10 | + [TREC 2013 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2013-Track-Guidelines)
11 | + [TREC 2014 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2014-Track-Guidelines)
12 | + [TREC 2015 Track Guidelines](https://github.com/lintool/twitter-tools/wiki/TREC-2015-Track-Guidelines)
13 |
14 | API Access
15 | ----------
16 |
17 | The Microblog tracks in 2013 and 2014 used the "evaluation as a service" (EaaS) model, where teams interact with the official corpus via a common API. Although the evaluation has ended, the API is still available for researcher use.
18 |
19 | To request access to the API, follow these steps:
20 |
21 | 1. Fill out the [API usage agreement](http://lintool.github.io/twitter-tools/API-agreement.pdf).
22 | 2. Email the usage agreement to `microblog-request@nist.gov`.
23 | 3. After NIST receives your request, you will receive an access token from NIST.
24 | 4. The code for accessing the API can be found in this repository. The endpoint of the API itself (i.e., hostname, port) will be provided by NIST.
25 |
26 | Getting Started
27 | --------------
28 |
29 | The main Maven artifact for the TREC Microblog API is `twitter-tools-core`. The latest releases of Maven artifacts are available at [Maven Central](http://search.maven.org/#search%7Cga%7C1%7Ccc.twittertools).
30 |
31 | You can clone the repo with the following command:
32 |
33 | ```
34 | $ git clone git://github.com/lintool/twitter-tools.git
35 | ```
36 |
37 | Once you've cloned the repository, change directory into `twitter-tools-core` and build the package with Maven:
38 |
39 | ```
40 | $ cd twitter-tools-core
41 | $ mvn clean package appassembler:assemble
42 | ```
43 |
44 | For more information, see the [project wiki](https://github.com/lintool/twitter-tools/wiki).
45 |
46 | Replicating TREC Baselines
47 | --------------------------
48 |
49 | One advantage of the TREC Microblog API is that it is possible to deploy a community baseline whose results are replicable by *anyone*. The `raw` results are simply the output of the API unmodified. The `baseline` results are the `raw` results that have been post-processed to remove retweets and break score ties by reverse chronological order (earliest first).
50 |
51 | To run the `raw` results for TREC 2011, issue the following command:
52 |
53 | ```
54 | sh target/appassembler/bin/RunQueriesThrift \
55 | -host [host] -port [port] -group [group] -token [token] \
56 | -queries ../data/topics.microblog2011.txt > run.microblog2011.raw.txt
57 | ```
58 |
59 | And to run the `baseline` results for TREC 2011, issue the following command:
60 |
61 | ```
62 | sh target/appassembler/bin/RunQueriesBaselineThrift \
63 | -host [host] -port [port] -group [group] -token [token] \
64 | -queries ../data/topics.microblog2011.txt > run.microblog2011.baseline.txt
65 | ```
66 |
67 | Note that `trec_eval` is included in `twitter-tools/etc` (just needs to be compiled), and the qrels are stored in `twitter-tools/data` (just needs to be uncompressed), so you can evaluate as follows:
68 |
69 | ```
70 | ../etc/trec_eval.9.0/trec_eval ../data/qrels.microblog2011.txt run.microblog2011.raw.txt
71 | ```
72 |
73 | Similar commands will allow you to replicate runs for TREC 2012 and TREC 2013. With `trec_eval`, you should get *exactly* the following results:
74 |
75 | MAP | raw | baseline
76 | ----------|--------|---------
77 | TREC 2011 | 0.3050 | 0.3576
78 | TREC 2012 | 0.1751 | 0.2091
79 | TREC 2013 | 0.2044 | 0.2532
80 | TREC 2014 | 0.3090 | 0.3924
81 |
82 | P30 | raw | baseline
83 | ----------|--------|---------
84 | TREC 2011 | 0.3483 | 0.4000
85 | TREC 2012 | 0.2831 | 0.3311
86 | TREC 2013 | 0.3761 | 0.4450
87 | TREC 2014 | 0.5145 | 0.6182
88 |
89 |
90 | License
91 | -------
92 |
93 | Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
94 |
95 |
96 | Acknowledgments
97 | ---------------
98 |
99 | This work is supported in part by the National Science Foundation under award [IIS-1218043](http://www.nsf.gov/awardsearch/showAward?AWD_ID=1218043). Any opinions, findings, and conclusions or recommendations expressed are those of the researchers and do not necessarily reflect the views of the National Science Foundation.
100 |
--------------------------------------------------------------------------------
/data/qrels.microblog2011.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2011.txt.gz
--------------------------------------------------------------------------------
/data/qrels.microblog2012.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2012.txt.gz
--------------------------------------------------------------------------------
/data/qrels.microblog2013.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2013.txt.gz
--------------------------------------------------------------------------------
/data/qrels.microblog2014.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/qrels.microblog2014.txt.gz
--------------------------------------------------------------------------------
/data/queries.trec2005efficiency.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/data/queries.trec2005efficiency.txt.gz
--------------------------------------------------------------------------------
/data/run.lm.xml:
--------------------------------------------------------------------------------
1 |
2 | tweets2011-index
3 | true
4 | 1000
5 | lm
6 |
7 |
--------------------------------------------------------------------------------
/etc/trec_eval.9.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/etc/trec_eval.9.0.tar.gz
--------------------------------------------------------------------------------
/etc/ttg_eval.py:
--------------------------------------------------------------------------------
1 | #This file is to take run file (as an input argument) and ground truth non-redundant tweets
2 | #to compute the unweighted precision, recall and weighted precision per topic.
3 | import json
4 | from sets import Set
5 | import argparse
6 |
7 | parser = argparse.ArgumentParser(description='Tweet Timeline Generation (TTG) evaluation script (version 1.0)')
8 | parser.add_argument('-q', required=True, metavar='qrels', help='qrels file')
9 | parser.add_argument('-c', required=True, metavar='clusters', help='cluster anotations')
10 | parser.add_argument('-r', required=True, metavar='run', help='run file')
11 |
12 | args = parser.parse_args()
13 | file_qrels_path = vars(args)['q']
14 | clusters_path = vars(args)['c']
15 | run_path = vars(args)['r']
16 |
17 | #Take qrels to generate dictionary of {topic number:{tweetid:weight}}
18 | #where weight is 0(non-relevant), 1(relevant), 2(highly relevant)
19 | qrels_dt = {}
20 | file_qrels = open(file_qrels_path, "r")
21 | lines = file_qrels.readlines()
22 | for line in lines:
23 | line = line.strip().split()
24 | topic_ind = line[0]
25 | if topic_ind not in qrels_dt:
26 | qrels_dt[topic_ind] = {}
27 | qrels_dt[topic_ind][line[2]] = line[3]
28 |
29 | #Take run file and generate dictionary of {topic number:Set of tweetids for that topic}
30 | runlength = len(run_path) - run_path.index("/") - 1
31 | clusters_run_dt = {}
32 | file_run = open(run_path, "r")
33 | lines = file_run.readlines()
34 | for line in lines:
35 | line = line.strip().split()
36 | topic_ind = line[0][line[0].index("MB") + 2:]
37 | if topic_ind not in clusters_run_dt:
38 | clusters_run_dt[topic_ind] = Set()
39 | clusters_run_dt[topic_ind].add(line[2])
40 |
41 | #Take ground truth, generate dictionary of {topic number:2D array of clusters of tweetids}, for each topic,
42 | #compare tweet from each cluster with that from run file and compute unweighted precision, recall and weighted recall.
43 | clusters_dt = {}
44 | precision_total = 0
45 | unweighted_recall_total = 0
46 | weighted_recall_total = 0
47 | file_clusters = open(clusters_path, "r")
48 | data = json.load(file_clusters)
49 | topics = data["topics"]
50 | print "runtag".ljust(runlength) + "\ttopic\tunweighted_recall weighted_recall precision"
51 | for topic in sorted(topics.keys()):
52 | total_weight = 0
53 | credits = 0
54 | hit_num = 0
55 | topic_ind = topic[line[0].index("MB") + 2:]
56 | topic_ind = topic_ind.encode("utf-8")
57 | clusters_json = topics[topic]["clusters"]
58 | for i in range(len(clusters_json)):
59 | clusters_json[i] = [s.encode("utf-8") for s in clusters_json[i]]
60 | clusters_dt[topic_ind] = clusters_json
61 | for cluster in clusters_dt[topic_ind]:
62 | weight = 0
63 | hit_flag = 0
64 | for tweet in cluster:
65 | weight = weight + int(qrels_dt[topic_ind][tweet])
66 | if tweet in clusters_run_dt[topic_ind]:
67 | hit_flag = 1
68 | total_weight = total_weight + weight
69 | if hit_flag == 1:
70 | credits = credits + weight
71 | hit_num = hit_num + 1
72 | hit_flag = 0
73 | precision = float(hit_num) / len(clusters_run_dt[topic_ind])
74 | unweighted_recall = float(hit_num) / len(clusters_dt[topic_ind])
75 | weighted_recall = float(credits) / total_weight
76 | precision_total = precision_total + precision
77 | unweighted_recall_total = unweighted_recall_total + unweighted_recall
78 | weighted_recall_total = weighted_recall_total + weighted_recall
79 | print run_path[run_path.rindex("/") + 1:].ljust(max(runlength, 6)) + "\t" + "MB" + str(topic_ind) + "\t" + "%12.4f" % unweighted_recall + "\t" + "%12.4f" % weighted_recall + "\t" + "%10.4f" % precision
80 | precision_mean = precision_total / len(clusters_dt)
81 | unweighted_recall_mean = unweighted_recall_total / len(clusters_dt)
82 | weighted_recall_mean = weighted_recall_total / len(clusters_dt)
83 | print run_path[run_path.rindex("/") + 1:].ljust(max(runlength, 6)) + "\t" + "all".ljust(5) + "\t" + "%12.4f" % unweighted_recall_mean + "\t" + "%12.4f" % weighted_recall_mean + "\t" + "%10.4f" % precision_mean
84 | file_run.close()
85 | file_clusters.close()
86 |
--------------------------------------------------------------------------------
/twitter-tools-core/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | formatter_profile=_twitter-tools
3 | formatter_settings_version=12
4 | org.eclipse.jdt.ui.exception.name=e
5 | org.eclipse.jdt.ui.gettersetter.use.is=true
6 | org.eclipse.jdt.ui.keywordthis=false
7 | org.eclipse.jdt.ui.overrideannotation=true
8 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/attic/java/cc/twittertools/corpus/data/TSVStatusBlockReader.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.corpus.data;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.zip.GZIPInputStream;
9 |
10 |
11 | /**
12 | * Abstraction for an stream of statuses, backed by an underlying gzipped file with JSON-encoded
13 | * tweets, one per line.
14 | */
15 | public class TSVStatusBlockReader implements StatusStream {
16 | private final BufferedReader br;
17 |
18 | public TSVStatusBlockReader(File file) throws IOException {
19 |
20 | if (!file.getName().endsWith(".gz")) {
21 | throw new IOException("Expecting .gz compressed file!");
22 | }
23 |
24 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF-8"));
25 | }
26 |
27 | /**
28 | * Returns the next status, or null
if no more statuses.
29 | */
30 | public Status next() throws IOException {
31 | Status nxt = null;
32 | String raw = null;
33 |
34 | while (nxt == null) {
35 | raw = br.readLine();
36 |
37 | // Check to see if we've reached end of file.
38 | if ( raw == null) {
39 | return null;
40 | }
41 |
42 | nxt = Status.fromTSV(raw);
43 | }
44 | return Status.fromTSV(raw);
45 | }
46 |
47 | public void close() throws IOException {
48 | br.close();
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/attic/java/cc/twittertools/corpus/data/TSVStatusCorpusReader.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.corpus.data;
2 |
3 | import java.io.File;
4 | import java.io.FileFilter;
5 | import java.io.IOException;
6 |
7 |
8 | /**
9 | * Abstraction for a corpus of statuses. A corpus is assumed to consist of a number of blocks, each
10 | * represented by a gzipped file within a root directory. This object will allow to caller to read
11 | * through all blocks, in sorted lexicographic order of the files.
12 | */
13 | public class TSVStatusCorpusReader implements StatusStream {
14 | private final File[] files;
15 | private int nextFile = 0;
16 | private TSVStatusBlockReader currentBlock = null;
17 |
18 | public TSVStatusCorpusReader(File file) throws IOException {
19 |
20 | if (!file.isDirectory()) {
21 | throw new IOException("Expecting " + file + " to be a directory!");
22 | }
23 |
24 | files = file.listFiles(new FileFilter() {
25 | public boolean accept(File path) {
26 | return path.getName().endsWith(".gz") ? true : false;
27 | }
28 | });
29 |
30 | if (files.length == 0) {
31 | throw new IOException(file + " does not contain any .gz files!");
32 | }
33 | }
34 |
35 | /**
36 | * Returns the next status, or null
if no more statuses.
37 | */
38 | public Status next() throws IOException {
39 | if (currentBlock == null) {
40 | currentBlock = new TSVStatusBlockReader(files[nextFile]);
41 | nextFile++;
42 | }
43 |
44 | Status status = null;
45 | while (true) {
46 | status = currentBlock.next();
47 | if (status != null) {
48 | return status;
49 | }
50 |
51 | if (nextFile >= files.length) {
52 | // We're out of files to read. Must be the end of the corpus.
53 | return null;
54 | }
55 |
56 | currentBlock.close();
57 | // Move to next file.
58 | currentBlock = new TSVStatusBlockReader(files[nextFile]);
59 | nextFile++;
60 | }
61 | }
62 |
63 | public void close() throws IOException {
64 | currentBlock.close();
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.corpus.data;
2 |
3 | import java.util.HashMap;
4 | import java.util.LinkedHashMap;
5 | import java.util.Map;
6 | import java.io.BufferedReader;
7 | import java.io.InputStreamReader;
8 | import java.io.FileInputStream;
9 | import java.io.IOException;
10 | import java.net.URL;
11 | import java.net.URLDecoder;
12 | import java.text.SimpleDateFormat;
13 | import java.util.Date;
14 | import java.util.TimeZone;
15 |
16 | import org.jsoup.Jsoup;
17 | import org.jsoup.nodes.Element;
18 | import org.jsoup.nodes.Document;
19 | import org.jsoup.select.Elements;
20 |
21 | import com.google.gson.Gson;
22 | import com.google.gson.GsonBuilder;
23 | import com.google.gson.JsonObject;
24 |
25 | import org.apache.commons.cli.CommandLine;
26 | import org.apache.commons.cli.CommandLineParser;
27 | import org.apache.commons.cli.GnuParser;
28 | import org.apache.commons.cli.HelpFormatter;
29 | import org.apache.commons.cli.OptionBuilder;
30 | import org.apache.commons.cli.Options;
31 | import org.apache.commons.cli.ParseException;
32 |
33 | public class HTMLStatusExtractor {
34 |
35 | public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy");
36 |
37 | public HTMLStatusExtractor() {
38 | date_fmt.setTimeZone(TimeZone.getTimeZone("UTC"));
39 | }
40 |
41 | public static Map splitQuery(URL url)
42 | throws java.io.UnsupportedEncodingException {
43 | Map query_pairs = new LinkedHashMap();
44 | String query = url.getQuery();
45 | String[] pairs = query.split("&");
46 | for (String pair : pairs) {
47 | int idx = pair.indexOf("=");
48 | query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"),
49 | URLDecoder.decode(pair.substring(idx + 1), "UTF-8"));
50 | }
51 | return query_pairs;
52 | }
53 |
54 | public JsonObject extractTweet(String html)
55 | throws java.net.MalformedURLException, java.io.UnsupportedEncodingException {
56 | JsonObject status = new JsonObject();
57 |
58 | Document doc = Jsoup.parse(html);
59 | Element tweet_div = doc.select("div.permalink-tweet").first();
60 |
61 | String tweet_text = tweet_div.select("p.tweet-text").first().text();
62 | status.addProperty("text", tweet_text);
63 |
64 | String tweet_id = tweet_div.attr("data-tweet-id");
65 | status.addProperty("id_str", tweet_id);
66 | status.addProperty("id", Long.parseLong(tweet_id));
67 |
68 | String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time");
69 | Date created_at = new Date();
70 | created_at.setTime(Long.parseLong(timestamp) * 1000);
71 | status.addProperty("created_at", date_fmt.format(created_at));
72 |
73 | Elements js_stats_retweets = doc.select("li.js-stat-retweets");
74 | if (!js_stats_retweets.isEmpty()) {
75 | status.addProperty("retweeted", true);
76 | String count = js_stats_retweets.select("strong").first().text();
77 | status.addProperty("retweet_count", Long.parseLong(count));
78 | } else {
79 | status.addProperty("retweeted", false);
80 | status.addProperty("retweet_count", 0);
81 | }
82 | Elements js_stats_favs = doc.select("li.js-stat-favorites");
83 | status.addProperty("favorited", !js_stats_favs.isEmpty());
84 |
85 |
86 | // User subfield
87 | JsonObject user = new JsonObject();
88 | String user_id = tweet_div.attr("data-user-id");
89 | user.addProperty("id_str", user_id);
90 | user.addProperty("id", Long.parseLong(user_id));
91 | String screen_name = tweet_div.attr("data-screen-name");
92 | user.addProperty("screen_name", screen_name);
93 | String user_name = tweet_div.attr("data-name");
94 | user.addProperty("name", user_name);
95 |
96 | status.add("user", user);
97 |
98 | // Geo information
99 | Elements tweet_loc = doc.select("a.tweet-geo-text");
100 | if (!tweet_loc.isEmpty()) {
101 | JsonObject location = new JsonObject();
102 | Element loc = tweet_loc.first();
103 | // Adding http to avoid malformed URL exception
104 | URL url = new URL("http:" + loc.attr("href"));
105 | Map query_params = HTMLStatusExtractor.splitQuery(url);
106 | // Loop over possible query parameters
107 | // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html
108 | String lat_and_long = null;
109 | if ((lat_and_long = query_params.get("ll")) != null
110 | || (lat_and_long = query_params.get("sll")) != null
111 | || (lat_and_long = query_params.get("cbll")) != null
112 | || (lat_and_long = query_params.get("q")) != null) {
113 | String[] coordinates = lat_and_long.split(",");
114 | double latitude = Double.parseDouble(coordinates[0]);
115 | double longitude = Double.parseDouble(coordinates[1]);
116 | location.addProperty("latitude", latitude);
117 | location.addProperty("longitude", longitude);
118 | }
119 | location.addProperty("location_text", loc.text());
120 | status.add("location", location);
121 | }
122 |
123 | return status;
124 | }
125 |
126 | private static final String HTML_OPTION = "html";
127 |
128 | @SuppressWarnings("static-access")
129 | public static void main(String[] args) throws Exception {
130 | Options options = new Options();
131 | options.addOption(OptionBuilder.withArgName("path").hasArg()
132 | .withDescription("HTML file from twitter.com").create(HTML_OPTION));
133 |
134 | CommandLine cmdline = null;
135 | CommandLineParser parser = new GnuParser();
136 | try {
137 | cmdline = parser.parse(options, args);
138 | } catch (ParseException exp) {
139 | System.err.println("Error parsing command line: " + exp.getMessage());
140 | System.exit(-1);
141 | }
142 |
143 | if (!cmdline.hasOption(HTML_OPTION)) {
144 | HelpFormatter formatter = new HelpFormatter();
145 | formatter.printHelp(HTMLStatusExtractor.class.getName(), options);
146 | System.exit(-1);
147 | }
148 |
149 | String html_filename = cmdline.getOptionValue(HTML_OPTION);
150 | BufferedReader html_file = null;
151 | StringBuffer buf = new StringBuffer();
152 | try {
153 | html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename)));
154 | String line;
155 | while ((line = html_file.readLine()) != null) {
156 | buf.append(line);
157 | buf.append('\n');
158 | }
159 | } catch (IOException e) {
160 | e.printStackTrace();
161 | } finally {
162 | html_file.close();
163 | }
164 |
165 | HTMLStatusExtractor hse = new HTMLStatusExtractor();
166 | JsonObject json = hse.extractTweet(buf.toString());
167 | Gson gson = new GsonBuilder().setPrettyPrinting().create();
168 | System.out.println(gson.toJson(json));
169 | }
170 | }
171 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/JsonStatusBlockReader.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.corpus.data;
18 |
19 | import java.io.BufferedReader;
20 | import java.io.File;
21 | import java.io.FileInputStream;
22 | import java.io.IOException;
23 | import java.io.InputStreamReader;
24 | import java.util.zip.GZIPInputStream;
25 |
26 | import com.google.common.base.Preconditions;
27 |
28 | /**
29 | * Abstraction for an stream of statuses, backed by an underlying gzipped file with JSON-encoded
30 | * tweets, one per line.
31 | */
32 | public class JsonStatusBlockReader implements StatusStream {
33 | private final BufferedReader br;
34 |
35 | public JsonStatusBlockReader(File file) throws IOException {
36 | Preconditions.checkNotNull(file);
37 |
38 | if (!file.getName().endsWith(".gz")) {
39 | throw new IOException("Expecting .gz compressed file!");
40 | }
41 |
42 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), "UTF-8"));
43 | }
44 |
45 | /**
46 | * Returns the next status, or null
if no more statuses.
47 | */
48 | public Status next() throws IOException {
49 | Status nxt = null;
50 | String raw = null;
51 |
52 | while (nxt == null) {
53 | raw = br.readLine();
54 |
55 | // Check to see if we've reached end of file.
56 | if (raw == null) {
57 | return null;
58 | }
59 |
60 | nxt = Status.fromJson(raw);
61 | }
62 | return Status.fromJson(raw);
63 | }
64 |
65 | public void close() throws IOException {
66 | br.close();
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/JsonStatusCorpusReader.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.corpus.data;
18 |
19 | import java.io.File;
20 | import java.io.FileFilter;
21 | import java.io.IOException;
22 |
23 | import com.google.common.base.Preconditions;
24 |
25 | /**
26 | * Abstraction for a corpus of statuses. A corpus is assumed to consist of a number of blocks, each
27 | * represented by a gzipped file within a root directory. This object will allow to caller to read
28 | * through all blocks, in sorted lexicographic order of the files.
29 | */
30 | public class JsonStatusCorpusReader implements StatusStream {
31 | private final File[] files;
32 | private int nextFile = 0;
33 | private JsonStatusBlockReader currentBlock = null;
34 |
35 | public JsonStatusCorpusReader(File file) throws IOException {
36 | Preconditions.checkNotNull(file);
37 |
38 | if (!file.isDirectory()) {
39 | throw new IOException("Expecting " + file + " to be a directory!");
40 | }
41 |
42 | files = file.listFiles(new FileFilter() {
43 | public boolean accept(File path) {
44 | return path.getName().endsWith(".gz") ? true : false;
45 | }
46 | });
47 |
48 | if (files.length == 0) {
49 | throw new IOException(file + " does not contain any .gz files!");
50 | }
51 | }
52 |
53 | /**
54 | * Returns the next status, or null
if no more statuses.
55 | */
56 | public Status next() throws IOException {
57 | if (currentBlock == null) {
58 | currentBlock = new JsonStatusBlockReader(files[nextFile]);
59 | nextFile++;
60 | }
61 |
62 | Status status = null;
63 | while (true) {
64 | status = currentBlock.next();
65 | if (status != null) {
66 | return status;
67 | }
68 |
69 | if (nextFile >= files.length) {
70 | // We're out of files to read. Must be the end of the corpus.
71 | return null;
72 | }
73 |
74 | currentBlock.close();
75 | // Move to next file.
76 | currentBlock = new JsonStatusBlockReader(files[nextFile]);
77 | nextFile++;
78 | }
79 | }
80 |
81 | public void close() throws IOException {
82 | currentBlock.close();
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/StatusStream.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.corpus.data;
18 |
19 | import java.io.IOException;
20 |
/**
 * Abstraction for a stream of statuses. Ordering of the statuses is left to the implementation.
 */
public interface StatusStream {
  /**
   * Returns the next status in the stream, or <code>null</code> when the stream is exhausted.
   *
   * @throws IOException if the underlying source cannot be read
   */
  public Status next() throws IOException;

  /**
   * Closes the stream and releases any underlying resources.
   *
   * @throws IOException if closing the underlying source fails
   */
  public void close() throws IOException;
}
28 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/corpus/demo/ReadStatuses.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.corpus.demo;
18 |
19 | import java.io.File;
20 | import java.io.PrintStream;
21 |
22 | import org.apache.commons.cli.CommandLine;
23 | import org.apache.commons.cli.CommandLineParser;
24 | import org.apache.commons.cli.GnuParser;
25 | import org.apache.commons.cli.HelpFormatter;
26 | import org.apache.commons.cli.OptionBuilder;
27 | import org.apache.commons.cli.Options;
28 | import org.apache.commons.cli.ParseException;
29 | import org.apache.log4j.Logger;
30 |
31 | import cc.twittertools.corpus.data.JsonStatusBlockReader;
32 | import cc.twittertools.corpus.data.JsonStatusCorpusReader;
33 | import cc.twittertools.corpus.data.Status;
34 | import cc.twittertools.corpus.data.StatusStream;
35 |
36 | /**
37 | * Sample program to illustrate how to work with {@link StatusStream}.
38 | */
39 | public class ReadStatuses {
40 | private static final Logger LOG = Logger.getLogger(ReadStatuses.class);
41 |
42 | private ReadStatuses() {}
43 |
44 | private static final String INPUT_OPTION = "input";
45 | private static final String VERBOSE_OPTION = "verbose";
46 | private static final String DUMP_OPTION = "dump";
47 |
48 | @SuppressWarnings("static-access")
49 | public static void main(String[] args) throws Exception {
50 | Options options = new Options();
51 | options.addOption(OptionBuilder.withArgName("path").hasArg()
52 | .withDescription("input directory or file").create(INPUT_OPTION));
53 | options.addOption(VERBOSE_OPTION, false, "print logging output every 10000 tweets");
54 | options.addOption(DUMP_OPTION, false, "dump statuses");
55 |
56 | CommandLine cmdline = null;
57 | CommandLineParser parser = new GnuParser();
58 | try {
59 | cmdline = parser.parse(options, args);
60 | } catch (ParseException exp) {
61 | System.err.println("Error parsing command line: " + exp.getMessage());
62 | System.exit(-1);
63 | }
64 |
65 | if (!cmdline.hasOption(INPUT_OPTION)) {
66 | HelpFormatter formatter = new HelpFormatter();
67 | formatter.printHelp(ReadStatuses.class.getName(), options);
68 | System.exit(-1);
69 | }
70 |
71 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
72 |
73 | StatusStream stream;
74 | // Figure out if we're reading from HTML SequenceFiles or JSON.
75 | File file = new File(cmdline.getOptionValue(INPUT_OPTION));
76 | if (!file.exists()) {
77 | System.err.println("Error: " + file + " does not exist!");
78 | System.exit(-1);
79 | }
80 |
81 | if (file.isDirectory()) {
82 | stream = new JsonStatusCorpusReader(file);
83 | } else {
84 | stream = new JsonStatusBlockReader(file);
85 | }
86 |
87 | int cnt = 0;
88 | Status status;
89 | while ((status = stream.next()) != null) {
90 | if (cmdline.hasOption(DUMP_OPTION)) {
91 | String text = status.getText();
92 | if (text != null) {
93 | text = text.replaceAll("\\s+", " ");
94 | text = text.replaceAll("\0", "");
95 | }
96 | out.println(String.format("%d\t%s\t%s\t%s", status.getId(), status.getScreenname(),
97 | status.getCreatedAt(), text));
98 | }
99 | cnt++;
100 | if ( cnt % 10000 == 0 && cmdline.hasOption(VERBOSE_OPTION)) {
101 | LOG.info(cnt + " statuses read");
102 | }
103 | }
104 | stream.close();
105 | LOG.info(String.format("Total of %s statuses read.", cnt));
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTermStatisticsFromIndex.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.index;
18 |
19 | import java.io.File;
20 | import java.io.PrintStream;
21 |
22 | import org.apache.commons.cli.CommandLine;
23 | import org.apache.commons.cli.CommandLineParser;
24 | import org.apache.commons.cli.GnuParser;
25 | import org.apache.commons.cli.HelpFormatter;
26 | import org.apache.commons.cli.OptionBuilder;
27 | import org.apache.commons.cli.Options;
28 | import org.apache.commons.cli.ParseException;
29 | import org.apache.lucene.index.DirectoryReader;
30 | import org.apache.lucene.index.IndexReader;
31 | import org.apache.lucene.index.SlowCompositeReaderWrapper;
32 | import org.apache.lucene.index.Terms;
33 | import org.apache.lucene.index.TermsEnum;
34 | import org.apache.lucene.store.FSDirectory;
35 | import org.apache.lucene.util.BytesRef;
36 |
37 | import cc.twittertools.index.IndexStatuses.StatusField;
38 |
39 | public class ExtractTermStatisticsFromIndex {
40 | private static final String INDEX_OPTION = "index";
41 | private static final String MIN_OPTION = "min";
42 |
43 | @SuppressWarnings("static-access")
44 | public static void main(String[] args) throws Exception {
45 | Options options = new Options();
46 |
47 | options.addOption(OptionBuilder.withArgName("dir").hasArg()
48 | .withDescription("index").create(INDEX_OPTION));
49 | options.addOption(OptionBuilder.withArgName("num").hasArg()
50 | .withDescription("min").create(MIN_OPTION));
51 |
52 | CommandLine cmdline = null;
53 | CommandLineParser parser = new GnuParser();
54 | try {
55 | cmdline = parser.parse(options, args);
56 | } catch (ParseException exp) {
57 | System.err.println("Error parsing command line: " + exp.getMessage());
58 | System.exit(-1);
59 | }
60 |
61 | if (!cmdline.hasOption(INDEX_OPTION)) {
62 | HelpFormatter formatter = new HelpFormatter();
63 | formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
64 | System.exit(-1);
65 | }
66 |
67 | String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
68 | int min = cmdline.hasOption(MIN_OPTION) ?
69 | Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;
70 |
71 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
72 |
73 | IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
74 | Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
75 | TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
76 |
77 | long missingCnt = 0;
78 | int skippedTerms = 0;
79 | BytesRef bytes = new BytesRef();
80 | while ( (bytes = termsEnum.next()) != null) {
81 | byte[] buf = new byte[bytes.length];
82 | System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length);
83 | String term = new String(buf, "UTF-8");
84 | int df = termsEnum.docFreq();
85 | long cf = termsEnum.totalTermFreq();
86 |
87 | if ( df < min) {
88 | skippedTerms++;
89 | missingCnt += cf;
90 | continue;
91 | }
92 |
93 | out.println(term + "\t" + df + "\t" + cf);
94 | }
95 |
96 | reader.close();
97 | out.close();
98 | System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTweetidsFromCollection.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.index;
18 |
19 | import java.io.File;
20 |
21 | import org.apache.commons.cli.CommandLine;
22 | import org.apache.commons.cli.CommandLineParser;
23 | import org.apache.commons.cli.GnuParser;
24 | import org.apache.commons.cli.HelpFormatter;
25 | import org.apache.commons.cli.OptionBuilder;
26 | import org.apache.commons.cli.Options;
27 | import org.apache.commons.cli.ParseException;
28 |
29 | import cc.twittertools.corpus.data.JsonStatusCorpusReader;
30 | import cc.twittertools.corpus.data.Status;
31 | import cc.twittertools.corpus.data.StatusStream;
32 |
33 | public class ExtractTweetidsFromCollection {
34 | private static final String COLLECTION_OPTION = "collection";
35 |
36 | @SuppressWarnings("static-access")
37 | public static void main(String[] args) throws Exception {
38 | Options options = new Options();
39 |
40 | options.addOption(OptionBuilder.withArgName("dir").hasArg()
41 | .withDescription("source collection directory").create(COLLECTION_OPTION));
42 |
43 | CommandLine cmdline = null;
44 | CommandLineParser parser = new GnuParser();
45 | try {
46 | cmdline = parser.parse(options, args);
47 | } catch (ParseException exp) {
48 | System.err.println("Error parsing command line: " + exp.getMessage());
49 | System.exit(-1);
50 | }
51 |
52 | if (!cmdline.hasOption(COLLECTION_OPTION)) {
53 | HelpFormatter formatter = new HelpFormatter();
54 | formatter.printHelp(ExtractTweetidsFromCollection.class.getName(), options);
55 | System.exit(-1);
56 | }
57 |
58 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
59 |
60 | File file = new File(collectionPath);
61 | if (!file.exists()) {
62 | System.err.println("Error: " + file + " does not exist!");
63 | System.exit(-1);
64 | }
65 |
66 | StatusStream stream = new JsonStatusCorpusReader(file);
67 |
68 | Status status;
69 | while ((status = stream.next()) != null) {
70 | System.out.println(status.getId() + "\t" + status.getScreenname());
71 | }
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/index/ExtractTweetidsFromIndex.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.index;
18 |
19 | import java.io.File;
20 | import java.io.PrintStream;
21 |
22 | import org.apache.commons.cli.CommandLine;
23 | import org.apache.commons.cli.CommandLineParser;
24 | import org.apache.commons.cli.GnuParser;
25 | import org.apache.commons.cli.HelpFormatter;
26 | import org.apache.commons.cli.OptionBuilder;
27 | import org.apache.commons.cli.Options;
28 | import org.apache.commons.cli.ParseException;
29 | import org.apache.lucene.document.Document;
30 | import org.apache.lucene.index.DirectoryReader;
31 | import org.apache.lucene.index.IndexReader;
32 | import org.apache.lucene.store.FSDirectory;
33 |
34 | import cc.twittertools.index.IndexStatuses.StatusField;
35 |
36 | /**
37 | * Reference implementation for indexing statuses.
38 | */
39 | public class ExtractTweetidsFromIndex {
40 | private ExtractTweetidsFromIndex() {}
41 |
42 | private static final String INDEX_OPTION = "index";
43 |
44 | @SuppressWarnings("static-access")
45 | public static void main(String[] args) throws Exception {
46 | Options options = new Options();
47 |
48 | options.addOption(OptionBuilder.withArgName("dir").hasArg()
49 | .withDescription("index location").create(INDEX_OPTION));
50 |
51 | CommandLine cmdline = null;
52 | CommandLineParser parser = new GnuParser();
53 | try {
54 | cmdline = parser.parse(options, args);
55 | } catch (ParseException exp) {
56 | System.err.println("Error parsing command line: " + exp.getMessage());
57 | System.exit(-1);
58 | }
59 |
60 | if (!cmdline.hasOption(INDEX_OPTION)) {
61 | HelpFormatter formatter = new HelpFormatter();
62 | formatter.printHelp(ExtractTweetidsFromIndex.class.getName(), options);
63 | System.exit(-1);
64 | }
65 |
66 | File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION));
67 | if (!indexLocation.exists()) {
68 | System.err.println("Error: " + indexLocation + " does not exist!");
69 | System.exit(-1);
70 | }
71 |
72 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation));
73 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
74 | for (int i=0; i 0);
30 | this.time = time;
31 | }
32 |
  /** Returns the topic identifier (e.g., an "MB"-prefixed TREC topic number). */
  public String getId() {
    return id;
  }
36 |
  /** Returns the query text of this topic. */
  public String getQuery() {
    return query;
  }
40 |
  /** Returns the query tweet time associated with this topic. */
  public long getQueryTweetTime() {
    return time;
  }
44 | }
45 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/TrecTopicSet.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search;
18 |
19 | import java.io.File;
20 | import java.io.IOException;
21 | import java.util.Iterator;
22 | import java.util.List;
23 | import java.util.regex.Matcher;
24 | import java.util.regex.Pattern;
25 |
26 | import com.google.common.base.Charsets;
27 | import com.google.common.base.Joiner;
28 | import com.google.common.base.Preconditions;
29 | import com.google.common.collect.Lists;
30 | import com.google.common.io.Files;
31 |
32 | public class TrecTopicSet implements Iterable{
33 | private List queries = Lists.newArrayList();
34 |
35 | private TrecTopicSet() {}
36 |
37 | private void add(TrecTopic q) {
38 | queries.add(q);
39 | }
40 |
41 | @Override
42 | public Iterator iterator() {
43 | return queries.iterator();
44 | }
45 |
46 | private static final Pattern TOP_PATTERN = Pattern.compile("", Pattern.DOTALL);
47 | private static final Pattern NUM_PATTERN = Pattern.compile(" Number: (MB\\d+) ", Pattern.DOTALL);
48 |
49 | // TREC 2011 topics uses tag
50 | private static final Pattern TITLE_PATTERN = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL);
51 | // TREC 2012 topics use tag
52 | private static final Pattern TITLE_PATTERN2 = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL);
53 |
54 | private static final Pattern TWEETTIME_PATTERN = Pattern.compile("\\s*(\\d+)\\s*", Pattern.DOTALL);
55 |
56 | public static TrecTopicSet fromFile(File f) throws IOException {
57 | Preconditions.checkNotNull(f);
58 | Preconditions.checkArgument(f.exists());
59 |
60 | String s = Joiner.on("\n").join(Files.readLines(f, Charsets.UTF_8));
61 | TrecTopicSet queries = new TrecTopicSet();
62 |
63 | Matcher matcher = TOP_PATTERN.matcher(s);
64 | while (matcher.find()) {
65 | String top = matcher.group(0);
66 |
67 | Matcher m = NUM_PATTERN.matcher(top);
68 | if (!m.find()) {
69 | throw new IOException("Error parsing " + f);
70 | }
71 | String id = m.group(1);
72 | // Topics from 2012 are inconsistently numbered,
73 | // e.g., MB051 should match the qrels, which has MB51
74 | if (id.matches("MB0\\d\\d")) {
75 | id = id.replace("MB0", "MB");
76 | }
77 |
78 | m = TITLE_PATTERN.matcher(top);
79 | if (!m.find()) {
80 | m = TITLE_PATTERN2.matcher(top);
81 | if (!m.find()) {
82 | throw new IOException("Error parsing " + f);
83 | }
84 | }
85 | String text = m.group(1);
86 |
87 | m = TWEETTIME_PATTERN.matcher(top);
88 | if (!m.find()) {
89 | throw new IOException("Error parsing " + f);
90 | }
91 | long time = Long.parseLong(m.group(1));
92 | queries.add(new TrecTopic(id, text, time));
93 | }
94 | return queries;
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/api/RunQueriesThrift.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search.api;
18 |
19 | import java.io.File;
20 | import java.io.PrintStream;
21 | import java.util.List;
22 | import java.util.Set;
23 | import java.util.HashSet;
24 |
25 | import org.apache.commons.cli.CommandLine;
26 | import org.apache.commons.cli.CommandLineParser;
27 | import org.apache.commons.cli.GnuParser;
28 | import org.apache.commons.cli.HelpFormatter;
29 | import org.apache.commons.cli.Option;
30 | import org.apache.commons.cli.OptionBuilder;
31 | import org.apache.commons.cli.Options;
32 | import org.apache.commons.cli.ParseException;
33 |
34 | import cc.twittertools.search.TrecTopicSet;
35 | import cc.twittertools.thrift.gen.TResult;
36 |
37 | public class RunQueriesThrift {
38 | private static final String DEFAULT_RUNTAG = "lucene4lm";
39 |
40 | private static final String HOST_OPTION = "host";
41 | private static final String PORT_OPTION = "port";
42 | private static final String QUERIES_OPTION = "queries";
43 | private static final String NUM_RESULTS_OPTION = "num_results";
44 | private static final String GROUP_OPTION = "group";
45 | private static final String TOKEN_OPTION = "token";
46 | private static final String RUNTAG_OPTION = "runtag";
47 | private static final String VERBOSE_OPTION = "verbose";
48 |
49 | private RunQueriesThrift() {}
50 |
51 | @SuppressWarnings("static-access")
52 | public static void main(String[] args) throws Exception {
53 | Options options = new Options();
54 |
55 | options.addOption(OptionBuilder.withArgName("string").hasArg()
56 | .withDescription("host").create(HOST_OPTION));
57 | options.addOption(OptionBuilder.withArgName("port").hasArg()
58 | .withDescription("port").create(PORT_OPTION));
59 | options.addOption(OptionBuilder.withArgName("file").hasArg()
60 | .withDescription("file containing topics in TREC format").create(QUERIES_OPTION));
61 | options.addOption(OptionBuilder.withArgName("num").hasArg()
62 | .withDescription("number of results to return").create(NUM_RESULTS_OPTION));
63 | options.addOption(OptionBuilder.withArgName("string").hasArg()
64 | .withDescription("group id").create(GROUP_OPTION));
65 | options.addOption(OptionBuilder.withArgName("string").hasArg()
66 | .withDescription("access token").create(TOKEN_OPTION));
67 | options.addOption(OptionBuilder.withArgName("string").hasArg()
68 | .withDescription("runtag").create(RUNTAG_OPTION));
69 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));
70 |
71 | CommandLine cmdline = null;
72 | CommandLineParser parser = new GnuParser();
73 | try {
74 | cmdline = parser.parse(options, args);
75 | } catch (ParseException exp) {
76 | System.err.println("Error parsing command line: " + exp.getMessage());
77 | System.exit(-1);
78 | }
79 |
80 | if (!cmdline.hasOption(HOST_OPTION) || !cmdline.hasOption(PORT_OPTION)
81 | || !cmdline.hasOption(QUERIES_OPTION)) {
82 | HelpFormatter formatter = new HelpFormatter();
83 | formatter.printHelp(RunQueriesThrift.class.getName(), options);
84 | System.exit(-1);
85 | }
86 |
87 | String queryFile = cmdline.getOptionValue(QUERIES_OPTION);
88 | if (!new File(queryFile).exists()) {
89 | System.err.println("Error: " + queryFile + " doesn't exist!");
90 | System.exit(-1);
91 | }
92 |
93 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ?
94 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;
95 |
96 | TrecTopicSet topicsFile = TrecTopicSet.fromFile(new File(queryFile));
97 |
98 | int numResults = 1000;
99 | try {
100 | if (cmdline.hasOption(NUM_RESULTS_OPTION)) {
101 | numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION));
102 | }
103 | } catch (NumberFormatException e) {
104 | System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION));
105 | System.exit(-1);
106 | }
107 |
108 | String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null;
109 | String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null;
110 |
111 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION);
112 |
113 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
114 |
115 | TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION),
116 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token);
117 |
118 | for (cc.twittertools.search.TrecTopic query : topicsFile) {
119 | List results = client.search(query.getQuery(),
120 | query.getQueryTweetTime(), numResults);
121 | int i = 1;
122 | Set tweetIds = new HashSet();
123 | for (TResult result : results) {
124 | if (!tweetIds.contains(result.id)) {
125 | // The TREC official qrels don't have the "MB" prefix and trailing zeros, so we perform
126 | // this transformation so that trec_eval doesn't complain.
127 | String qid = query.getId().replaceFirst("^MB0*", "");
128 | tweetIds.add(result.id);
129 | out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag));
130 | if (verbose) {
131 | out.println("# " + result.toString().replaceAll("[\\n\\r]+", " "));
132 | }
133 | i++;
134 | }
135 | }
136 | }
137 | out.close();
138 | }
139 | }
140 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/api/SearchStatusesThrift.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search.api;
18 |
19 | import java.io.PrintStream;
20 | import java.util.List;
21 |
22 | import org.apache.commons.cli.CommandLine;
23 | import org.apache.commons.cli.CommandLineParser;
24 | import org.apache.commons.cli.GnuParser;
25 | import org.apache.commons.cli.HelpFormatter;
26 | import org.apache.commons.cli.Option;
27 | import org.apache.commons.cli.OptionBuilder;
28 | import org.apache.commons.cli.Options;
29 | import org.apache.commons.cli.ParseException;
30 |
31 | import cc.twittertools.thrift.gen.TResult;
32 |
33 | public class SearchStatusesThrift {
34 | // Defaults: if user doesn't specify an actual query, run MB01 as a demo.
35 | private static final String DEFAULT_QID = "MB01";
36 | private static final String DEFAULT_Q = "BBC World Service staff cuts";
37 | private static final long DEFAULT_MAX_ID = 34952194402811905L;
38 | private static final int DEFAULT_NUM_RESULTS = 10;
39 | private static final String DEFAULT_RUNTAG = "lucene4lm";
40 |
41 | private static final String HELP_OPTION = "h";
42 | private static final String HOST_OPTION = "host";
43 | private static final String PORT_OPTION = "port";
44 | private static final String QID_OPTION = "qid";
45 | private static final String QUERY_OPTION = "q";
46 | private static final String RUNTAG_OPTION = "runtag";
47 | private static final String MAX_ID_OPTION = "max_id";
48 | private static final String NUM_RESULTS_OPTION = "num_results";
49 | private static final String GROUP_OPTION = "group";
50 | private static final String TOKEN_OPTION = "token";
51 | private static final String VERBOSE_OPTION = "verbose";
52 |
53 | @SuppressWarnings("static-access")
54 | public static void main(String[] args) throws Exception {
55 | Options options = new Options();
56 |
57 | options.addOption(new Option(HELP_OPTION, "show help"));
58 | options.addOption(OptionBuilder.withArgName("string").hasArg()
59 | .withDescription("host").create(HOST_OPTION));
60 | options.addOption(OptionBuilder.withArgName("port").hasArg()
61 | .withDescription("port").create(PORT_OPTION));
62 | options.addOption(OptionBuilder.withArgName("string").hasArg()
63 | .withDescription("query id").create(QID_OPTION));
64 | options.addOption(OptionBuilder.withArgName("string").hasArg()
65 | .withDescription("query text").create(QUERY_OPTION));
66 | options.addOption(OptionBuilder.withArgName("string").hasArg()
67 | .withDescription("runtag").create(RUNTAG_OPTION));
68 | options.addOption(OptionBuilder.withArgName("num").hasArg()
69 | .withDescription("maxid").create(MAX_ID_OPTION));
70 | options.addOption(OptionBuilder.withArgName("num").hasArg()
71 | .withDescription("number of results to return").create(NUM_RESULTS_OPTION));
72 | options.addOption(OptionBuilder.withArgName("string").hasArg()
73 | .withDescription("group id").create(GROUP_OPTION));
74 | options.addOption(OptionBuilder.withArgName("string").hasArg()
75 | .withDescription("access token").create(TOKEN_OPTION));
76 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));
77 |
78 | CommandLine cmdline = null;
79 | CommandLineParser parser = new GnuParser();
80 | try {
81 | cmdline = parser.parse(options, args);
82 | } catch (ParseException exp) {
83 | System.err.println("Error parsing command line: " + exp.getMessage());
84 | System.exit(-1);
85 | }
86 |
87 | if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(HOST_OPTION)
88 | || !cmdline.hasOption(PORT_OPTION)) {
89 | HelpFormatter formatter = new HelpFormatter();
90 | formatter.printHelp(SearchStatusesThrift.class.getName(), options);
91 | System.exit(-1);
92 | }
93 |
94 | String qid = cmdline.hasOption(QID_OPTION) ?
95 | cmdline.getOptionValue(QID_OPTION) : DEFAULT_QID;
96 | String query = cmdline.hasOption(QUERY_OPTION) ?
97 | cmdline.getOptionValue(QUERY_OPTION) : DEFAULT_Q;
98 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ?
99 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;
100 | long maxId = cmdline.hasOption(MAX_ID_OPTION) ?
101 | Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)) : DEFAULT_MAX_ID;
102 | int numResults = cmdline.hasOption(NUM_RESULTS_OPTION) ?
103 | Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)) : DEFAULT_NUM_RESULTS;
104 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION);
105 |
106 | String group = cmdline.hasOption(GROUP_OPTION) ? cmdline.getOptionValue(GROUP_OPTION) : null;
107 | String token = cmdline.hasOption(TOKEN_OPTION) ? cmdline.getOptionValue(TOKEN_OPTION) : null;
108 | TrecSearchThriftClient client = new TrecSearchThriftClient(cmdline.getOptionValue(HOST_OPTION),
109 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)), group, token);
110 |
111 | System.err.println("qid: " + qid);
112 | System.err.println("q: " + query);
113 | System.err.println("max_id: " + maxId);
114 | System.err.println("num_results: " + numResults);
115 |
116 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
117 |
118 | List results = client.search(query, maxId, numResults);
119 | int i = 1;
120 | for (TResult result : results) {
121 | out.println(String.format("%s Q0 %d %d %f %s", qid, result.id, i, result.rsv, runtag));
122 | if (verbose) {
123 | System.out.println("# " + result.toString().replaceAll("[\\n\\r]+", " "));
124 | }
125 | i++;
126 | }
127 | out.close();
128 | }
129 | }
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/api/TResultComparable.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search.api;
18 |
19 | import cc.twittertools.thrift.gen.TResult;
20 |
21 | public class TResultComparable implements Comparable {
22 | private TResult tresult;
23 |
24 | public TResultComparable(TResult tresult) {
25 | this.tresult = tresult;
26 | }
27 |
28 | public TResult getTResult() {
29 | return tresult;
30 | }
31 |
32 | public int compareTo(TResultComparable other) {
33 | if (tresult.rsv > other.tresult.rsv) {
34 | return -1;
35 | } else if (tresult.rsv < other.tresult.rsv) {
36 | return 1;
37 | } else {
38 | if (tresult.id > other.tresult.id) {
39 | return -1;
40 | } else if (tresult.id < other.tresult.id) {
41 | return 1;
42 | } else {
43 | return 0;
44 | }
45 | }
46 | }
47 |
48 | public boolean equals(Object other) {
49 | if (other == null) {
50 | return false;
51 | } if (other.getClass() != this.getClass()) {
52 | return false;
53 | }
54 |
55 | return ((TResultComparable) other).tresult.id == this.tresult.id;
56 | }
57 | }
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchHandler.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search.api;
18 |
19 | import java.io.File;
20 | import java.io.IOException;
21 | import java.util.List;
22 | import java.util.Map;
23 |
24 | import javax.annotation.Nullable;
25 |
26 | import org.apache.log4j.Logger;
27 | import org.apache.lucene.document.Document;
28 | import org.apache.lucene.index.DirectoryReader;
29 | import org.apache.lucene.index.IndexReader;
30 | import org.apache.lucene.queryparser.classic.QueryParser;
31 | import org.apache.lucene.search.Filter;
32 | import org.apache.lucene.search.IndexSearcher;
33 | import org.apache.lucene.search.NumericRangeFilter;
34 | import org.apache.lucene.search.Query;
35 | import org.apache.lucene.search.ScoreDoc;
36 | import org.apache.lucene.search.TopDocs;
37 | import org.apache.lucene.search.similarities.LMDirichletSimilarity;
38 | import org.apache.lucene.store.FSDirectory;
39 | import org.apache.lucene.util.Version;
40 |
41 | import cc.twittertools.index.IndexStatuses;
42 | import cc.twittertools.index.IndexStatuses.StatusField;
43 | import cc.twittertools.thrift.gen.TQuery;
44 | import cc.twittertools.thrift.gen.TResult;
45 | import cc.twittertools.thrift.gen.TrecSearch;
46 | import cc.twittertools.thrift.gen.TrecSearchException;
47 |
48 | import com.google.common.base.Preconditions;
49 | import com.google.common.collect.Lists;
50 |
51 | public class TrecSearchHandler implements TrecSearch.Iface {
52 | private static final Logger LOG = Logger.getLogger(TrecSearchHandler.class);
53 |
54 | private static QueryParser QUERY_PARSER =
55 | new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER);
56 |
57 | private final IndexSearcher searcher;
58 | private final Map credentials;
59 |
60 | public TrecSearchHandler(File indexPath, @Nullable Map credentials)
61 | throws IOException {
62 | Preconditions.checkNotNull(indexPath);
63 | Preconditions.checkArgument(indexPath.exists());
64 |
65 | // Can be null, in which case we don't check for credentials.
66 | this.credentials = credentials;
67 |
68 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
69 | searcher = new IndexSearcher(reader);
70 | searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
71 | }
72 |
73 | public List search(TQuery query) throws TrecSearchException {
74 | Preconditions.checkNotNull(query);
75 |
76 | LOG.info(String.format("Incoming request (%s, %s)", query.group, query.token));
77 |
78 | // Verify credentials.
79 | if (credentials != null && (!credentials.containsKey(query.group) ||
80 | !credentials.get(query.group).equals(query.token))) {
81 | LOG.info(String.format("Access denied for (%s, %s)", query.group, query.token));
82 | throw new TrecSearchException("Invalid credentials: access denied.");
83 | }
84 |
85 | List results = Lists.newArrayList();
86 | long startTime = System.currentTimeMillis();
87 |
88 | try {
89 | Filter filter =
90 | NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, query.max_id, true, true);
91 |
92 | Query q = QUERY_PARSER.parse(query.text);
93 | int num = query.num_results > 10000 ? 10000 : query.num_results;
94 | TopDocs rs = searcher.search(q, filter, num);
95 | for (ScoreDoc scoreDoc : rs.scoreDocs) {
96 | Document hit = searcher.doc(scoreDoc.doc);
97 |
98 | TResult p = new TResult();
99 | p.id = (Long) hit.getField(StatusField.ID.name).numericValue();
100 | p.screen_name = hit.get(StatusField.SCREEN_NAME.name);
101 | p.epoch = (Long) hit.getField(StatusField.EPOCH.name).numericValue();
102 | p.text = hit.get(StatusField.TEXT.name);
103 | p.rsv = scoreDoc.score;
104 |
105 | p.followers_count = (Integer) hit.getField(StatusField.FOLLOWERS_COUNT.name).numericValue();
106 | p.statuses_count = (Integer) hit.getField(StatusField.STATUSES_COUNT.name).numericValue();
107 |
108 | if ( hit.get(StatusField.LANG.name) != null) {
109 | p.lang = hit.get(StatusField.LANG.name);
110 | }
111 |
112 | if ( hit.get(StatusField.IN_REPLY_TO_STATUS_ID.name) != null) {
113 | p.in_reply_to_status_id = (Long) hit.getField(StatusField.IN_REPLY_TO_STATUS_ID.name).numericValue();
114 | }
115 |
116 | if ( hit.get(StatusField.IN_REPLY_TO_USER_ID.name) != null) {
117 | p.in_reply_to_user_id = (Long) hit.getField(StatusField.IN_REPLY_TO_USER_ID.name).numericValue();
118 | }
119 |
120 | if ( hit.get(StatusField.RETWEETED_STATUS_ID.name) != null) {
121 | p.retweeted_status_id = (Long) hit.getField(StatusField.RETWEETED_STATUS_ID.name).numericValue();
122 | }
123 |
124 | if ( hit.get(StatusField.RETWEETED_USER_ID.name) != null) {
125 | p.retweeted_user_id = (Long) hit.getField(StatusField.RETWEETED_USER_ID.name).numericValue();
126 | }
127 |
128 | if ( hit.get(StatusField.RETWEET_COUNT.name) != null) {
129 | p.retweeted_count = (Integer) hit.getField(StatusField.RETWEET_COUNT.name).numericValue();
130 | }
131 |
132 | results.add(p);
133 | }
134 | } catch (Exception e) {
135 | e.printStackTrace();
136 | throw new TrecSearchException(e.getMessage());
137 | }
138 |
139 | long endTime = System.currentTimeMillis();
140 | LOG.info(String.format("%4dms %s", (endTime - startTime), query.toString()));
141 |
142 | return results;
143 | }
144 | }
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchThriftClient.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search.api;
18 |
19 | import java.util.List;
20 |
21 | import javax.annotation.Nullable;
22 |
23 | import org.apache.thrift.TException;
24 | import org.apache.thrift.protocol.TBinaryProtocol;
25 | import org.apache.thrift.transport.TSocket;
26 | import org.apache.thrift.transport.TTransport;
27 |
28 | import cc.twittertools.thrift.gen.TQuery;
29 | import cc.twittertools.thrift.gen.TResult;
30 | import cc.twittertools.thrift.gen.TrecSearch;
31 |
32 | import com.google.common.base.Preconditions;
33 |
34 | public class TrecSearchThriftClient {
35 | private final String group;
36 | private final String token;
37 | private final String host;
38 | private final int port;
39 |
40 | public TrecSearchThriftClient(String host, int port,
41 | @Nullable String group, @Nullable String token) {
42 | Preconditions.checkNotNull(host);
43 | Preconditions.checkArgument(port > 0);
44 | this.group = group;
45 | this.token = token;
46 | this.host= host;
47 | this.port = port;
48 | }
49 |
50 | public List search(String query, long maxId, int numResults) throws TException {
51 | TTransport transport = new TSocket(host, port);
52 | transport.open();
53 |
54 | TrecSearch.Client client = new TrecSearch.Client(new TBinaryProtocol(transport));
55 |
56 | TQuery q = new TQuery();
57 | q.text = query;
58 | q.max_id = maxId;
59 | q.num_results = numResults;
60 |
61 | q.group = group;
62 | q.token = token;
63 |
64 | List results = client.search(q);
65 | transport.close();
66 |
67 | return results;
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/api/TrecSearchThriftServer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search.api;
18 |
19 | import java.io.File;
20 | import java.util.Map;
21 |
22 | import org.apache.commons.cli.CommandLine;
23 | import org.apache.commons.cli.CommandLineParser;
24 | import org.apache.commons.cli.GnuParser;
25 | import org.apache.commons.cli.HelpFormatter;
26 | import org.apache.commons.cli.Option;
27 | import org.apache.commons.cli.OptionBuilder;
28 | import org.apache.commons.cli.Options;
29 | import org.apache.commons.cli.ParseException;
30 | import org.apache.thrift.protocol.TBinaryProtocol;
31 | import org.apache.thrift.server.TServer;
32 | import org.apache.thrift.server.TThreadPoolServer;
33 | import org.apache.thrift.transport.TServerSocket;
34 |
35 | import cc.twittertools.thrift.gen.TrecSearch;
36 |
37 | import com.google.common.base.Charsets;
38 | import com.google.common.collect.Maps;
39 | import com.google.common.io.Files;
40 |
41 | public class TrecSearchThriftServer {
42 | private static final int DEFAULT_PORT = 9090;
43 | private static final int DEFAULT_MAX_THREADS = 8;
44 |
45 | private static final String HELP_OPTION = "h";
46 | private static final String INDEX_OPTION = "index";
47 | private static final String PORT_OPTION = "port";
48 | private static final String MAX_THREADS_OPTION = "max_threads";
49 | private static final String CREDENTIALS_OPTION = "credentials";
50 |
51 | @SuppressWarnings("static-access")
52 | public static void main(String[] args) throws Exception {
53 | Options options = new Options();
54 |
55 | options.addOption(new Option(HELP_OPTION, "show help"));
56 | options.addOption(OptionBuilder.withArgName("port").hasArg()
57 | .withDescription("port").create(PORT_OPTION));
58 | options.addOption(OptionBuilder.withArgName("index").hasArg()
59 | .withDescription("index location").create(INDEX_OPTION));
60 | options.addOption(OptionBuilder.withArgName("num").hasArg()
61 | .withDescription("max number of threads in thread pool").create(MAX_THREADS_OPTION));
62 | options.addOption(OptionBuilder.withArgName("file").hasArg()
63 | .withDescription("file containing access tokens").create(CREDENTIALS_OPTION));
64 |
65 | CommandLine cmdline = null;
66 | CommandLineParser parser = new GnuParser();
67 | try {
68 | cmdline = parser.parse(options, args);
69 | } catch (ParseException exp) {
70 | System.err.println("Error parsing command line: " + exp.getMessage());
71 | System.exit(-1);
72 | }
73 |
74 | if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
75 | HelpFormatter formatter = new HelpFormatter();
76 | formatter.printHelp(TrecSearchThriftServer.class.getName(), options);
77 | System.exit(-1);
78 | }
79 |
80 | int port = cmdline.hasOption(PORT_OPTION) ?
81 | Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)) : DEFAULT_PORT;
82 | int maxThreads = cmdline.hasOption(MAX_THREADS_OPTION) ?
83 | Integer.parseInt(cmdline.getOptionValue(MAX_THREADS_OPTION)) : DEFAULT_MAX_THREADS;
84 | File index = new File(cmdline.getOptionValue(INDEX_OPTION));
85 |
86 | Map credentials = null;
87 | if (cmdline.hasOption(CREDENTIALS_OPTION)) {
88 | credentials = Maps.newHashMap();
89 | File cfile = new File(cmdline.getOptionValue(CREDENTIALS_OPTION));
90 | if (!cfile.exists()) {
91 | System.err.println("Error: " + cfile + " does not exist!");
92 | System.exit(-1);
93 | }
94 | for (String s : Files.readLines(cfile, Charsets.UTF_8)) {
95 | try {
96 | String[] arr = s.split(":");
97 | credentials.put(arr[0], arr[1]);
98 | } catch (Exception e){
99 | // Catch any exceptions from parsing file contain access tokens
100 | System.err.println("Error reading access tokens from " + cfile + "!");
101 | System.exit(-1);
102 | }
103 | }
104 | }
105 |
106 | if (!index.exists()) {
107 | System.err.println("Error: " + index + " does not exist!");
108 | System.exit(-1);
109 | }
110 |
111 | TServerSocket serverSocket = new TServerSocket(port);
112 | TrecSearch.Processor searchProcessor =
113 | new TrecSearch.Processor(new TrecSearchHandler(index, credentials));
114 |
115 | TThreadPoolServer.Args serverArgs = new TThreadPoolServer.Args(serverSocket);
116 | serverArgs.maxWorkerThreads(maxThreads);
117 | TServer thriftServer = new TThreadPoolServer(serverArgs.processor(searchProcessor)
118 | .protocolFactory(new TBinaryProtocol.Factory()));
119 |
120 | thriftServer.serve();
121 | }
122 | }
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/search/local/RunQueries.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.search.local;
18 |
19 | import java.io.File;
20 | import java.io.PrintStream;
21 |
22 | import org.apache.commons.cli.CommandLine;
23 | import org.apache.commons.cli.CommandLineParser;
24 | import org.apache.commons.cli.GnuParser;
25 | import org.apache.commons.cli.HelpFormatter;
26 | import org.apache.commons.cli.Option;
27 | import org.apache.commons.cli.OptionBuilder;
28 | import org.apache.commons.cli.Options;
29 | import org.apache.commons.cli.ParseException;
30 | import org.apache.lucene.document.Document;
31 | import org.apache.lucene.index.DirectoryReader;
32 | import org.apache.lucene.index.IndexReader;
33 | import org.apache.lucene.queryparser.classic.QueryParser;
34 | import org.apache.lucene.search.Filter;
35 | import org.apache.lucene.search.IndexSearcher;
36 | import org.apache.lucene.search.NumericRangeFilter;
37 | import org.apache.lucene.search.Query;
38 | import org.apache.lucene.search.ScoreDoc;
39 | import org.apache.lucene.search.TopDocs;
40 | import org.apache.lucene.search.similarities.BM25Similarity;
41 | import org.apache.lucene.search.similarities.LMDirichletSimilarity;
42 | import org.apache.lucene.store.FSDirectory;
43 | import org.apache.lucene.util.Version;
44 |
45 | import cc.twittertools.index.IndexStatuses;
46 | import cc.twittertools.index.IndexStatuses.StatusField;
47 | import cc.twittertools.search.TrecTopic;
48 | import cc.twittertools.search.TrecTopicSet;
49 |
50 | public class RunQueries {
51 | private static final String DEFAULT_RUNTAG = "lucene4lm";
52 |
53 | private static final String INDEX_OPTION = "index";
54 | private static final String QUERIES_OPTION = "queries";
55 | private static final String NUM_RESULTS_OPTION = "num_results";
56 | private static final String SIMILARITY_OPTION = "similarity";
57 | private static final String RUNTAG_OPTION = "runtag";
58 | private static final String VERBOSE_OPTION = "verbose";
59 |
60 | private RunQueries() {}
61 |
62 | @SuppressWarnings("static-access")
63 | public static void main(String[] args) throws Exception {
64 | Options options = new Options();
65 |
66 | options.addOption(OptionBuilder.withArgName("path").hasArg()
67 | .withDescription("index location").create(INDEX_OPTION));
68 | options.addOption(OptionBuilder.withArgName("num").hasArg()
69 | .withDescription("number of results to return").create(NUM_RESULTS_OPTION));
70 | options.addOption(OptionBuilder.withArgName("file").hasArg()
71 | .withDescription("file containing topics in TREC format").create(QUERIES_OPTION));
72 | options.addOption(OptionBuilder.withArgName("similarity").hasArg()
73 | .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION));
74 | options.addOption(OptionBuilder.withArgName("string").hasArg()
75 | .withDescription("runtag").create(RUNTAG_OPTION));
76 | options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));
77 |
78 | CommandLine cmdline = null;
79 | CommandLineParser parser = new GnuParser();
80 | try {
81 | cmdline = parser.parse(options, args);
82 | } catch (ParseException exp) {
83 | System.err.println("Error parsing command line: " + exp.getMessage());
84 | System.exit(-1);
85 | }
86 |
87 | if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
88 | HelpFormatter formatter = new HelpFormatter();
89 | formatter.printHelp(RunQueries.class.getName(), options);
90 | System.exit(-1);
91 | }
92 |
93 | File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION));
94 | if (!indexLocation.exists()) {
95 | System.err.println("Error: " + indexLocation + " does not exist!");
96 | System.exit(-1);
97 | }
98 |
99 | String runtag = cmdline.hasOption(RUNTAG_OPTION) ?
100 | cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG;
101 |
102 | String topicsFile = cmdline.getOptionValue(QUERIES_OPTION);
103 |
104 | int numResults = 1000;
105 | try {
106 | if (cmdline.hasOption(NUM_RESULTS_OPTION)) {
107 | numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION));
108 | }
109 | } catch (NumberFormatException e) {
110 | System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION));
111 | System.exit(-1);
112 | }
113 |
114 | String similarity = "LM";
115 | if (cmdline.hasOption(SIMILARITY_OPTION)) {
116 | similarity = cmdline.getOptionValue(SIMILARITY_OPTION);
117 | }
118 |
119 | boolean verbose = cmdline.hasOption(VERBOSE_OPTION);
120 |
121 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
122 |
123 | IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation));
124 | IndexSearcher searcher = new IndexSearcher(reader);
125 |
126 | if (similarity.equalsIgnoreCase("BM25")) {
127 | searcher.setSimilarity(new BM25Similarity());
128 | } else if (similarity.equalsIgnoreCase("LM")) {
129 | searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
130 | }
131 |
132 | QueryParser p = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name,
133 | IndexStatuses.ANALYZER);
134 |
135 | TrecTopicSet topics = TrecTopicSet.fromFile(new File(topicsFile));
136 | for ( TrecTopic topic : topics ) {
137 | Query query = p.parse(topic.getQuery());
138 | Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L,
139 | topic.getQueryTweetTime(), true, true);
140 |
141 | TopDocs rs = searcher.search(query, filter, numResults);
142 |
143 | int i = 1;
144 | for (ScoreDoc scoreDoc : rs.scoreDocs) {
145 | Document hit = searcher.doc(scoreDoc.doc);
146 | out.println(String.format("%s Q0 %s %d %f %s", topic.getId(),
147 | hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag));
148 | if ( verbose) {
149 | out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " "));
150 | }
151 | i++;
152 | }
153 | }
154 | reader.close();
155 | out.close();
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/stream/GatherStatusStream.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.stream;
18 |
19 | import org.apache.log4j.ConsoleAppender;
20 | import org.apache.log4j.Level;
21 | import org.apache.log4j.Logger;
22 | import org.apache.log4j.PatternLayout;
23 | import org.apache.log4j.rolling.RollingFileAppender;
24 | import org.apache.log4j.rolling.TimeBasedRollingPolicy;
25 | import org.apache.log4j.varia.LevelRangeFilter;
26 |
27 | import twitter4j.RawStreamListener;
28 | import twitter4j.TwitterException;
29 | import twitter4j.TwitterStream;
30 | import twitter4j.TwitterStreamFactory;
31 |
32 | public final class GatherStatusStream {
33 | private static int cnt = 0;
34 |
35 | @SuppressWarnings("unused")
36 | private static final String MINUTE_ROLL = ".%d{yyyy-MM-dd-HH-mm}.gz";
37 | private static final String HOUR_ROLL = ".%d{yyyy-MM-dd-HH}.gz";
38 |
39 | public static void main(String[] args) throws TwitterException {
40 | PatternLayout layoutStandard = new PatternLayout();
41 | layoutStandard.setConversionPattern("[%p] %d %c %M - %m%n");
42 |
43 | PatternLayout layoutSimple = new PatternLayout();
44 | layoutSimple.setConversionPattern("%m%n");
45 |
46 | // Filter for the statuses: we only want INFO messages
47 | LevelRangeFilter filter = new LevelRangeFilter();
48 | filter.setLevelMax(Level.INFO);
49 | filter.setLevelMin(Level.INFO);
50 | filter.setAcceptOnMatch(true);
51 | filter.activateOptions();
52 |
53 | TimeBasedRollingPolicy statusesRollingPolicy = new TimeBasedRollingPolicy();
54 | statusesRollingPolicy.setFileNamePattern("statuses.log" + HOUR_ROLL);
55 | statusesRollingPolicy.activateOptions();
56 |
57 | RollingFileAppender statusesAppender = new RollingFileAppender();
58 | statusesAppender.setRollingPolicy(statusesRollingPolicy);
59 | statusesAppender.addFilter(filter);
60 | statusesAppender.setLayout(layoutSimple);
61 | statusesAppender.activateOptions();
62 |
63 | TimeBasedRollingPolicy warningsRollingPolicy = new TimeBasedRollingPolicy();
64 | warningsRollingPolicy.setFileNamePattern("warnings.log" + HOUR_ROLL);
65 | warningsRollingPolicy.activateOptions();
66 |
67 | RollingFileAppender warningsAppender = new RollingFileAppender();
68 | warningsAppender.setRollingPolicy(statusesRollingPolicy);
69 | warningsAppender.setThreshold(Level.WARN);
70 | warningsAppender.setLayout(layoutStandard);
71 | warningsAppender.activateOptions();
72 |
73 | ConsoleAppender consoleAppender = new ConsoleAppender();
74 | consoleAppender.setThreshold(Level.WARN);
75 | consoleAppender.setLayout(layoutStandard);
76 | consoleAppender.activateOptions();
77 |
78 | // configures the root logger
79 | Logger rootLogger = Logger.getRootLogger();
80 | rootLogger.setLevel(Level.INFO);
81 | rootLogger.removeAllAppenders();
82 | rootLogger.addAppender(consoleAppender);
83 | rootLogger.addAppender(statusesAppender);
84 | rootLogger.addAppender(warningsAppender);
85 |
86 | // creates a custom logger and log messages
87 | final Logger logger = Logger.getLogger(GatherStatusStream.class);
88 |
89 | TwitterStream twitterStream = new TwitterStreamFactory().getInstance();
90 | RawStreamListener rawListener = new RawStreamListener() {
91 |
92 | @Override
93 | public void onMessage(String rawString) {
94 | cnt++;
95 | logger.info(rawString);
96 | if (cnt % 1000 == 0) {
97 | System.out.println(cnt + " messages received.");
98 | }
99 | }
100 |
101 | @Override
102 | public void onException(Exception ex) {
103 | logger.warn(ex);
104 | }
105 |
106 | };
107 |
108 | twitterStream.addListener(rawListener);
109 | twitterStream.sample();
110 | }
111 | }
112 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/util/ExtractSubcollection.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.util;
18 |
19 | import it.unimi.dsi.fastutil.longs.LongIterator;
20 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
21 |
22 | import java.io.BufferedReader;
23 | import java.io.BufferedWriter;
24 | import java.io.File;
25 | import java.io.FileInputStream;
26 | import java.io.FileOutputStream;
27 | import java.io.InputStreamReader;
28 | import java.io.OutputStreamWriter;
29 | import java.io.Writer;
30 |
31 | import org.apache.commons.cli.CommandLine;
32 | import org.apache.commons.cli.CommandLineParser;
33 | import org.apache.commons.cli.GnuParser;
34 | import org.apache.commons.cli.HelpFormatter;
35 | import org.apache.commons.cli.OptionBuilder;
36 | import org.apache.commons.cli.Options;
37 | import org.apache.commons.cli.ParseException;
38 | import org.apache.log4j.Logger;
39 |
40 | import cc.twittertools.corpus.data.JsonStatusCorpusReader;
41 | import cc.twittertools.corpus.data.Status;
42 | import cc.twittertools.corpus.data.StatusStream;
43 |
44 | public class ExtractSubcollection {
45 | private static final Logger LOG = Logger.getLogger(ExtractSubcollection.class);
46 |
47 | private static final String COLLECTION_OPTION = "collection";
48 | private static final String ID_OPTION = "tweetids";
49 | private static final String OUTPUT_OPTION = "output";
50 | private static final String MISSING_OPTION = "missing";
51 |
52 | @SuppressWarnings("static-access")
53 | public static void main(String[] args) throws Exception {
54 | Options options = new Options();
55 |
56 | options.addOption(OptionBuilder.withArgName("dir").hasArg()
57 | .withDescription("source collection directory").create(COLLECTION_OPTION));
58 | options.addOption(OptionBuilder.withArgName("file").hasArg()
59 | .withDescription("list of tweetids").create(ID_OPTION));
60 | options.addOption(OptionBuilder.withArgName("file").hasArg()
61 | .withDescription("output JSON").create(OUTPUT_OPTION));
62 | options.addOption(OptionBuilder.withArgName("file").hasArg()
63 | .withDescription("file to store missing tweeids").create(MISSING_OPTION));
64 |
65 | CommandLine cmdline = null;
66 | CommandLineParser parser = new GnuParser();
67 | try {
68 | cmdline = parser.parse(options, args);
69 | } catch (ParseException exp) {
70 | System.err.println("Error parsing command line: " + exp.getMessage());
71 | System.exit(-1);
72 | }
73 |
74 | if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION) ||
75 | !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MISSING_OPTION)) {
76 | HelpFormatter formatter = new HelpFormatter();
77 | formatter.printHelp(ExtractSubcollection.class.getName(), options);
78 | System.exit(-1);
79 | }
80 |
81 | String outputFile = cmdline.getOptionValue(OUTPUT_OPTION);
82 | String missingFile = cmdline.getOptionValue(MISSING_OPTION);
83 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
84 |
85 | LongOpenHashSet tweetids = new LongOpenHashSet();
86 | File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION));
87 | if (!tweetidsFile.exists()) {
88 | System.err.println("Error: " + tweetidsFile + " does not exist!");
89 | System.exit(-1);
90 | }
91 | LOG.info("Reading tweetids from " + tweetidsFile);
92 |
93 | FileInputStream fin = new FileInputStream(tweetidsFile);
94 | BufferedReader br = new BufferedReader(new InputStreamReader(fin));
95 |
96 | String s;
97 | while ((s = br.readLine()) != null) {
98 | tweetids.add(Long.parseLong(s));
99 | }
100 | br.close();
101 | fin.close();
102 | LOG.info("Read " + tweetids.size() + " tweetids.");
103 |
104 | File file = new File(collectionPath);
105 | if (!file.exists()) {
106 | System.err.println("Error: " + file + " does not exist!");
107 | System.exit(-1);
108 | }
109 |
110 | // Store tweet ids we've already seen to dedup.
111 | LongOpenHashSet seen = new LongOpenHashSet();
112 |
113 | Writer out = new BufferedWriter(new OutputStreamWriter(
114 | new FileOutputStream(outputFile), "UTF-8"));
115 |
116 | StatusStream stream = new JsonStatusCorpusReader(file);
117 | Status status;
118 | while ((status = stream.next()) != null) {
119 | if (tweetids.contains(status.getId()) && !seen.contains(status.getId())) {
120 | out.write(status.getJsonObject().toString() + "\n");
121 | seen.add(status.getId());
122 | }
123 | }
124 | stream.close();
125 | out.close();
126 |
127 | LOG.info("Extracted " + seen.size() + " tweetids.");
128 | LOG.info("Storing missing tweetids...");
129 |
130 | out = new BufferedWriter(new OutputStreamWriter(
131 | new FileOutputStream(missingFile), "UTF-8"));
132 | LongIterator iter = tweetids.iterator();
133 | while (iter.hasNext()) {
134 | long t = iter.nextLong();
135 | if (!seen.contains(t)) {
136 | out.write(t + "\n");
137 | }
138 | }
139 | out.close();
140 |
141 | LOG.info("Done!");
142 | }
143 | }
144 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/cc/twittertools/util/VerifySubcollection.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Twitter Tools
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cc.twittertools.util;
18 |
19 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
20 |
21 | import java.io.BufferedReader;
22 | import java.io.File;
23 | import java.io.FileInputStream;
24 | import java.io.InputStreamReader;
25 | import java.io.PrintStream;
26 | import java.util.Map;
27 | import java.util.TreeMap;
28 |
29 | import org.apache.commons.cli.CommandLine;
30 | import org.apache.commons.cli.CommandLineParser;
31 | import org.apache.commons.cli.GnuParser;
32 | import org.apache.commons.cli.HelpFormatter;
33 | import org.apache.commons.cli.OptionBuilder;
34 | import org.apache.commons.cli.Options;
35 | import org.apache.commons.cli.ParseException;
36 | import org.apache.log4j.Logger;
37 |
38 | import cc.twittertools.corpus.data.JsonStatusCorpusReader;
39 | import cc.twittertools.corpus.data.Status;
40 | import cc.twittertools.corpus.data.StatusStream;
41 |
42 | import com.google.common.collect.Maps;
43 |
44 | public class VerifySubcollection {
45 | private static final Logger LOG = Logger.getLogger(VerifySubcollection.class);
46 |
47 | private static final String COLLECTION_OPTION = "collection";
48 | private static final String ID_OPTION = "tweetids";
49 |
50 | @SuppressWarnings("static-access")
51 | public static void main(String[] args) throws Exception {
52 | Options options = new Options();
53 |
54 | options.addOption(OptionBuilder.withArgName("dir").hasArg()
55 | .withDescription("source collection directory").create(COLLECTION_OPTION));
56 | options.addOption(OptionBuilder.withArgName("file").hasArg()
57 | .withDescription("list of tweetids").create(ID_OPTION));
58 |
59 | CommandLine cmdline = null;
60 | CommandLineParser parser = new GnuParser();
61 | try {
62 | cmdline = parser.parse(options, args);
63 | } catch (ParseException exp) {
64 | System.err.println("Error parsing command line: " + exp.getMessage());
65 | System.exit(-1);
66 | }
67 |
68 | if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(ID_OPTION)) {
69 | HelpFormatter formatter = new HelpFormatter();
70 | formatter.printHelp(ExtractSubcollection.class.getName(), options);
71 | System.exit(-1);
72 | }
73 |
74 | String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
75 |
76 | LongOpenHashSet tweetids = new LongOpenHashSet();
77 | File tweetidsFile = new File(cmdline.getOptionValue(ID_OPTION));
78 | if (!tweetidsFile.exists()) {
79 | System.err.println("Error: " + tweetidsFile + " does not exist!");
80 | System.exit(-1);
81 | }
82 | LOG.info("Reading tweetids from " + tweetidsFile);
83 |
84 | FileInputStream fin = new FileInputStream(tweetidsFile);
85 | BufferedReader br = new BufferedReader(new InputStreamReader(fin));
86 |
87 | String s;
88 | while ((s = br.readLine()) != null) {
89 | tweetids.add(Long.parseLong(s));
90 | }
91 | br.close();
92 | fin.close();
93 | LOG.info("Read " + tweetids.size() + " tweetids.");
94 |
95 | File file = new File(collectionPath);
96 | if (!file.exists()) {
97 | System.err.println("Error: " + file + " does not exist!");
98 | System.exit(-1);
99 | }
100 |
101 | LongOpenHashSet seen = new LongOpenHashSet();
102 | TreeMap tweets = Maps.newTreeMap();
103 |
104 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
105 | StatusStream stream = new JsonStatusCorpusReader(file);
106 | Status status;
107 | int cnt = 0;
108 | while ((status = stream.next()) != null) {
109 | if (!tweetids.contains(status.getId())) {
110 | LOG.error("tweetid " + status.getId() + " doesn't belong in collection");
111 | continue;
112 | }
113 | if (seen.contains(status.getId())) {
114 | LOG.error("tweetid " + status.getId() + " already seen!");
115 | continue;
116 | }
117 |
118 | tweets.put(status.getId(), status.getJsonObject().toString());
119 | seen.add(status.getId());
120 | cnt++;
121 | }
122 | LOG.info("total of " + cnt + " tweets in subcollection.");
123 |
124 | for ( Map.Entry entry : tweets.entrySet()){
125 | out.println(entry.getValue());
126 | }
127 |
128 | stream.close();
129 | out.close();
130 | }
131 | }
132 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/java/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, A1
2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
4 |
5 | # Print the date in ISO 8601 format
6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n
7 | log4j.logger.com.ning.http.client=WARN
8 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/perl/extract_deletes.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Scans a directory containing the output of the stream crawler and
# extracts the deletes.

$directory = shift or die "$0 [directory]";

for $f ( `ls $directory` ) {
  chomp($f);
  my $path = "$directory/$f";

  # Decompress on the fly and pre-filter to lines containing delete notices.
  open(DATA, "gunzip -c $path | grep '{\"delete\"' | ");
  # Bug fix: the readline filehandle <DATA> had been lost (stripped as markup),
  # leaving an empty readline target; restored.
  while ( my $line = <DATA> ) {
    # Print the status id of each delete notice, one per line.
    if ( $line =~ m/{"delete":{"status":{"id":(\d+),/ ) {
      print "$1\n";
    }
  }
  close(DATA);
}
20 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/perl/join_deletes_with_collection.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Joins together deletes and collection tweetids to identify the
# deleted statuses.

$USAGE = "$0 [deletes (bz2)] [collection (bz2)]";

$deletes = shift or die $USAGE;
$collection = shift or die $USAGE;

# First pass: load every deleted tweetid into a hash for O(1) membership tests.
# Bug fix: both readline filehandles <DATA> had been lost (stripped as markup);
# restored.
open(DATA, "bzcat $deletes | ");
while ( my $line = <DATA> ) {
  chomp($line);
  $H{$line} = 1;
}
close(DATA);

# Second pass: echo collection lines whose leading tweetid was deleted.
open(DATA, "bzcat $collection | ");
while ( my $line = <DATA> ) {
  if ($line =~ /^(\d+)/ ) {
    print $line if exists($H{$1});
  }
}
close(DATA);
25 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/python/Search/TrecSearch-remote:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Autogenerated by Thrift Compiler (0.8.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
# Thin command-line client for the TrecSearch thrift service.
# NOTE(review): Python 2 only (print statements, urlparse module).

import sys
import pprint
from urlparse import urlparse  # Python 2 module (urllib.parse in Python 3)
from thrift.transport import TTransport
from thrift.transport import TSocket
from thrift.transport import THttpClient
from thrift.protocol import TBinaryProtocol

import TrecSearch
from ttypes import *

# No arguments or --help: print usage plus the service's RPC functions and exit.
if len(sys.argv) <= 1 or sys.argv[1] == '--help':
  print ''
  print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]'
  print ''
  print 'Functions:'
  print ' search(TQuery query)'
  print ''
  sys.exit(0)

pp = pprint.PrettyPrinter(indent = 2)
host = 'localhost'
port = 9090
uri = ''
framed = False
http = False
argi = 1

# -h host[:port]: connect over a plain socket to the given endpoint.
if sys.argv[argi] == '-h':
  parts = sys.argv[argi+1].split(':')
  host = parts[0]
  if len(parts) > 1:
    port = int(parts[1])
  argi += 2

# -u url: connect over HTTP; port defaults to 80, query string is preserved.
if sys.argv[argi] == '-u':
  url = urlparse(sys.argv[argi+1])
  parts = url[1].split(':')
  host = parts[0]
  if len(parts) > 1:
    port = int(parts[1])
  else:
    port = 80
  uri = url[2]
  if url[4]:
    uri += '?%s' % url[4]
  http = True
  argi += 2

# -f / -framed: wrap the socket in a framed transport.
if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed':
  framed = True
  argi += 1

# Remaining argv: the RPC name followed by its arguments.
cmd = sys.argv[argi]
args = sys.argv[argi+1:]

# Build transport -> protocol -> client according to the flags above.
if http:
  transport = THttpClient.THttpClient(host, port, uri)
else:
  socket = TSocket.TSocket(host, port)
  if framed:
    transport = TTransport.TFramedTransport(socket)
  else:
    transport = TTransport.TBufferedTransport(socket)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = TrecSearch.Client(protocol)
transport.open()

# Dispatch on the RPC name; the argument is eval()'d into a thrift struct.
# NOTE(review): eval() of a command-line argument is acceptable only for a
# local debugging tool — never expose this to untrusted input.
if cmd == 'search':
  if len(args) != 1:
    print 'search requires 1 args'
    sys.exit(1)
  pp.pprint(client.search(eval(args[0]),))

else:
  print 'Unrecognized method %s' % cmd
  sys.exit(1)

transport.close()
89 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/python/Search/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['ttypes', 'constants', 'TrecSearch']
2 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/python/Search/constants.py:
--------------------------------------------------------------------------------
1 | #
2 | # Autogenerated by Thrift Compiler (0.8.0)
3 | #
4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
5 | #
6 | # options string: py
7 | #
8 |
9 | from thrift.Thrift import TType, TMessageType, TException
10 | from ttypes import *
11 |
12 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/python/TrecSearchThriftClientCli.py:
--------------------------------------------------------------------------------
import sys

"""
A demonstration of how to use the python thrift bindings to retrieve tweets from the TREC 2013 API.

This script requires the python-thrift package, which can installed using 'pip install thrift'.

To execute this script:
python TrecSearchThriftClientCli.py -host='host' -port=port -group='team_name' -token='access_token' -qid='MB01' -q='BBC World Service staff cuts' -runtag='lucene4lm' -max_id=34952194402811905

"""

from Search import TrecSearch, ttypes

from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol

import argparse

# NOTE(review): Python 2 only — uses print statements, long(), and the
# 'except Exc, name' syntax.
try:
    # Command line arguments; host/port/group/token are mandatory, the rest
    # default to the MB01 example query.
    parser = argparse.ArgumentParser()
    parser.add_argument('-host', dest="host", help='server to connect to', required=True)
    parser.add_argument('-port',type=int, dest="port", help='port to use', required=True)
    parser.add_argument('-group', dest="group", help='group id', required=True)
    parser.add_argument('-token', dest="token", help='access token', required=True)
    parser.add_argument('-qid', dest="qid", help='query id', required=False, default='MB01')
    parser.add_argument('-q', dest="query", help='query text', required=False, default='BBC World Service staff cuts')
    parser.add_argument('-runtag', dest="run_tag", help='runtag', required=False, default='lucene4lm')
    parser.add_argument('-max_id', dest="max_id", help='maxid', required=False, default=34952194402811905)
    parser.add_argument('-num_results', dest="num_results", help='number of results', required=False, default=10)
    args = parser.parse_args()

    # Init thrift connection and protocol handlers (buffered binary over TCP).
    transport = TSocket.TSocket(args.host, args.port)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = TrecSearch.Client(protocol)

    # Open the connection to the server
    transport.open()

    # Create a new query; group/token are the caller's API credentials.
    q = ttypes.TQuery()
    q.group = args.group
    q.token = args.token
    q.text = args.query
    q.max_id = long(args.max_id)  # Python 2 long: tweet ids exceed 32 bits
    q.num_results = int(args.num_results)

    # Performs the actual search
    results = client.search(q)

    for i, result in enumerate(results, 1):
        # TREC_eval formatted line: qid Q0 docid rank score runtag
        print "%s Q0 %d %d %f %s" % (args.qid, result.id, i, result.rsv, args.run_tag)

    # Close connection
    transport.close()

except Thrift.TException, tx:
    print 'Thrift TException: %s' % (tx.message)
65 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/python/twittertools/stream/gather_status_stream.py:
--------------------------------------------------------------------------------
1 | # Twitter Tools
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from tweepy.streaming import StreamListener
17 | from tweepy import OAuthHandler
18 | from tweepy import Stream
19 | import logging
20 | import logging.handlers
21 |
22 |
23 | consumer_key=""
24 | consumer_secret=""
25 |
26 | access_token=""
27 | access_token_secret=""
28 |
class TweetListener(StreamListener):
    """Tweepy stream listener that logs every raw status payload it receives."""

    def __init__(self, api=None):
        super(TweetListener, self).__init__(api)
        self.logger = logging.getLogger('tweetlogger')

        # Statuses go to an hourly-rotated file ('H', UTC timestamps).
        # NOTE(review): encoding='bz2' appears to rely on the Python 2 bz2
        # codec to compress the log — confirm before running on Python 3.
        statusHandler = logging.handlers.TimedRotatingFileHandler('status.log', when='H', encoding='bz2', utc=True)
        statusHandler.setLevel(logging.INFO)
        self.logger.addHandler(statusHandler)

        # Warnings (including captured stdlib warnings) get their own rotated file.
        warningHandler = logging.handlers.TimedRotatingFileHandler('warning.log', when='H', encoding='bz2', utc=True)
        warningHandler.setLevel(logging.WARN)
        self.logger.addHandler(warningHandler)
        logging.captureWarnings(True);

        # Mirror warnings to the console as well.
        consoleHandler = logging.StreamHandler()
        consoleHandler.setLevel(logging.WARN)
        self.logger.addHandler(consoleHandler)

        self.logger.setLevel(logging.INFO)
        # Running count of statuses received, used for progress reporting.
        self.count = 0

    def on_data(self, data):
        # Log the raw JSON payload; print a progress line every 1000 statuses.
        self.count += 1
        self.logger.info(data)
        if self.count % 1000 == 0:
            print "%d statuses processed" % self.count
        return True

    def on_error(self, exception):
        # Record stream errors without stopping the listener.
        self.logger.warn(str(exception))
63 |
if __name__ == '__main__':
    # Authenticate with the module-level OAuth credentials and sample the
    # public stream forever; on any error, print it and reconnect.
    listener = TweetListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    stream = Stream(auth, listener)
    while True:
        try:
            stream.sample()
        except Exception as ex:
            # Deliberate best-effort: swallow the error and retry the stream.
            print str(ex)
            pass
76 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, A1
2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
4 |
5 | # Print the date in ISO 8601 format
6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n
7 | log4j.logger.com.ning.http.client=WARN
8 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/thrift/gen-py/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lintool/twitter-tools/777776083d1e4a76da5bdc5860551a38f8fd6766/twitter-tools-core/src/main/thrift/gen-py/__init__.py
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/thrift/gen-py/twittertools/TrecSearch-remote:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Autogenerated by Thrift Compiler (0.8.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
# Thin command-line client for the TrecSearch thrift service.
# NOTE(review): Python 2 only (print statements, urlparse module).

import sys
import pprint
from urlparse import urlparse  # Python 2 module (urllib.parse in Python 3)
from thrift.transport import TTransport
from thrift.transport import TSocket
from thrift.transport import THttpClient
from thrift.protocol import TBinaryProtocol

import TrecSearch
from ttypes import *

# No arguments or --help: print usage plus the service's RPC functions and exit.
if len(sys.argv) <= 1 or sys.argv[1] == '--help':
  print ''
  print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]'
  print ''
  print 'Functions:'
  print ' search(TQuery query)'
  print ''
  sys.exit(0)

pp = pprint.PrettyPrinter(indent = 2)
host = 'localhost'
port = 9090
uri = ''
framed = False
http = False
argi = 1

# -h host[:port]: connect over a plain socket to the given endpoint.
if sys.argv[argi] == '-h':
  parts = sys.argv[argi+1].split(':')
  host = parts[0]
  if len(parts) > 1:
    port = int(parts[1])
  argi += 2

# -u url: connect over HTTP; port defaults to 80, query string is preserved.
if sys.argv[argi] == '-u':
  url = urlparse(sys.argv[argi+1])
  parts = url[1].split(':')
  host = parts[0]
  if len(parts) > 1:
    port = int(parts[1])
  else:
    port = 80
  uri = url[2]
  if url[4]:
    uri += '?%s' % url[4]
  http = True
  argi += 2

# -f / -framed: wrap the socket in a framed transport.
if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed':
  framed = True
  argi += 1

# Remaining argv: the RPC name followed by its arguments.
cmd = sys.argv[argi]
args = sys.argv[argi+1:]

# Build transport -> protocol -> client according to the flags above.
if http:
  transport = THttpClient.THttpClient(host, port, uri)
else:
  socket = TSocket.TSocket(host, port)
  if framed:
    transport = TTransport.TFramedTransport(socket)
  else:
    transport = TTransport.TBufferedTransport(socket)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = TrecSearch.Client(protocol)
transport.open()

# Dispatch on the RPC name; the argument is eval()'d into a thrift struct.
# NOTE(review): eval() of a command-line argument is acceptable only for a
# local debugging tool — never expose this to untrusted input.
if cmd == 'search':
  if len(args) != 1:
    print 'search requires 1 args'
    sys.exit(1)
  pp.pprint(client.search(eval(args[0]),))

else:
  print 'Unrecognized method %s' % cmd
  sys.exit(1)

transport.close()
89 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/thrift/gen-py/twittertools/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['ttypes', 'constants', 'TrecSearch']
2 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/thrift/gen-py/twittertools/constants.py:
--------------------------------------------------------------------------------
1 | #
2 | # Autogenerated by Thrift Compiler (0.8.0)
3 | #
4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
5 | #
6 | # options string: py
7 | #
8 |
9 | from thrift.Thrift import TType, TMessageType, TException
10 | from ttypes import *
11 |
12 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/main/thrift/twittertools.thrift:
--------------------------------------------------------------------------------
1 | namespace java cc.twittertools.thrift.gen
2 |
// One retrieved tweet: its id, the ranking score, and selected metadata.
struct TResult {
  1: i64 id,                      // tweet (status) id
  2: double rsv,                  // retrieval status value (ranking score)
  3: string screen_name,
  4: i64 epoch,                   // posting time — presumably seconds since epoch, TODO confirm
  5: string text,
  6: i32 followers_count,
  7: i32 statuses_count,
  8: string lang,
  9: i64 in_reply_to_status_id,
  10: i64 in_reply_to_user_id,
  11: i64 retweeted_status_id,
  12: i64 retweeted_user_id,
  13: i32 retweeted_count
}
18 |
// A search request. 'group' and 'token' carry the caller's API credentials;
// 'max_id' bounds the result set by tweet id (presumably inclusive — TODO confirm).
struct TQuery {
  1: string group,
  2: string token,
  3: string text,
  4: i64 max_id,
  5: i32 num_results
}
26 |
// Error reported back to the client; carries a human-readable message.
exception TrecSearchException {
  1: string message
}
30 |
// Search service exposed by the TREC Microblog API.
service TrecSearch {
  // Runs the query and returns ranked results.
  // Bug fix: the return container's element type was lost in text extraction;
  // restored as list<TResult> to match the struct above and the clients'
  // per-result field access (id, rsv).
  list<TResult> search(1: TQuery query)
    throws (1: TrecSearchException error)
}
35 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/test/java/cc/twittertools/download/FetchStatusTest.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.download;
2 |
3 | import static org.junit.Assert.assertEquals;
4 | import static org.junit.Assert.assertTrue;
5 |
6 | import java.util.concurrent.Future;
7 |
8 | import junit.framework.JUnit4TestAdapter;
9 |
10 | import org.apache.commons.lang.StringEscapeUtils;
11 | import org.junit.Test;
12 |
13 | import cc.twittertools.corpus.data.Status;
14 |
15 | import com.google.gson.JsonObject;
16 | import com.google.gson.JsonParser;
17 | import com.ning.http.client.AsyncHttpClient;
18 | import com.ning.http.client.Response;
19 |
20 | public class FetchStatusTest {
21 | private static final JsonParser JSON_PARSER = new JsonParser();
22 |
23 | @Test
24 | public void basicHTML() throws Exception {
25 | String url = AsyncEmbeddedJsonStatusBlockCrawler.getUrl(1121915133L, "jkrums");
26 | AsyncHttpClient asyncHttpClient = new AsyncHttpClient();
27 | AsyncHttpClient.BoundRequestBuilder request = asyncHttpClient.prepareGet(url);
28 | Future f = request.execute();
29 | Response response = f.get();
30 |
31 | // Make sure status is OK.
32 | String html = response.getResponseBody("UTF-8");
33 | assertTrue(html != null);
34 | }
35 |
36 | // The fetcher is broken, so disabling test.
37 | //@Test
38 | public void basicFamous() throws Exception {
39 | String url = AsyncEmbeddedJsonStatusBlockCrawler.getUrl(1121915133L, "jkrums");
40 | AsyncHttpClient asyncHttpClient = new AsyncHttpClient();
41 | AsyncHttpClient.BoundRequestBuilder request = asyncHttpClient.prepareGet(url);
42 | Future f = request.execute();
43 | Response response = f.get();
44 |
45 | // Make sure status is OK.
46 | assertEquals(200, response.getStatusCode());
47 | String html = response.getResponseBody("UTF-8");
48 |
49 | int jsonStart = html.indexOf(AsyncEmbeddedJsonStatusBlockCrawler.JSON_START);
50 | int jsonEnd = html.indexOf(AsyncEmbeddedJsonStatusBlockCrawler.JSON_END,
51 | jsonStart + AsyncEmbeddedJsonStatusBlockCrawler.JSON_START.length());
52 |
53 | String json = html.substring(jsonStart + AsyncEmbeddedJsonStatusBlockCrawler.JSON_START.length(), jsonEnd);
54 | json = StringEscapeUtils.unescapeHtml(json);
55 | JsonObject page = (JsonObject) JSON_PARSER.parse(json);
56 | JsonObject statusJson = page.getAsJsonObject("embedData").getAsJsonObject("status");
57 |
58 | Status status = Status.fromJson(statusJson.toString());
59 | assertEquals(1121915133L, status.getId());
60 | assertEquals("jkrums", status.getScreenname());
61 | assertEquals("http://twitpic.com/135xa - There's a plane in the Hudson. I'm on the ferry going to pick up the people. Crazy.", status.getText());
62 |
63 | asyncHttpClient.close();
64 | }
65 |
66 | public static junit.framework.Test suite() {
67 | return new JUnit4TestAdapter(FetchStatusTest.class);
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/test/java/cc/twittertools/index/TokenizationTest.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.index;
2 |
3 | import static org.junit.Assert.assertEquals;
4 |
5 | import java.io.IOException;
6 | import java.io.StringReader;
7 | import java.util.List;
8 |
9 | import junit.framework.JUnit4TestAdapter;
10 |
11 | import org.apache.lucene.analysis.Analyzer;
12 | import org.apache.lucene.analysis.TokenStream;
13 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
14 | import org.apache.lucene.util.Version;
15 | import org.junit.Test;
16 |
17 | import cc.twittertools.index.TweetAnalyzer;
18 |
19 | import com.google.common.collect.Lists;
20 |
21 | public class TokenizationTest {
22 |
23 | Object[][] examples = new Object[][] {
24 | {"AT&T getting secret immunity from wiretapping laws for government surveillance http://vrge.co/ZP3Fx5",
25 | new String[] {"att", "get", "secret", "immun", "from", "wiretap", "law", "for", "govern", "surveil", "http://vrge.co/ZP3Fx5"}},
26 |
27 | {"want to see the @verge aston martin GT4 racer tear up long beach? http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219 …",
28 | new String[] {"want", "to", "see", "the", "@verge", "aston", "martin", "gt4", "racer", "tear", "up", "long", "beach", "http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219"}},
29 |
30 | {"Incredibly good news! #Drupal users rally http://bit.ly/Z8ZoFe to ensure blind accessibility contributor gets to @DrupalCon #Opensource",
31 | new String[] {"incred", "good", "new", "#drupal", "user", "ralli", "http://bit.ly/Z8ZoFe", "to", "ensur", "blind", "access", "contributor", "get", "to", "@drupalcon", "#opensource"}},
32 |
33 | {"We're entering the quiet hours at #amznhack. #Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
34 | new String[] {"were", "enter", "the", "quiet", "hour", "at", "#amznhack", "#rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz"}},
35 |
36 | {"The 2013 Social Event Detection Task (SED) at #mediaeval2013, http://bit.ly/16nITsf supported by @linkedtv @project_mmixer @socialsensor_ip",
37 | new String[] {"the", "2013", "social", "event", "detect", "task", "sed", "at", "#mediaeval2013", "http://bit.ly/16nITsf", "support", "by", "@linkedtv", "@project_mmixer", "@socialsensor_ip"}},
38 |
39 | {"U.S.A. U.K. U.K USA UK #US #UK #U.S.A #U.K ...A.B.C...D..E..F..A.LONG WORD",
40 | new String[] {"usa", "uk", "uk", "usa", "uk", "#us", "#uk", "#u", "sa", "#u", "k", "abc", "d", "e", "f", "a", "long", "word"}},
41 |
42 | {"this is @a_valid_mention and this_is_multiple_words",
43 | new String[] {"thi", "is", "@a_valid_mention", "and", "thi", "is", "multipl", "word"}},
44 |
45 | {"PLEASE BE LOWER CASE WHEN YOU COME OUT THE OTHER SIDE - ALSO A @VALID_VALID-INVALID",
46 | new String[] {"pleas", "be", "lower", "case", "when", "you", "come", "out", "the", "other", "side", "also", "a", "@valid_valid", "invalid"}},
47 |
48 | // Note: the at sign is not the normal (at) sign and the crazy hashtag is not the normal #
49 | {"@reply @with #crazy ~#at",
50 | new String[] {"@reply", "@with", "#crazy", "#at"}},
51 |
52 | {":@valid testing(valid)#hashtags. RT:@meniton (the last @mention is #valid and so is this:@valid), however this is@invalid",
53 | new String[] {"@valid", "test", "valid", "#hashtags", "rt", "@meniton", "the", "last", "@mention", "is", "#valid", "and", "so", "is", "thi", "@valid", "howev", "thi", "is", "invalid"}},
54 |
55 | {"this][is[lots[(of)words+with-lots=of-strange!characters?$in-fact=it&has&Every&Single:one;ofin_here_B&N_test_test?test\\test^testing`testing{testing}testing…testing¬testing·testing what?",
56 | new String[] {"thi", "is", "lot", "of", "word", "with", "lot", "of", "strang", "charact", "in", "fact", "it", "ha", "everi", "singl", "on", "of", "them", "in", "here", "bn", "test", "test", "test", "test", "test", "test", "test", "test", "test", "test", "test", "what"}},
57 | };
58 |
59 | @Test
60 | public void basic() throws Exception {
61 | Analyzer analyzer = new TweetAnalyzer(Version.LUCENE_43);
62 |
63 | for (int i = 0; i < examples.length; i++) {
64 | verify((String[]) examples[i][1], parseKeywords(analyzer, (String) examples[i][0]));
65 | }
66 | }
67 |
68 | public void verify(String[] truth, List tokens) {
69 | assertEquals(truth.length, tokens.size());
70 | for ( int i=0; i parseKeywords(Analyzer analyzer, String keywords) throws IOException {
76 | List list = Lists.newArrayList();
77 |
78 | TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
79 | CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
80 | tokenStream.reset();
81 | while (tokenStream.incrementToken()) {
82 | if (cattr.toString().length() == 0) {
83 | continue;
84 | }
85 | list.add(cattr.toString());
86 | }
87 | tokenStream.end();
88 | tokenStream.close();
89 |
90 | return list;
91 | }
92 |
93 | public static junit.framework.Test suite() {
94 | return new JUnit4TestAdapter(TokenizationTest.class);
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/twitter-tools-core/src/test/java/cc/twittertools/search/TrecTopicSetTest.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.search;
2 |
3 | import static org.junit.Assert.assertEquals;
4 | import static org.junit.Assert.assertTrue;
5 |
6 | import java.io.File;
7 | import java.util.List;
8 |
9 | import junit.framework.JUnit4TestAdapter;
10 |
11 | import org.junit.Test;
12 |
13 | import com.google.common.collect.Lists;
14 |
15 | public class TrecTopicSetTest {
16 |
17 | @Test
18 | public void topics2011() throws Exception {
19 | File f = new File("../data/topics.microblog2011.txt");
20 | assertTrue(f.exists());
21 |
22 | TrecTopicSet topics = TrecTopicSet.fromFile(f);
23 | List t = Lists.newArrayList(topics.iterator());
24 |
25 | assertEquals(50, t.size());
26 | assertEquals("MB01", t.get(0).getId());
27 | assertEquals("MB50", t.get(t.size()-1).getId());
28 | }
29 |
30 | @Test
31 | public void topics2012() throws Exception {
32 | File f = new File("../data/topics.microblog2012.txt");
33 | assertTrue(f.exists());
34 |
35 | TrecTopicSet topics = TrecTopicSet.fromFile(f);
36 | List t = Lists.newArrayList(topics.iterator());
37 |
38 | assertEquals(60, t.size());
39 | assertEquals("MB51", t.get(0).getId());
40 | assertEquals("MB110", t.get(t.size()-1).getId());
41 | }
42 |
43 | public static junit.framework.Test suite() {
44 | return new JUnit4TestAdapter(TrecTopicSetTest.class);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | formatter_profile=_twitter-tools
3 | formatter_settings_version=12
4 | org.eclipse.jdt.ui.exception.name=e
5 | org.eclipse.jdt.ui.gettersetter.use.is=true
6 | org.eclipse.jdt.ui.keywordthis=false
7 | org.eclipse.jdt.ui.overrideannotation=true
8 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/README.md:
--------------------------------------------------------------------------------
1 | # Analyzing Tweets with Pig: Getting Started
2 |
Since tweets are encoded in JSON, and Pig offers poor native JSON support, it's more convenient to use JSON loaders in Twitter's [Elephant Bird](https://github.com/kevinweil/elephant-bird/) library. It's easiest just to fetch the relevant jars directly:
4 |
5 | ```
6 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-core/4.5/elephant-bird-core-4.5.jar
7 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-pig/4.5/elephant-bird-pig-4.5.jar
8 | wget http://repo1.maven.org/maven2/com/twitter/elephantbird/elephant-bird-hadoop-compat/4.5/elephant-bird-hadoop-compat-4.5.jar
9 | wget http://repo1.maven.org/maven2/com/googlecode/json-simple/json-simple/1.1.1/json-simple-1.1.1.jar
10 | ```
11 |
12 | You're ready to start analyzing tweets with Pig! Here's the obligatory word count example in Pig:
13 |
14 | ```
15 | register 'elephant-bird-core-4.5.jar';
16 | register 'elephant-bird-pig-4.5.jar';
17 | register 'elephant-bird-hadoop-compat-4.5.jar';
18 | register 'json-simple-1.1.1.jar';
19 |
20 | raw = load '/path/to/tweets' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad');
21 |
22 | a = foreach raw generate (chararray) $0#'text' as text;
23 | b = foreach a generate flatten(TOKENIZE(text)) as word;
24 | c = group b by word;
25 | d = foreach c generate COUNT(b), group;
26 |
27 | store d into 'wordcount';
28 | ```
29 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | cc.twittertools.hadoop
6 | twitter-tools-hadoop
7 | 1.0-SNAPSHOT
8 | jar
9 |
10 | twitter-tools-hadoop
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | The Apache Software License, Version 2.0
20 | http://www.apache.org/licenses/LICENSE-2.0.txt
21 | repo
22 |
23 |
24 |
25 |
26 |
27 | JeffyRao
28 | Jinfeng Rao
29 | jinfeng@cs.umd.edu
30 |
31 |
32 |
33 |
34 |
35 |
36 | org.codehaus.mojo
37 | appassembler-maven-plugin
38 | 1.3.1
39 |
40 |
41 |
42 | cc.twittertools.hbase.LoadWordCount
43 | LoadWordCount
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 | junit
53 | junit
54 | 3.8.1
55 | test
56 |
57 |
58 | cc.twittertools
59 | twitter-tools-core
60 | 1.4.2
61 |
62 |
63 | org.apache.pig
64 | pig
65 | 0.12.1
66 |
67 |
68 | org.apache.hadoop
69 | hadoop-core
70 | 1.2.1
71 |
72 |
73 | org.apache.hbase
74 | hbase
75 | 0.92.1
76 |
77 |
78 | maven-release-plugin
79 | org.apache.maven.plugins
80 |
81 |
82 |
83 |
85 |
86 | commons-io
87 | commons-io
88 | 2.1
89 |
90 |
91 | org.apache.lucene
92 | lucene-core
93 | 4.8.0
94 |
95 |
96 | com.google.guava
97 | guava
98 | 17.0
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/hadoop/Example.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.hadoop;
2 |
3 | import java.io.IOException;
4 | import java.io.StringReader;
5 |
6 | import org.apache.lucene.analysis.TokenStream;
7 | import org.apache.lucene.analysis.Tokenizer;
8 | import org.apache.lucene.analysis.core.WhitespaceTokenizer;
9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
10 | import org.apache.lucene.util.Version;
11 |
12 | import cc.twittertools.index.LowerCaseEntityPreservingFilter;
13 |
public class Example {

  /**
   * Maps a Twitter "created_at" timestamp (e.g. "Tue Oct 01 00:07:43 +0000 2011")
   * to a 1-based 5-minute interval index within the day: hour*12 + minute/5 + 1.
   *
   * @param createdAt timestamp in the standard Twitter created_at format
   * @return interval index in [1, 288]
   */
  static int computeInterval(String createdAt) {
    String[] groups = createdAt.split("\\s+");
    String time = groups[3];               // "HH:mm:ss"
    String[] timeGroups = time.split(":");
    return Integer.valueOf(timeGroups[0]) * 12 + (Integer.valueOf(timeGroups[1]) / 5) + 1;
  }

  /**
   * Smoke test for computeInterval: prints the interval for a fixed
   * timestamp. Any failure is wrapped in an IOException, preserving the
   * original error-handling behavior.
   */
  public static void main(String[] args) throws IOException {
    // Test GetInterval correctness
    try {
      System.out.println(computeInterval("Tue Oct 01 00:07:43 +0000 2011"));
    } catch (Exception e) {
      throw new IOException("caught exception", e);
    }
  }
}
30 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/hbase/LoadWordCount.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.hbase;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileNotFoundException;
6 | import java.io.FileReader;
7 | import java.io.IOException;
8 | import java.util.HashMap;
9 | import java.util.HashSet;
10 | import java.util.Iterator;
11 | import java.util.Map;
12 | import java.util.Set;
13 |
14 | import org.apache.hadoop.hbase.client.HTablePool;
15 | import org.apache.hadoop.hbase.util.Bytes;
16 |
17 | import com.google.common.collect.HashBasedTable;
18 | import com.google.common.collect.Table;
19 |
20 | public class LoadWordCount {
21 |
22 | public static void main(String[] args) throws IOException {
23 | // TODO Auto-generated method stub
24 | if(args.length!=1){
25 | System.out.println("invalid argument");
26 | }
27 | Table wordCountMap = HashBasedTable.create();
28 | File folder = new File(args[0]);
29 | if(folder.isDirectory()){
30 | for (File file : folder.listFiles()) {
31 | if(!file.getName().startsWith("part"))
32 | continue;
33 | System.out.println("Processing "+args[0]+file.getName());
34 | BufferedReader bf = new BufferedReader(new FileReader(args[0]+file.getName()));
35 | // each line in wordcount file is like : 1 twitter 100
36 | String line;
37 | while((line=bf.readLine())!=null){
38 | String[] groups = line.split("\\t");
39 | if(groups.length != 4)
40 | continue;
41 | String day = groups[0]; // each day is viewed as a column in underlying HBase
42 | String interval = groups[1];
43 | String word = groups[2];
44 | String count = groups[3];
45 | if(!wordCountMap.contains(word, day)){
46 | WordCountDAO.WordCount w = new WordCountDAO.WordCount(word, day);
47 | wordCountMap.put(word, day, w);
48 | }
49 | WordCountDAO.WordCount w = wordCountMap.get(word, day);
50 | w.setCount(Integer.valueOf(interval), Integer.valueOf(count));
51 | wordCountMap.put(word, day, w);
52 |
53 | }
54 | }
55 | }
56 |
57 | System.out.println("Total "+wordCountMap.size()+" words");
58 | HTablePool pool = new HTablePool();
59 | WordCountDAO DAO = new WordCountDAO(pool);
60 | DAO.CreateTable();
61 | int count = 0;
62 | for(WordCountDAO.WordCount w: wordCountMap.values()){
63 | DAO.addWordCount(w);
64 | if(++count % 50000==0){
65 | System.out.println("Loading "+count+" words");
66 | }
67 | }
68 | pool.closeTablePool(DAO.TABLE_NAME);
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/hbase/WordCountDAO.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.hbase;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Comparator;
6 | import java.util.List;
7 | import java.util.NavigableMap;
8 | import java.util.Set;
9 | import java.util.TreeMap;
10 |
11 | import org.apache.hadoop.conf.Configuration;
12 | import org.apache.hadoop.hbase.HBaseConfiguration;
13 | import org.apache.hadoop.hbase.HColumnDescriptor;
14 | import org.apache.hadoop.hbase.HTableDescriptor;
15 | import org.apache.hadoop.hbase.MasterNotRunningException;
16 | import org.apache.hadoop.hbase.ZooKeeperConnectionException;
17 | import org.apache.hadoop.hbase.client.Delete;
18 | import org.apache.hadoop.hbase.client.Get;
19 | import org.apache.hadoop.hbase.client.HBaseAdmin;
20 | import org.apache.hadoop.hbase.client.HTableInterface;
21 | import org.apache.hadoop.hbase.client.HTablePool;
22 | import org.apache.hadoop.hbase.client.Put;
23 | import org.apache.hadoop.hbase.client.Result;
24 | import org.apache.hadoop.hbase.client.Scan;
25 | import org.apache.hadoop.hbase.util.Bytes;
26 | import org.apache.log4j.Logger;
27 |
28 |
29 | public class WordCountDAO {
30 | private final static int DAY = 60*24;
31 | private final static int INTERVAL = 5;
32 | public static int NUM_INTERVALS = DAY/INTERVAL;
33 | public static final byte[] TABLE_NAME = Bytes.toBytes("wordcount");
34 | public static final byte[] COLUMN_FAMILY = Bytes.toBytes("count");
35 |
36 | private static final Logger log = Logger.getLogger(WordCountDAO.class);
37 |
38 | private HTablePool pool;
39 |
40 | public WordCountDAO(HTablePool pool) {
41 | this.pool = pool;
42 | }
43 |
44 | public void CreateTable() throws IOException, ZooKeeperConnectionException{
45 | Configuration conf = HBaseConfiguration.create();
46 |
47 | HBaseAdmin hbase = new HBaseAdmin(conf);
48 | HTableDescriptor[] wordcounts = hbase.listTables("wordcount");
49 |
50 | if(wordcounts.length != 0){ //Drop Table if Exists
51 | hbase.disableTable(TABLE_NAME);
52 | hbase.deleteTable(TABLE_NAME);
53 | }
54 |
55 | HTableDescriptor wordcount = new HTableDescriptor(TABLE_NAME);
56 | hbase.createTable(wordcount);
57 | // Cannot edit a stucture on an active table.
58 | hbase.disableTable(TABLE_NAME);
59 | HColumnDescriptor columnFamily = new HColumnDescriptor(COLUMN_FAMILY);
60 | hbase.addColumn(TABLE_NAME, columnFamily);
61 | hbase.enableTable(TABLE_NAME);
62 |
63 | hbase.close();
64 | }
65 |
66 | private static Get mkGet(String word) throws IOException {
67 | log.debug(String.format("Creating Get for %s", word));
68 |
69 | Get g = new Get(Bytes.toBytes(word));
70 | g.addFamily(COLUMN_FAMILY);
71 | return g;
72 | }
73 |
74 | private static Put mkPut(WordCount w){
75 | log.debug(String.format("Creating Put for %s", w.word));
76 |
77 | Put p = new Put(w.word);
78 | // add integer compression here
79 | // convert 2-d byte array to 1-d byte array
80 | byte[] storage = new byte[NUM_INTERVALS*Integer.SIZE/Byte.SIZE];
81 | for(int i=0; i< NUM_INTERVALS; i++){
82 | for(int j=0; j getWordCount(String word) throws IOException {
112 | HTableInterface words = pool.getTable(TABLE_NAME);
113 | Get g = mkGet(word);
114 | Result result = words.get(g);
115 | if (result.isEmpty()) {
116 | log.info(String.format("word %s not found.", word));
117 | return null;
118 | }
119 |
120 | List wordCounts = WordCount.GetWordCountFromResults(result);
121 | words.close();
122 | return wordCounts;
123 | }
124 |
125 | public void deleteUser(String word) throws IOException {
126 | HTableInterface words = pool.getTable(TABLE_NAME);
127 |
128 | Delete d = mkDel(word);
129 | words.delete(d);
130 |
131 | words.close();
132 | }
133 |
134 | public static class WordCount{
135 | public byte[] word;
136 | public byte[] column_id;
137 | public byte[][] count;
138 |
139 | public WordCount(byte[] word, byte[] column_id){
140 | this.word = word;
141 | this.column_id = column_id;
142 | this.count = new byte[NUM_INTERVALS][];
143 | for(int i=0; i < NUM_INTERVALS; i++){
144 | this.count[i] = Bytes.toBytes(0);
145 | }
146 | }
147 |
148 | public WordCount(String word, String column_id){
149 | this.word = Bytes.toBytes(word);
150 | this.column_id = Bytes.toBytes(column_id);
151 | this.count = new byte[NUM_INTERVALS][];
152 | for(int i=0; i < NUM_INTERVALS; i++){
153 | this.count[i] = Bytes.toBytes(0);
154 | }
155 | }
156 |
    // Internal constructor used when rehydrating a WordCount from stored
    // bytes; takes ownership of the provided count array (no defensive copy).
    private WordCount(byte[] word, byte[] column_id, byte[][] count){
      this.word = word;
      this.column_id = column_id;
      this.count = count;
    }
162 |
163 | public static List GetWordCountFromResults(Result r){
164 | List wordCounts = new ArrayList();
165 | byte[] word = r.getRow();
166 | // Map from column qualifiers to values
167 | NavigableMap familyMap = r.getFamilyMap(COLUMN_FAMILY);
168 | for(byte[] column: familyMap.keySet()){
169 | byte[] value = familyMap.get(column);
170 | // decompression
171 | byte[][] count = new byte[NUM_INTERVALS][Integer.SIZE/Byte.SIZE];
172 | for(int i=0; i {
11 | private static final String DATE_FORMAT = "EEE MMM d k:m:s ZZZZZ yyyy"; // "Fri Mar 29 11:03:41 +0000 2013";
12 | private static final SimpleDateFormat DATE_PARSER = new SimpleDateFormat(DATE_FORMAT);
13 |
14 | public Long exec(Tuple input) throws IOException {
15 | if (input == null || input.size() == 0) {
16 | return -1L;
17 | }
18 |
19 | String createdAt = (String) input.get(0);
20 | long epoch;
21 | try {
22 | epoch = DATE_PARSER.parse(createdAt).getTime() / 1000;
23 | } catch (ParseException e) {
24 | epoch = -1L;
25 | }
26 |
27 | return epoch;
28 | }
29 | }
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/GetLatitude.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.piggybank;
2 |
3 | import java.io.IOException;
4 | import java.util.Iterator;
5 |
6 | import org.apache.pig.EvalFunc;
7 | import org.apache.pig.data.DataBag;
8 | import org.apache.pig.data.Tuple;
9 |
10 | // Sample usage: cc.twittertools.piggybank.GetLatitude($0#'geo'#'coordinates')
11 | public class GetLatitude extends EvalFunc {
12 | public String exec(Tuple input) throws IOException {
13 | DataBag bag = (DataBag) input.get(0);
14 | Iterator it = bag.iterator();
15 | if (!it.hasNext()) {
16 | return null;
17 | }
18 | Tuple tup = it.next();
19 |
20 | return (String) tup.get(0);
21 | }
22 | }
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/GetLongitude.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.piggybank;
2 |
3 | import java.io.IOException;
4 | import java.util.Iterator;
5 |
6 | import org.apache.pig.EvalFunc;
7 | import org.apache.pig.data.DataBag;
8 | import org.apache.pig.data.Tuple;
9 |
10 | // Sample usage: cc.twittertools.piggybank.GetLongitude($0#'geo'#'coordinates');
11 | public class GetLongitude extends EvalFunc {
12 | public String exec(Tuple input) throws IOException {
13 | DataBag bag = (DataBag) input.get(0);
14 | Iterator it = bag.iterator();
15 | if (!it.hasNext()) {
16 | return null;
17 | }
18 | it.next();
19 | if (!it.hasNext()) {
20 | return null;
21 | }
22 |
23 | Tuple tup = it.next();
24 |
25 | return (String) tup.get(0);
26 | }
27 | }
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/piggybank/IsMap.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.piggybank;
2 |
3 | import java.io.IOException;
4 | import java.util.Map;
5 |
6 | import org.apache.pig.FilterFunc;
7 | import org.apache.pig.data.Tuple;
8 |
9 | public class IsMap extends FilterFunc {
10 |
11 | @Override
12 | public Boolean exec(Tuple input) throws IOException {
13 | if (input == null || input.size() == 0) {
14 | return false;
15 | }
16 |
17 | return (input.get(0) instanceof Map);
18 | }
19 | }
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/udf/GetDate.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.udf;
2 |
3 | import java.io.IOException;
4 | import java.util.regex.Matcher;
5 | import java.util.regex.Pattern;
6 | import org.apache.lucene.analysis.Tokenizer;
7 | import org.apache.lucene.analysis.TokenStream;
8 | import org.apache.pig.EvalFunc;
9 | import org.apache.pig.data.Tuple;
10 | import cc.twittertools.index.LowerCaseEntityPreservingFilter;
11 | import org.apache.lucene.analysis.core.WhitespaceTokenizer;
12 |
13 | public class GetDate extends EvalFunc{
14 |
15 | public String exec(Tuple input) throws IOException {
16 | if(input == null || input.size() == 0){
17 | return null;
18 | }
19 | //Standard Time Format: Tue Feb 08 23:59:59 +0000 2011
20 | try{
21 | String str = (String) input.get(0);
22 | String[] groups = str.split("\\s+");
23 | String year = groups[5];
24 | String month = groups[1];
25 | String day= groups[2];
26 | return year+" "+month+" "+day;
27 | }catch(Exception e){
28 | throw new IOException("caught exception",e);
29 | }
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/udf/GetInterval.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.udf;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.pig.EvalFunc;
6 | import org.apache.pig.data.Tuple;
7 |
8 | public class GetInterval extends EvalFunc{
9 | public String exec(Tuple input) throws IOException {
10 | if(input == null || input.size() == 0){
11 | return null;
12 | }
13 | //Standard Time Format: Tue Feb 08 23:59:59 +0000 2011
14 | try{
15 | String str = (String) input.get(0);
16 | String[] groups = str.split("\\s+");
17 | String time = groups[3];
18 | String[] timeGroups= time.split(":");
19 | int interval = (Integer.valueOf(timeGroups[0]))*12 + (Integer.valueOf(timeGroups[1])/5);
20 | return String.valueOf(interval);
21 | }catch(Exception e){
22 | throw new IOException("caught exception",e);
23 | }
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/src/main/java/cc/twittertools/udf/LuceneTokenizer.java:
--------------------------------------------------------------------------------
1 | package cc.twittertools.udf;
2 |
3 | import java.io.IOException;
4 | import java.io.StringReader;
5 | import java.util.StringTokenizer;
6 |
7 | import org.apache.lucene.analysis.Analyzer;
8 | import org.apache.lucene.analysis.TokenStream;
9 | import org.apache.lucene.analysis.Tokenizer;
10 | import org.apache.lucene.analysis.core.WhitespaceTokenizer;
11 | import org.apache.lucene.analysis.en.PorterStemFilter;
12 | import org.apache.lucene.analysis.standard.StandardTokenizer;
13 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
14 | import org.apache.lucene.util.Version;
15 | import org.apache.pig.EvalFunc;
16 | import org.apache.pig.data.BagFactory;
17 | import org.apache.pig.data.DataBag;
18 | import org.apache.pig.data.Tuple;
19 | import org.apache.pig.data.TupleFactory;
20 |
21 | import cc.twittertools.index.LowerCaseEntityPreservingFilter;
22 |
23 | public class LuceneTokenizer extends EvalFunc{
24 | TupleFactory mTupleFactory = TupleFactory.getInstance();
25 | BagFactory mBagFactory = BagFactory.getInstance();
26 |
27 | public DataBag exec(Tuple input) throws IOException{
28 | try {
29 | DataBag output = mBagFactory.newDefaultBag();
30 | Object o = input.get(0);
31 | if (!(o instanceof String)) {
32 | throw new IOException("Expected input to be chararray, but got " + o.getClass().getName());
33 | }
34 | Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader((String)o));
35 | TokenStream tokenstream = new LowerCaseEntityPreservingFilter(source);
36 | tokenstream.reset();
37 | while (tokenstream.incrementToken()){
38 | String token = tokenstream.getAttribute(CharTermAttribute.class).toString();
39 | output.add(mTupleFactory.newTuple(token));
40 | }
41 | return output;
42 | } catch (Exception e) {
43 | // error handling goes here
44 | throw new IOException("caught exception",e);
45 | }
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/twitter-tools-hadoop/wordcountbytime.pig:
--------------------------------------------------------------------------------
-- Counts word occurrences per (date, 5-minute interval) over a Tweets2011
-- JSON collection and stores the result in 'wordcount'.

-- Elephant Bird JSON loader plus its runtime dependencies and the
-- twitter-tools UDF jars (GetDate/GetInterval/LuceneTokenizer).
register 'jar/elephant-bird-core-4.5.jar';
register 'jar/elephant-bird-pig-4.5.jar';
register 'jar/elephant-bird-hadoop-compat-4.5.jar';
register 'jar/json-simple-1.1.1.jar';
register 'jar/twitter-tools-hadoop-1.0-SNAPSHOT.jar';
register 'jar/twitter-tools-core-1.4.3-SNAPSHOT.jar';
register 'jar/lucene-core-4.8.0.jar';
register 'jar/lucene-analyzers-common-4.8.0.jar';
register 'jar/twitter-text-1.9.0.jar';

-- Load the raw tweets as nested JSON maps.
raw = load '/shared/collections/Tweets2011/20110208-099.json.gz' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad');

-- Keep the timestamp and the tweet text.
a = foreach raw generate $0#'created_at',$0#'text';
-- Derive (date, interval) from the timestamp and one row per token of the text.
b = foreach a generate cc.twittertools.udf.GetDate($0), cc.twittertools.udf.GetInterval($0), flatten(cc.twittertools.udf.LuceneTokenizer($1));
-- Count occurrences of each (date, interval, token) triple.
c = group b by ($0,$1,$2);
d = foreach c generate flatten(group),COUNT(b);

store d into 'wordcount';
19 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/README.md:
--------------------------------------------------------------------------------
1 | microblog-demos
2 | ===============
3 |
4 | Examples of using the [2013 TREC microblog API](http://twittertools.cc/). Basically clones IndriRunQuery.
5 |
6 | Getting Started
7 | --------------
8 |
9 | Once you've cloned the repository, build the package with Maven:
10 |
11 | ```
12 | $ mvn clean package appassembler:assemble
13 | ```
14 |
Appassembler will automatically generate launch scripts for:
16 |
+ `target/appassembler/bin/RunQueries`: baseline run, with or without RM3 feedback
18 |
19 | To automatically generate project files for Eclipse:
20 |
21 | ```
22 | $ mvn eclipse:clean
23 | $ mvn eclipse:eclipse
24 | ```
25 |
26 | You can then use Eclipse's Import "Existing Projects into Workspace" functionality to import the project.
27 |
28 |
29 | Invoking Sample Runs
30 | --------------------
After building, you can run the sample programs via something like this:
32 |
33 | ```
34 | $ sh ./target/appassembler/bin/RunQueries ./config/params_run.json
35 | ```
36 |
37 | which will run a simple baseline query likelihood retrieval. All runnable programs are in ./target/appassembler/bin/ . Also, all programs take a single argument: a JSON-formatted file that will look something like this:
38 | ```
39 | {
40 | "queries" : "./data/topics.microblog2012.txt",
41 | "host" : "",
42 | "port" : 9090,
43 | "num_results" : 1000,
44 | "fb_docs" : 0,
45 | "fb_terms" : 0,
46 | "group" : "",
47 | "token" : "",
48 | "runtag" : ""
49 | }
50 | ```
51 |
52 | Hopefully these variables are self-explanatory. Setting either `fb_docs` or `fb_terms` to 0 gives a run with no feedback. If both of these
53 | are set >0, pseudo-feedback using RM3 is used.
54 |
55 | License
56 | -------
57 |
58 | Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
59 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/build.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Build the package and generate the appassembler launch scripts.
mvn clean package appassembler:assemble
# Drop Windows .bat scripts; -f keeps this from failing when none exist.
rm -f target/appassembler/bin/*bat
# Appassembler does not mark the generated scripts executable.
chmod +x ./target/appassembler/bin/*
5 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/config/run_params_sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "queries" : "./data/topics.microblog2011.json",
3 | "stopper" : "./data/stoplist.twitter",
4 | "fb_docs" : 50,
5 | "fb_terms" : 20,
6 | "host" : "",
7 | "port" : 9090,
8 | "num_results" : 1000,
9 | "group" : "",
10 | "token" : "",
11 | "runtag" : ""
12 | }
13 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/data/stoplist.twitter:
--------------------------------------------------------------------------------
1 | de
2 | en
3 | new
4 | y
5 | i'm
6 | el
7 | que
8 | tinyurl.com
9 | en
10 | t.co
11 | rt
12 | ow.ly
13 | bit.ly
14 | twitpic
15 | http
16 | html
17 | www
18 | https
19 | com
20 | php
21 | htm
22 | free
23 | cfm
24 | asp
25 | jsp
26 | a
27 | about
28 | above
29 | according
30 | across
31 | after
32 | afterwards
33 | again
34 | against
35 | albeit
36 | all
37 | almost
38 | alone
39 | along
40 | already
41 | also
42 | although
43 | always
44 | am
45 | among
46 | amongst
47 | an
48 | and
49 | another
50 | any
51 | anybody
52 | anyhow
53 | anyone
54 | anything
55 | anyway
56 | anywhere
57 | apart
58 | are
59 | around
60 | as
61 | at
62 | av
63 | be
64 | became
65 | because
66 | become
67 | becomes
68 | becoming
69 | been
70 | before
71 | beforehand
72 | behind
73 | being
74 | below
75 | beside
76 | besides
77 | between
78 | beyond
79 | both
80 | but
81 | by
82 | can
83 | cannot
84 | canst
85 | certain
86 | cf
87 | choose
88 | contrariwise
89 | cos
90 | could
91 | cu
92 | day
93 | do
94 | does
95 | doesn't
96 | doing
97 | dost
98 | doth
99 | double
100 | down
101 | dual
102 | during
103 | each
104 | either
105 | else
106 | elsewhere
107 | enough
108 | et
109 | etc
110 | even
111 | ever
112 | every
113 | everybody
114 | everyone
115 | everything
116 | everywhere
117 | except
118 | excepted
119 | excepting
120 | exception
121 | exclude
122 | excluding
123 | exclusive
124 | far
125 | farther
126 | farthest
127 | few
128 | ff
129 | first
130 | for
131 | formerly
132 | forth
133 | forward
134 | from
135 | front
136 | further
137 | furthermore
138 | furthest
139 | get
140 | go
141 | had
142 | halves
143 | hardly
144 | has
145 | hast
146 | hath
147 | have
148 | he
149 | hence
150 | henceforth
151 | her
152 | here
153 | hereabouts
154 | hereafter
155 | hereby
156 | herein
157 | hereto
158 | hereupon
159 | hers
160 | herself
161 | him
162 | himself
163 | hindmost
164 | his
165 | hither
166 | hitherto
167 | how
168 | however
169 | howsoever
170 | i
171 | ie
172 | if
173 | in
174 | inasmuch
175 | inc
176 | include
177 | included
178 | including
179 | indeed
180 | indoors
181 | inside
182 | insomuch
183 | instead
184 | into
185 | inward
186 | inwards
187 | is
188 | it
189 | its
190 | itself
191 | just
192 | kind
193 | kg
194 | km
195 | last
196 | latter
197 | latterly
198 | less
199 | lest
200 | let
201 | like
202 | little
203 | ltd
204 | many
205 | may
206 | maybe
207 | me
208 | meantime
209 | meanwhile
210 | might
211 | moreover
212 | most
213 | mostly
214 | more
215 | mr
216 | mrs
217 | ms
218 | much
219 | must
220 | my
221 | myself
222 | namely
223 | need
224 | neither
225 | never
226 | nevertheless
227 | next
228 | no
229 | nobody
230 | none
231 | nonetheless
232 | noone
233 | nope
234 | nor
235 | not
236 | nothing
237 | notwithstanding
238 | now
239 | nowadays
240 | nowhere
241 | of
242 | off
243 | often
244 | ok
245 | on
246 | once
247 | one
248 | only
249 | onto
250 | or
251 | other
252 | others
253 | otherwise
254 | ought
255 | our
256 | ours
257 | ourselves
258 | out
259 | outside
260 | over
261 | own
262 | per
263 | perhaps
264 | plenty
265 | provide
266 | quite
267 | rather
268 | really
269 | round
270 | said
271 | sake
272 | same
273 | sang
274 | save
275 | saw
276 | see
277 | seeing
278 | seem
279 | seemed
280 | seeming
281 | seems
282 | seen
283 | seldom
284 | selves
285 | sent
286 | several
287 | shalt
288 | she
289 | should
290 | shown
291 | sideways
292 | since
293 | slept
294 | slew
295 | slung
296 | slunk
297 | smote
298 | so
299 | some
300 | somebody
301 | somehow
302 | someone
303 | something
304 | sometime
305 | sometimes
306 | somewhat
307 | somewhere
308 | spake
309 | spat
310 | spoke
311 | spoken
312 | sprang
313 | sprung
314 | stave
315 | staves
316 | still
317 | such
318 | supposing
319 | than
320 | that
321 | the
322 | thee
323 | their
324 | them
325 | themselves
326 | then
327 | thence
328 | thenceforth
329 | there
330 | thereabout
331 | thereabouts
332 | thereafter
333 | thereby
334 | therefore
335 | therein
336 | thereof
337 | thereon
338 | thereto
339 | thereupon
340 | these
341 | they
342 | this
343 | those
344 | thou
345 | though
346 | thrice
347 | through
348 | throughout
349 | thru
350 | thus
351 | thy
352 | thyself
353 | till
354 | to
355 | together
356 | too
357 | toward
358 | towards
359 | ugh
360 | unable
361 | under
362 | underneath
363 | unless
364 | unlike
365 | until
366 | up
367 | upon
368 | upward
369 | upwards
370 | us
371 | use
372 | used
373 | using
374 | very
375 | via
376 | vs
377 | want
378 | was
379 | we
380 | week
381 | well
382 | were
383 | what
384 | whatever
385 | whatsoever
386 | when
387 | whence
388 | whenever
389 | whensoever
390 | where
391 | whereabouts
392 | whereafter
393 | whereas
394 | whereat
395 | whereby
396 | wherefore
397 | wherefrom
398 | wherein
399 | whereinto
400 | whereof
401 | whereon
402 | wheresoever
403 | whereto
404 | whereunto
405 | whereupon
406 | wherever
407 | wherewith
408 | whether
409 | whew
410 | which
411 | whichever
412 | whichsoever
413 | while
414 | whilst
415 | whither
416 | who
417 | whoa
418 | whoever
419 | whole
420 | whom
421 | whomever
422 | whomsoever
423 | whose
424 | whosoever
425 | why
426 | will
427 | wilt
428 | with
429 | within
430 | without
431 | worse
432 | worst
433 | would
434 | wow
435 | ye
436 | yet
437 | year
438 | yippee
439 | you
440 | your
441 | yours
442 | yourself
443 | yourselves
444 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | edu.illinois.lis
4 | twitter-tools-rm3
5 | jar
6 | 0.1-SNAPSHOT
7 | twitter-tools-rm3
8 | demo classes for using the TREC 2013 Microblog API
9 | http://people.lis.illinois.edu/~mefron/
10 |
11 |
12 |
13 | The Apache Software License, Version 2.0
14 | http://www.apache.org/licenses/LICENSE-2.0.txt
15 | repo
16 |
17 |
18 |
19 |
20 | scm:git:git@github.com:milesefron/microblog-demos.git
21 | scm:git:git@github.com:milesefron/microblog-demos.git
22 | git@github.com:milesefron/microblog-demos.git
23 |
24 |
25 |
26 |
27 | milesefron
28 | Miles Efron
29 | mefron@illinois.edu
30 |
31 |
32 |
33 |
34 | org.sonatype.oss
35 | oss-parent
36 | 7
37 |
38 |
39 |
40 |
41 |
42 | org.codehaus.mojo
43 | appassembler-maven-plugin
44 | 1.3.1
45 |
46 |
47 |
48 | edu.illinois.lis.search.RunQueries
49 | RunQueries
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | UTF-8
59 | UTF-8
60 |
61 |
62 |
63 |
64 | junit
65 | junit
66 | 4.11
67 | test
68 |
69 |
70 | commons-cli
71 | commons-cli
72 | 1.2
73 |
74 |
75 | commons-io
76 | commons-io
77 | 2.4
78 |
79 |
80 | org.apache.commons
81 | commons-math3
82 | 3.2
83 |
84 |
85 | cc.twittertools
86 | twitter-tools-core
87 | 1.4.2
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/feedback/FeedbackModel.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.feedback;
2 |
3 | import java.text.DecimalFormat;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.Iterator;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 | import cc.twittertools.thrift.gen.TResult;
11 | import edu.illinois.lis.document.FeatureVector;
12 | import edu.illinois.lis.query.GQuery;
13 | import edu.illinois.lis.utils.KeyValuePair;
14 | import edu.illinois.lis.utils.ScorableComparator;
15 | import edu.illinois.lis.utils.Stopper;
16 |
17 |
18 | public abstract class FeedbackModel {
19 | protected List relDocs;
20 | protected GQuery originalQuery;
21 | protected int fbDocCount = 20;
22 | protected int fbTermCount = 20;
23 | protected List features; // these will be KeyValuePair objects
24 | protected Stopper stopper;
25 |
26 |
27 |
28 | public void build(Stopper stopper) {
29 | this.stopper = stopper;
30 | }
31 |
32 |
33 |
34 | public GQuery asGquery() {
35 | GQuery newQuery = new GQuery();
36 | newQuery.setTitle(originalQuery.getTitle());
37 | newQuery.setText(originalQuery.getText());
38 |
39 | FeatureVector finalVector = new FeatureVector(stopper);
40 |
41 | ScorableComparator comparator = new ScorableComparator(true);
42 | Collections.sort(features, comparator);
43 | Iterator it = features.iterator();
44 |
45 | int i=0;
46 | while(it.hasNext() && i++ < fbTermCount) {
47 | KeyValuePair tuple = it.next();
48 | finalVector.addTerm(tuple.getKey(), tuple.getScore());
49 | }
50 |
51 | newQuery.setFeatureVector(finalVector);
52 |
53 | return newQuery;
54 | }
55 |
56 | public FeatureVector asFeatureVector() {
57 | FeatureVector f = new FeatureVector(stopper);
58 | Iterator it = features.iterator();
59 |
60 | while(it.hasNext()) {
61 | KeyValuePair tuple = it.next();
62 | f.addTerm(tuple.getKey(), tuple.getScore());
63 | }
64 |
65 | return f;
66 | }
67 |
68 | public Map asMap() {
69 | Map map = new HashMap(features.size());
70 | Iterator it = features.iterator();
71 | while(it.hasNext()) {
72 | KeyValuePair tuple = it.next();
73 | map.put(tuple.getKey(), tuple.getScore());
74 | }
75 |
76 | return map;
77 | }
78 |
79 | @Override
80 | public String toString() {
81 | return toString(features.size());
82 | }
83 |
84 | public String toString(int k) {
85 | DecimalFormat format = new DecimalFormat("#.#####################");
86 |
87 |
88 |
89 | ScorableComparator comparator = new ScorableComparator(true);
90 | Collections.sort(features, comparator);
91 |
92 | double sum = 0.0;
93 | Iterator it = features.iterator();
94 | int i=0;
95 | while(it.hasNext() && i++ < k) {
96 | sum += it.next().getScore();
97 | }
98 |
99 | StringBuilder b = new StringBuilder();
100 | it = features.iterator();
101 | i=0;
102 | while(it.hasNext() && i++ < k) {
103 | KeyValuePair tuple = it.next();
104 | b.append(format.format(tuple.getScore()/sum) + " " + tuple.getKey() + "\n");
105 | }
106 |
107 | return b.toString();
108 | }
109 |
110 |
111 | public void setRes(List relDocs) {
112 | this.relDocs = relDocs;
113 | }
114 | public void setOriginalQuery(GQuery originalQuery) {
115 | this.originalQuery = originalQuery;
116 | }
117 | public void setFbTermCount(int fbTermCount) {
118 | this.fbTermCount = fbTermCount;
119 | }
120 |
121 |
122 |
123 |
124 | }
125 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/feedback/FeedbackRelevanceModel.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.feedback;
2 |
3 | import java.util.HashSet;
4 | import java.util.Iterator;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 | import java.util.Set;
8 |
9 | import cc.twittertools.thrift.gen.TResult;
10 |
11 | import edu.illinois.lis.document.FeatureVector;
12 | import edu.illinois.lis.utils.Stopper;
13 | import edu.illinois.lis.utils.KeyValuePair;
14 |
15 |
16 |
17 |
18 | public class FeedbackRelevanceModel extends FeedbackModel {
19 | private boolean stripNumbers = false;
20 | private double[] docWeights = null;
21 |
22 | @Override
23 | public void build(Stopper stopper) {
24 | this.stopper = stopper;
25 | try {
26 | Set vocab = new HashSet();
27 | List fbDocVectors = new LinkedList();
28 |
29 |
30 |
31 | double[] rsvs = new double[relDocs.size()];
32 | int k=0;
33 | Iterator hitIterator = relDocs.iterator();
34 | while(hitIterator.hasNext()) {
35 | TResult hit = hitIterator.next();
36 | rsvs[k++] = hit.getRsv();
37 | }
38 |
39 | hitIterator = relDocs.iterator();
40 | while(hitIterator.hasNext()) {
41 | TResult hit = hitIterator.next();
42 | String text = hit.getText().toLowerCase();
43 | FeatureVector docVector = new FeatureVector(text, stopper);
44 | vocab.addAll(docVector.getFeatures());
45 | fbDocVectors.add(docVector);
46 | }
47 |
48 | features = new LinkedList();
49 |
50 |
51 | Iterator it = vocab.iterator();
52 | while(it.hasNext()) {
53 | String term = it.next();
54 | double fbWeight = 0.0;
55 |
56 | Iterator docIT = fbDocVectors.iterator();
57 | k=0;
58 | while(docIT.hasNext()) {
59 | double docWeight = 1.0;
60 | if(docWeights != null)
61 | docWeight = docWeights[k];
62 | FeatureVector docVector = docIT.next();
63 | double docProb = docVector.getFeaturetWeight(term) / docVector.getLength();
64 | docProb *= rsvs[k++] * docWeight;
65 |
66 | fbWeight += docProb;
67 | }
68 |
69 | fbWeight /= (double)fbDocVectors.size();
70 |
71 | KeyValuePair tuple = new KeyValuePair(term, fbWeight);
72 | features.add(tuple);
73 | }
74 |
75 |
76 |
77 | } catch (Exception e) {
78 | e.printStackTrace();
79 | }
80 | }
81 |
82 | public void setDocWeights(double[] docWeights) {
83 | this.docWeights = docWeights;
84 | }
85 |
86 |
87 | }
88 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQueries.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.query;
2 |
3 | import java.util.Iterator;
4 |
5 | /**
6 | * A container for holding a bunch of GQuery objects, with various types of convenience functionality added in
7 | * instantiating classes.
8 | *
9 | * @author Miles Efron
10 | *
11 | */
12 | public interface GQueries {
13 | public void read(String pathToQueries);
14 |
15 | public Iterator iterator();
16 |
17 | public GQuery getIthQuery(int i);
18 |
19 | public GQuery getNamedQuery(String queryName);
20 |
21 | public int numQueries();
22 | }
23 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQueriesJsonImpl.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.query;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.util.ArrayList;
6 | import java.util.HashMap;
7 | import java.util.Iterator;
8 | import java.util.List;
9 | import java.util.Map;
10 |
11 | import org.apache.log4j.Logger;
12 |
13 |
14 | import com.google.gson.JsonArray;
15 | import com.google.gson.JsonElement;
16 | import com.google.gson.JsonObject;
17 | import com.google.gson.JsonParser;
18 |
19 | import edu.illinois.lis.document.FeatureVector;
20 |
21 |
22 | /**
23 | * reads and holds GQueries stored as a serialized JSON file on disk.
24 | *
25 | * @author Miles Efron
26 | *
27 | */
28 | public class GQueriesJsonImpl implements GQueries {
29 | private static final Logger LOG = Logger.getLogger(GQueriesJsonImpl.class);
30 |
31 | private static final JsonParser JSON_PARSER = new JsonParser();
32 | private List queryList;
33 | private Map nameToIndex;
34 |
35 | public void read(String pathToQueries) {
36 | JsonObject obj = null;
37 | try {
38 | obj = (JsonObject) JSON_PARSER.parse(new BufferedReader(new FileReader(pathToQueries)));
39 | } catch (Exception e) {
40 | LOG.fatal("died reading queries from json file", e);
41 | System.exit(-1);
42 | }
43 |
44 |
45 | JsonArray queryObjectArray = obj.getAsJsonArray("queries");
46 | queryList = new ArrayList(queryObjectArray.size());
47 | nameToIndex = new HashMap(queryList.size());
48 | Iterator queryObjectIterator = queryObjectArray.iterator();
49 | int k=0;
50 | while(queryObjectIterator.hasNext()) {
51 | JsonObject queryObject = (JsonObject) queryObjectIterator.next();
52 | String title = queryObject.get("title").getAsString();
53 | String text = queryObject.get("text").getAsString();
54 | double epoch = queryObject.get("epoch").getAsDouble();
55 | long querytweettime = queryObject.get("querytweettime").getAsLong();
56 | nameToIndex.put(title, k++);
57 | FeatureVector featureVector = new FeatureVector(null);
58 | JsonArray modelObjectArray = queryObject.getAsJsonArray("model");
59 | Iterator featureIterator = modelObjectArray.iterator();
60 | while(featureIterator.hasNext()) {
61 | JsonObject featureObject = (JsonObject)featureIterator.next();
62 | double weight = featureObject.get("weight").getAsDouble();
63 | String feature = featureObject.get("feature").getAsString();
64 | featureVector.addTerm(feature, weight);
65 | }
66 |
67 |
68 | GQuery gQuery = new GQuery();
69 | gQuery.setTitle(title);
70 | gQuery.setText(text);
71 | gQuery.setEpoch(epoch);
72 | gQuery.setQuerytweettime(querytweettime);
73 | gQuery.setFeatureVector(featureVector);
74 |
75 | queryList.add(gQuery);
76 |
77 | }
78 | }
79 |
80 | public GQuery getIthQuery(int i) {
81 | if(queryList == null || i >= queryList.size()) {
82 | LOG.fatal("died trying to get query number " + i + " when we have only " + queryList.size() + " queries.");
83 | System.exit(-1);
84 | }
85 | return queryList.get(i);
86 | }
87 |
88 | public GQuery getNamedQuery(String queryName) {
89 | if(queryList == null || ! nameToIndex.containsKey(queryName)) {
90 | LOG.fatal("died trying to get query " + queryName + ".");
91 | System.exit(-1); }
92 | return queryList.get(nameToIndex.get(queryName));
93 | }
94 |
95 |
96 | public Iterator iterator() {
97 | return queryList.iterator();
98 | }
99 |
100 | public int numQueries() {
101 | return queryList.size();
102 | }
103 |
104 | @Override
105 | public String toString() {
106 | StringBuilder b = new StringBuilder();
107 |
108 | Iterator it = queryList.iterator();
109 | while(it.hasNext()) {
110 | b.append(it.next());
111 | }
112 |
113 | return b.toString();
114 | }
115 |
116 |
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/query/GQuery.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.query;
2 |
3 |
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | import edu.illinois.lis.document.FeatureVector;
8 |
9 |
10 | /**
11 | * a fairly rich representation of a query (or query-like) object. at a minimum, it will typically contain a
12 | * name some text.
13 | *
14 | * @author Miles Efron
15 | *
16 | */
17 | public class GQuery {
18 | private String name;
19 | private String text;
20 | private double epoch = -1.0;
21 | private long querytweettime = -1L;
22 | private FeatureVector featureVector;
23 |
24 |
25 | public String getTitle() {
26 | return name;
27 | }
28 | public String getText() {
29 | return text;
30 | }
31 | public void setTitle(String name) {
32 | this.name = name;
33 | }
34 | public void setText(String text) {
35 | this.text = text;
36 | }
37 | public void setEpoch(double epoch) {
38 | this.epoch = epoch;
39 | }
40 | public void setQuerytweettime(long querytweettime) {
41 | this.querytweettime = querytweettime;
42 | }
43 | public double getEpoch() {
44 | return epoch;
45 | }
46 | public long getQuerytweettime() {
47 | return querytweettime;
48 | }
49 |
50 |
51 | public FeatureVector getFeatureVector() {
52 | return featureVector;
53 | }
54 | public void setFeatureVector(FeatureVector featureVector) {
55 | this.featureVector = featureVector;
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/query/TrecTemporalTopic.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.query;
2 |
3 | import com.google.common.base.Preconditions;
4 |
5 | public class TrecTemporalTopic {
6 | private String query;
7 | private String id;
8 | private long time;
9 | private double epoch;
10 |
11 | public TrecTemporalTopic(String id, String query, long time, double epoch) {
12 | this.id = Preconditions.checkNotNull(id);
13 | this.query = Preconditions.checkNotNull(query);
14 | Preconditions.checkArgument(time > 0);
15 | this.time = time;
16 | Preconditions.checkArgument(epoch > 0);
17 | this.epoch = epoch;
18 | }
19 |
20 | public String getId() {
21 | return id;
22 | }
23 |
24 | public String getQuery() {
25 | return query;
26 | }
27 |
28 | public long getQueryTweetTime() {
29 | return time;
30 | }
31 |
32 | public double getEpoch() {
33 | return epoch;
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/query/TrecTemporalTopicSet.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.query;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.text.ParseException;
6 | import java.text.SimpleDateFormat;
7 | import java.util.Iterator;
8 | import java.util.List;
9 | import java.util.regex.Matcher;
10 | import java.util.regex.Pattern;
11 |
12 | import com.google.common.base.Charsets;
13 | import com.google.common.base.Joiner;
14 | import com.google.common.base.Preconditions;
15 | import com.google.common.collect.Lists;
16 | import com.google.common.io.Files;
17 |
18 | public class TrecTemporalTopicSet implements Iterable{
19 | private List queries = Lists.newArrayList();
20 |
21 | private TrecTemporalTopicSet() {}
22 |
23 | private void add(TrecTemporalTopic q) {
24 | queries.add(q);
25 | }
26 |
27 | public Iterator iterator() {
28 | return queries.iterator();
29 | }
30 |
31 | private static final String DATE_FORMAT = "EEE MMM d k:m:s ZZZZZ yyyy"; //"Fri Mar 29 11:03:41 +0000 2013";
32 |
33 | private static final Pattern TOP_PATTERN = Pattern.compile("", Pattern.DOTALL);
34 | private static final Pattern NUM_PATTERN = Pattern.compile(" Number: (MB\\d+) ", Pattern.DOTALL);
35 |
36 | // TREC 2011 topics uses tag
37 | private static final Pattern TITLE_PATTERN = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL);
38 | // TREC 2012 topics use tag
39 | private static final Pattern TITLE_PATTERN2 = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL);
40 |
41 | private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("\\s*(.*?)\\s*", Pattern.DOTALL);
42 |
43 | private static final Pattern TWEETTIME_PATTERN = Pattern.compile("\\s*(\\d+)\\s*", Pattern.DOTALL);
44 |
45 |
46 | public static TrecTemporalTopicSet fromFile(File f) throws IOException {
47 | Preconditions.checkNotNull(f);
48 | Preconditions.checkArgument(f.exists());
49 |
50 | String s = Joiner.on("\n").join(Files.readLines(f, Charsets.UTF_8));
51 | TrecTemporalTopicSet queries = new TrecTemporalTopicSet();
52 |
53 | Matcher matcher = TOP_PATTERN.matcher(s);
54 | while (matcher.find()) {
55 | String top = matcher.group(0);
56 |
57 |
58 | Matcher m = NUM_PATTERN.matcher(top);
59 | if (!m.find()) {
60 | throw new IOException("Error parsing " + f);
61 | }
62 | String id = m.group(1);
63 | // Topics from 2012 are inconsistently numbered,
64 | // e.g., MB051 should match the qrels, which has MB51
65 | if (id.matches("MB0\\d\\d")) {
66 | id = id.replace("MB0", "MB");
67 | }
68 |
69 | m = TITLE_PATTERN.matcher(top);
70 | if (!m.find()) {
71 | m = TITLE_PATTERN2.matcher(top);
72 | if (!m.find()) {
73 | throw new IOException("Error parsing " + f);
74 | }
75 | }
76 | String text = m.group(1);
77 |
78 | m = TIMESTAMP_PATTERN.matcher(top);
79 | if (!m.find()) {
80 | throw new IOException("Error parsing " + f);
81 | }
82 | double epoch = -1.0;
83 | try {
84 | epoch = (new SimpleDateFormat(DATE_FORMAT)).parse(m.group(1)).getTime() / 1000;
85 | } catch (ParseException e) {
86 | epoch = -1.0;
87 | }
88 |
89 | m = TWEETTIME_PATTERN.matcher(top);
90 | if (!m.find()) {
91 | throw new IOException("Error parsing " + f);
92 | }
93 | long time = Long.parseLong(m.group(1));
94 |
95 |
96 |
97 | queries.add(new TrecTemporalTopic(id, text, time, epoch));
98 | }
99 | return queries;
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/rerank/SearchReranker.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.rerank;
2 |
3 | import java.util.Collections;
4 | import java.util.List;
5 |
6 |
7 | import cc.twittertools.thrift.gen.TResult;
8 |
9 |
10 | public abstract class SearchReranker {
11 | protected List results;
12 |
13 | protected abstract void score();
14 |
15 | public List getReranked() {
16 | TResultComparator comparator = new TResultComparator(true);
17 | Collections.sort(results, comparator);
18 | return results;
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/rerank/TResultComparator.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.rerank;
2 |
3 | import java.util.Comparator;
4 |
5 | import cc.twittertools.thrift.gen.TResult;
6 |
7 |
8 | public class TResultComparator implements Comparator{
9 | private boolean decreasing = true;
10 |
11 | public TResultComparator(boolean decreasing) {
12 | this.decreasing = decreasing;
13 | }
14 | public int compare(TResult x, TResult y) {
15 | double xVal = x.getRsv();
16 | double yVal = y.getRsv();
17 |
18 | if(decreasing) {
19 | return (xVal > yVal ? -1 : (xVal == yVal ? 0 : 1));
20 | } else {
21 | return (xVal < yVal ? -1 : (xVal == yVal ? 0 : 1));
22 | }
23 |
24 | }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/search/RunQueries.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.search;
2 |
3 | import java.io.PrintStream;
4 | import java.util.Iterator;
5 | import java.util.List;
6 |
7 |
8 |
9 |
10 |
11 |
12 | import cc.twittertools.search.api.TrecSearchThriftClient;
13 | import cc.twittertools.thrift.gen.TResult;
14 | import edu.illinois.lis.document.FeatureVector;
15 | import edu.illinois.lis.feedback.FeedbackRelevanceModel;
16 | import edu.illinois.lis.query.GQueries;
17 | import edu.illinois.lis.query.GQueriesJsonImpl;
18 | import edu.illinois.lis.query.GQuery;
19 | import edu.illinois.lis.utils.ParameterBroker;
20 | import edu.illinois.lis.utils.Stopper;
21 |
22 | public class RunQueries {
23 | private static final String DEFAULT_RUNTAG = "lucene4lm";
24 |
25 | private static final String HOST_OPTION = "host";
26 | private static final String PORT_OPTION = "port";
27 | private static final String QUERIES_OPTION = "queries";
28 | private static final String STOPPER_OPTION = "stopper";
29 | private static final String FB_DOCS_OPTION = "fb_docs";
30 | private static final String FB_TERMS_OPTION = "fb_terms";
31 | private static final String NUM_RESULTS_OPTION = "num_results";
32 | private static final String GROUP_OPTION = "group";
33 | private static final String TOKEN_OPTION = "token";
34 | private static final String RUNTAG_OPTION = "runtag";
35 |
36 | private static final double ORIG_QUERY_WEIGHT = 0.5;
37 |
38 | private RunQueries() {}
39 |
40 | public static void main(String[] args) throws Exception {
41 | ParameterBroker params = new ParameterBroker(args[0]);
42 |
43 | PrintStream out = new PrintStream(System.out, true, "UTF-8");
44 | PrintStream err = new PrintStream(System.err, true, "UTF-8");
45 |
46 | GQueries queries = new GQueriesJsonImpl();
47 | queries.read(params.getParamValue(QUERIES_OPTION));
48 |
49 | Stopper stopper = null;
50 | if(params.getParamValue(STOPPER_OPTION) != null)
51 | stopper = new Stopper(params.getParamValue(STOPPER_OPTION));
52 |
53 | // max number of docs to send to output
54 | int numResults = 1000;
55 | try {
56 | if (params.getParamValue(NUM_RESULTS_OPTION) != null) {
57 | numResults = Integer.parseInt(params.getParamValue(NUM_RESULTS_OPTION));
58 | }
59 | } catch (NumberFormatException e) {
60 | err.println("Invalid " + NUM_RESULTS_OPTION + ": " + params.getParamValue(NUM_RESULTS_OPTION));
61 | System.exit(-1);
62 | }
63 |
64 | int fbDocs = 0;
65 | try {
66 | if (params.getParamValue(FB_DOCS_OPTION) != null) {
67 | fbDocs = Integer.parseInt(params.getParamValue(FB_DOCS_OPTION));
68 | }
69 | } catch (NumberFormatException e) {
70 | err.println("Invalid " + FB_DOCS_OPTION + ": " + params.getParamValue(FB_DOCS_OPTION));
71 | System.exit(-1);
72 | }
73 |
74 | int fbTerms = 0;
75 | try {
76 | if (params.getParamValue(FB_TERMS_OPTION) != null) {
77 | fbTerms = Integer.parseInt(params.getParamValue(FB_TERMS_OPTION));
78 | }
79 | } catch (NumberFormatException e) {
80 | err.println("Invalid " + FB_TERMS_OPTION + ": " + params.getParamValue(FB_TERMS_OPTION));
81 | System.exit(-1);
82 | }
83 |
84 | // authentication credentials
85 | String group = params.getParamValue(GROUP_OPTION);
86 | if(group==null) {
87 | err.println("Invalid " + GROUP_OPTION + ": must set a valid group ID");
88 | System.exit(-1);
89 | }
90 | String token = params.getParamValue(TOKEN_OPTION);
91 | if(group==null) {
92 | err.println("Invalid " + TOKEN_OPTION + ": must set a valid authentication token");
93 | System.exit(-1);
94 | }
95 |
96 | TrecSearchThriftClient client = new TrecSearchThriftClient(params.getParamValue(HOST_OPTION),
97 | Integer.parseInt(params.getParamValue(PORT_OPTION)), group, token);
98 |
99 | Iterator queryIterator = queries.iterator();
100 | while(queryIterator.hasNext()) {
101 | GQuery query = queryIterator.next();
102 | System.err.println(query.getTitle());
103 | String queryText = query.getText();
104 |
105 | // stupid hack. need to lowercase the query vector
106 | FeatureVector temp = new FeatureVector(null);
107 | Iterator qTerms = query.getFeatureVector().iterator();
108 | while(qTerms.hasNext()) {
109 | String term = qTerms.next();
110 | temp.addTerm(term.toLowerCase(), query.getFeatureVector().getFeaturetWeight(term));
111 | }
112 | temp.normalizeToOne();
113 | query.setFeatureVector(temp);
114 |
115 |
116 | // if we're doing feedback
117 | if(fbDocs > 0 && fbTerms > 0) {
118 | List results = client.search(queryText, query.getQuerytweettime(), fbDocs);
119 | FeedbackRelevanceModel fb = new FeedbackRelevanceModel();
120 | fb.setOriginalQuery(query);
121 | fb.setRes(results);
122 | fb.build(stopper);
123 |
124 | FeatureVector fbVector = fb.asFeatureVector();
125 | fbVector.pruneToSize(fbTerms);
126 | fbVector.normalizeToOne();
127 | fbVector = FeatureVector.interpolate(query.getFeatureVector(), fbVector, ORIG_QUERY_WEIGHT);
128 |
129 | System.err.println(fbVector);
130 |
131 | StringBuilder builder = new StringBuilder();
132 | Iterator terms = fbVector.iterator();
133 | while(terms.hasNext()) {
134 | String term = terms.next();
135 | if(term.length() < 2)
136 | continue;
137 | double prob = fbVector.getFeaturetWeight(term);
138 | builder.append(term + "^" + prob + " ");
139 | }
140 | queryText = builder.toString().trim();
141 |
142 | }
143 |
144 | List results = client.search(queryText, query.getQuerytweettime(), numResults);
145 | String runTag = params.getParamValue(RUNTAG_OPTION);
146 | if(runTag==null)
147 | runTag = DEFAULT_RUNTAG;
148 |
149 | int i = 1;
150 | Iterator hitIterator = results.iterator();
151 | while(hitIterator.hasNext()) {
152 | TResult hit = hitIterator.next();
153 | out.println(String.format("%s Q0 %s %d %f %s", query.getTitle(), hit.getId(), i,
154 | hit.getRsv(), runTag));
155 |
156 | if(i++ >= numResults)
157 | break;
158 | }
159 |
160 | }
161 | out.close();
162 | }
163 | }
164 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/searchsource/IndexWrapperMicroblogApi.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.searchsource;
2 |
3 | import java.util.HashMap;
4 | import java.util.Iterator;
5 | import java.util.List;
6 | import java.util.Map;
7 |
8 |
9 |
10 | import cc.twittertools.search.api.TrecSearchThriftClient;
11 | import cc.twittertools.thrift.gen.TResult;
12 | import edu.illinois.lis.document.FeatureVector;
13 |
14 |
15 |
16 | public class IndexWrapperMicroblogApi {
17 | // API-specific variables
18 | private String hostname;
19 | private int port;
20 | private String groupId;
21 | private String authToken;
22 |
23 | private Map seenDocs; // we store the text of any docs we've harvested. e.g. for FB.
24 |
25 | private TrecSearchThriftClient client;
26 |
27 |
28 | public IndexWrapperMicroblogApi(String hostname, int port, String groupId, String authToken) {
29 | this.hostname = hostname;
30 | this.port = port;
31 | this.groupId = groupId;
32 | this.authToken = authToken;
33 |
34 | seenDocs = new HashMap();
35 |
36 | try {
37 | client = new TrecSearchThriftClient(hostname, port, groupId, authToken);
38 | } catch (Exception e) {
39 |
40 | }
41 | }
42 |
43 | public double docCount() {
44 | return 0;
45 | }
46 |
47 |
48 | public double docFreq(String arg0) {
49 | return 0;
50 | }
51 |
52 | public double termFreq(String arg0) {
53 | return 0;
54 | }
55 |
56 | public double termTokenCount() {
57 | return 0;
58 | }
59 |
60 | public double termTypeCount() {
61 | return 0;
62 | }
63 |
64 | public Object getActualIndex() {
65 | return null;
66 | }
67 |
68 | public FeatureVector getDocVector(String docId) {
69 | if(seenDocs.containsKey(docId))
70 | return new FeatureVector(seenDocs.get(docId), null);
71 |
72 | // we should also be able to ping the API to get docs we haven't already seen
73 | return null;
74 | }
75 |
76 | public List runQuery(String query, long upperBoundTime, int count) {
77 | List results = null;
78 | try {
79 | results = client.search(query,upperBoundTime, count);
80 |
81 | // store our text for future reference
82 | Iterator resultIterator = results.iterator();
83 | while(resultIterator.hasNext()) {
84 | TResult result = resultIterator.next();
85 | seenDocs.put(Long.toString(result.getId()), result.getText());
86 | }
87 | } catch (Exception e) {
88 |
89 | }
90 | return results;
91 | }
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ExtractGqueriesFromTrecFormat.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 |
4 |
5 |
6 | import java.io.File;
7 |
8 | import com.google.gson.Gson;
9 | import com.google.gson.GsonBuilder;
10 | import com.google.gson.JsonArray;
11 | import com.google.gson.JsonObject;
12 |
13 | import edu.illinois.lis.query.TrecTemporalTopicSet;
14 |
15 |
16 | /**
17 | * creates a simple set of gQueries from the official TREC MB topic file
18 | *
19 | * @author Miles Efron
20 | *
21 | */
22 | public class ExtractGqueriesFromTrecFormat {
23 |
24 | private JsonObject outputObjects = null;
25 | private String pathToTrecTopics;
26 |
27 | public ExtractGqueriesFromTrecFormat(String pathToTrecTopics) {
28 | this.pathToTrecTopics = pathToTrecTopics;
29 | outputObjects = new JsonObject();
30 | }
31 |
32 | public void harvest() {
33 | TrecTemporalTopicSet topicsFile = null;
34 | try {
35 | topicsFile = TrecTemporalTopicSet.fromFile(new File(pathToTrecTopics));
36 | } catch (Exception e) {
37 | e.printStackTrace();
38 | }
39 |
40 | JsonArray outputJsonArray = new JsonArray();
41 | for(edu.illinois.lis.query.TrecTemporalTopic query : topicsFile) {
42 |
43 |
44 | JsonObject outputQueryObject = new JsonObject();
45 | outputQueryObject.addProperty("title", query.getId());
46 | outputQueryObject.addProperty("text", query.getQuery());
47 | outputQueryObject.addProperty("epoch", Double.toString(query.getEpoch()));
48 | outputQueryObject.addProperty("querytweettime", Long.toString(query.getQueryTweetTime()));
49 |
50 | String text = query.getQuery();
51 | String[] toks = text.split(" ");
52 |
53 | JsonArray modelArray = new JsonArray();
54 | for(String tok : toks) {
55 | JsonObject tupleObject = new JsonObject();
56 | tupleObject.addProperty("weight", 1.0);
57 | tupleObject.addProperty("feature", tok);
58 | modelArray.add(tupleObject);
59 | }
60 | outputQueryObject.add("model", modelArray);
61 |
62 |
63 | outputJsonArray.add(outputQueryObject);
64 | }
65 | outputObjects.add("queries", outputJsonArray);
66 | }
67 |
68 |
69 | public String toString() {
70 | Gson gson = new GsonBuilder().setPrettyPrinting().create();
71 | String json = gson.toJson(outputObjects);
72 | return json;
73 | }
74 |
75 |
76 |
77 |
78 | public static void main(String[] args) throws Exception {
79 | String trecQueryPath = args[0];
80 |
81 | ExtractGqueriesFromTrecFormat harvester = new ExtractGqueriesFromTrecFormat(trecQueryPath);
82 | harvester.harvest();
83 |
84 | System.out.println(harvester);
85 | }
86 |
87 |
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/KeyValuePair.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 | public class KeyValuePair implements Scorable {
4 | private String key;
5 | private double value;
6 |
7 | public KeyValuePair(String key, double value) {
8 | this.key = key;
9 | this.value = value;
10 | }
11 |
12 | public String getKey() {
13 | return key;
14 | }
15 |
16 | @Override
17 | public String toString() {
18 | StringBuilder b = new StringBuilder(value + "\t" + key);
19 | return b.toString();
20 | }
21 |
22 | public void setScore(double score) {
23 | this.value = score;
24 | }
25 |
26 | public double getScore() {
27 | return value;
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ListUtils.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 | import java.util.Iterator;
4 | import java.util.List;
5 |
public class ListUtils {

	/**
	 * Copies a list of boxed doubles into a new primitive double array,
	 * preserving order.
	 *
	 * @param x list to copy; must not contain null elements
	 * @return a new array of the same length as x (length 0 for an empty list)
	 */
	public static double[] listToArray(List<Double> x) {
		double[] a = new double[x.size()];
		int i = 0;
		for (double v : x) {
			a[i++] = v;
		}
		return a;
	}
}
18 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/LuceneQuery.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 | import java.util.Iterator;
4 |
5 | import edu.illinois.lis.document.FeatureVector;
6 | import edu.illinois.lis.query.GQuery;
7 |
8 | public class LuceneQuery {
9 | public static String gQueryToLucene(GQuery gQuery, int k) {
10 | FeatureVector mainVector = new FeatureVector(gQuery.getText(), null);
11 | mainVector.normalizeToOne();
12 | FeatureVector fbVector = gQuery.getFeatureVector();
13 | fbVector.pruneToSize(k);
14 | fbVector.normalizeToOne();
15 | FeatureVector finalVector = FeatureVector.interpolate(mainVector, fbVector, 0.5);
16 | StringBuilder b = new StringBuilder();
17 | Iterator terms = finalVector.iterator();
18 | while(terms.hasNext()) {
19 | String term = terms.next();
20 | double weight = finalVector.getFeaturetWeight(term);
21 | b.append(term + "^" + weight + " ");
22 | }
23 | return b.toString().trim();
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ParameterBroker.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.util.HashMap;
6 | import java.util.Iterator;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 | import java.util.Set;
10 |
11 |
12 | import com.google.gson.JsonElement;
13 | import com.google.gson.JsonObject;
14 | import com.google.gson.JsonParser;
15 |
16 | /**
17 | * N.B. All params are stored as strings. It is the responsibility of calling classes to transform into
18 | * appropriate data types.
19 | * e.g. mu = Double.parseDouble(paramBroker.getParamValue("mu")
20 | *
21 | * @author Miles Efron
22 | *
23 | */
24 | public class ParameterBroker {
25 |
26 | private static final JsonParser JSON_PARSER = new JsonParser();
27 | private Map params;
28 |
29 |
30 |
31 | /**
32 | * constructor where we initialize from a json file of structure:
33 | * {
34 | * "param1":"value1",
35 | * "param2":"value2"
36 | * }
37 | *
38 | * @param pathToJson
39 | */
40 | public ParameterBroker(String pathToJson) {
41 | params = new HashMap();
42 | JsonObject json = null;
43 | try {
44 | json = (JsonObject) JSON_PARSER.parse(new BufferedReader(new FileReader(pathToJson)));
45 | } catch (Exception e) {
46 | System.err.println("died trying to parse json file: " + pathToJson);
47 | System.exit(-1);
48 | }
49 |
50 | Set> jsonEntries = json.entrySet();
51 | Iterator> entryIterator = jsonEntries.iterator();
52 | while(entryIterator.hasNext()) {
53 | Entry entry = entryIterator.next();
54 | params.put(entry.getKey(), entry.getValue().getAsString());
55 | System.setProperty(entry.getKey(), entry.getValue().getAsString());
56 | }
57 | }
58 |
59 |
60 | public String getParamValue(String paramName) {
61 | if(!params.containsKey(paramName))
62 | return null;
63 | return params.get(paramName);
64 | }
65 |
66 | public void setParam(String name, String value) {
67 | params.put(name, value);
68 | }
69 |
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Qrels.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 | import java.io.File;
4 | import java.io.FileReader;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.Iterator;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Set;
11 | import java.util.regex.Pattern;
12 |
13 | import org.apache.commons.io.IOUtils;
14 |
15 | public class Qrels {
16 |
17 | public static final Pattern SPACE_PATTERN = Pattern.compile(" ", Pattern.DOTALL);
18 |
19 | private static final int QUERY_COLUMN = 0;
20 | private static final int DOCNO_COLUMN = 2;
21 | private static final int REL_COLUMN = 3;
22 |
23 | private Map> rel;
24 | private int minRel = 1;
25 |
26 | public Qrels(String pathToQrelsFile) {
27 | try {
28 |
29 | rel = new HashMap>();
30 |
31 | List lines = IOUtils.readLines(new FileReader(new File(pathToQrelsFile)));
32 | Iterator linesIt = lines.iterator();
33 | while(linesIt.hasNext()) {
34 | String[] toks = SPACE_PATTERN.split(linesIt.next());
35 | if(toks==null || toks.length != 4) {
36 | System.err.println("bad qrels line");
37 | continue;
38 | }
39 | String query = toks[QUERY_COLUMN];
40 | String docno = toks[DOCNO_COLUMN];
41 | int r = Integer.parseInt(toks[REL_COLUMN]);
42 | if(r >= minRel) {
43 | Set relDocs = null;
44 | if(!rel.containsKey(query)) {
45 | relDocs = new HashSet();
46 | } else {
47 | relDocs = rel.get(query);
48 | }
49 | relDocs.add(docno);
50 | rel.put(query, relDocs);
51 | } else {
52 | }
53 | }
54 | } catch (Exception e) {
55 | System.err.println("died trying to read qrel file: " + pathToQrelsFile);
56 | System.exit(-1);
57 | }
58 | }
59 |
60 | public boolean isRel(String query, String docno) {
61 | if(!rel.containsKey(query)) {
62 | System.err.println("no relevant documents found for query " + query);
63 | return false;
64 | }
65 | return rel.get(query).contains(docno);
66 | }
67 |
68 | public Set getRelDocs(String query) {
69 | if(!rel.containsKey(query)) {
70 | System.err.println("no relevant documents found for query " + query);
71 | return null;
72 | }
73 | return rel.get(query);
74 | }
75 |
76 | public double numRel(String query) {
77 | if(!rel.containsKey(query)) {
78 | System.err.println("no relevant documents found for query " + query);
79 | return 0.0;
80 | }
81 | return (double)rel.get(query).size();
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Scorable.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
/**
 * Something that carries a mutable double score (e.g. a term/weight pair);
 * ScorableComparator orders collections of these by score.
 */
public interface Scorable {

	/** Replaces this object's score with the given value. */
	public void setScore(double score);

	/** Returns this object's current score. */
	public double getScore();
}
9 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/ScorableComparator.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 | import java.util.Comparator;
4 |
5 |
6 | public class ScorableComparator implements Comparator{
7 | private boolean decreasing = true;
8 |
9 | public ScorableComparator(boolean decreasing) {
10 | this.decreasing = decreasing;
11 | }
12 | public int compare(Scorable x, Scorable y) {
13 | double xVal = x.getScore();
14 | double yVal = y.getScore();
15 |
16 | if(decreasing) {
17 | return (xVal > yVal ? -1 : (xVal == yVal ? 0 : 1));
18 | } else {
19 | return (xVal < yVal ? -1 : (xVal == yVal ? 0 : 1));
20 | }
21 |
22 | }
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/java/edu/illinois/lis/utils/Stopper.java:
--------------------------------------------------------------------------------
1 | package edu.illinois.lis.utils;
2 |
3 | import java.io.FileInputStream;
4 | import java.util.HashSet;
5 | import java.util.Iterator;
6 | import java.util.List;
7 | import java.util.Set;
8 | import java.util.regex.Pattern;
9 |
10 | import org.apache.commons.io.IOUtils;
11 |
public class Stopper {

	public static final Pattern SPACE_PATTERN = Pattern.compile(" ", Pattern.DOTALL);

	// the stoplist; membership is an exact, case-sensitive string match
	private Set<String> stopwords;

	/** Creates an empty stopper; add words with addStopword. */
	public Stopper() {
		stopwords = new HashSet<String>();
	}

	/**
	 * Loads a stoplist file with one stopword per line. Errors are printed
	 * and swallowed (best-effort, as in the original); the stream is always
	 * closed — the original leaked the FileInputStream.
	 *
	 * @param pathToStoplist path to the stoplist file
	 */
	public Stopper(String pathToStoplist) {
		stopwords = new HashSet<String>();
		java.io.BufferedReader in = null;
		try {
			// default platform charset, matching IOUtils.readLines(InputStream)
			in = new java.io.BufferedReader(new java.io.InputStreamReader(new FileInputStream(pathToStoplist)));
			String line;
			while ((line = in.readLine()) != null) {
				stopwords.add(line);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (Exception e) {
					// best-effort close
				}
			}
		}
	}

	/**
	 * Removes stopword tokens from space-separated text.
	 *
	 * @param text space-separated text
	 * @return the text with stopword tokens removed, trimmed
	 */
	public String apply(String text) {
		StringBuilder b = new StringBuilder();
		for (String tok : SPACE_PATTERN.split(text)) {
			if (!isStopWord(tok))
				b.append(tok).append(' ');
		}
		return b.toString().trim();
	}

	public void addStopword(String term) {
		stopwords.add(term);
	}

	public boolean isStopWord(String term) {
		return stopwords.contains(term);
	}

	public Set<String> asSet() {
		return stopwords;
	}
}
56 |
--------------------------------------------------------------------------------
/twitter-tools-rm3/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, A1
2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
4 |
5 | # Print the date in ISO 8601 format
6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n
7 | log4j.logger.com.ning.http.client=WARN
8 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/README.md:
--------------------------------------------------------------------------------
1 | microblogTTGBaseline
2 | ====================
3 |
4 | A baseline run using an (empirically determined) Jaccard similarity score to cluster tweets.
5 |
6 | 1. Build with `mvn package`
7 | 2. Set your `host`, `group`, and `package` parameters in `config/run_params.json`. Change any other parameters you want.
8 | 3. Run with `java -cp target/microblogTTGBaseline-0.0.1-SNAPSHOT-jar-with-dependencies.jar edu.gslis.ttg.main.RunTTGBaseline`
9 |
Note: Weighted scoring does not yet work properly.
11 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/config/run_params.json:
--------------------------------------------------------------------------------
1 | {
2 | "queries" : "./topics/topics.microblog-2013.json",
3 | "host" : HOST_NAME_HERE,
4 | "training_port" : 9090,
5 | "testing_port" : 9091,
6 | "num_results" : 1000,
7 | "group" : YOUR_GROUP_HERE,
8 | "token" : YOUR_TOKEN_HERE,
9 | "runtag" : "baseline",
10 | "jaccard_step" : 0.1,
11 | "training_queries" : "./topics/topics.ttg-training.json",
12 | "training_clusters" :
13 | "../data/clusters.training.microblog2011-2012.json",
14 | "qrels" :
15 | "../data/qrels.microblog2011-2012.txt",
16 | "evaluation_type" : "unweighted"
17 | }
18 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | edu.gslis
4 | microblogTTGBaseline
5 | 0.0.1-SNAPSHOT
6 |
7 | microblog TTG baseline
8 | http://maven.apache.org
9 |
10 |
11 | UTF-8
12 |
13 |
14 |
15 | nema-dev.lis.illinois.edu
16 | nema-dev.lis.illinois.edu-releases
17 | http://nema-dev.lis.illinois.edu/artifactory//ir-libs
18 |
19 |
20 | nema-dev.lis.illinois.edu
21 | nema-dev.lis.illinois.edu-snapshots
22 | http://nema-dev.lis.illinois.edu/artifactory//ir-libs
23 |
24 |
25 |
26 |
27 | ir-libs
28 | ir-libs
29 | http://nema-dev.lis.illinois.edu/artifactory/ir-libs/
30 |
31 | true
32 | never
33 |
34 |
35 | true
36 | never
37 |
38 |
39 |
40 |
41 | src
42 |
43 |
44 | maven-compiler-plugin
45 | 3.1
46 |
47 | 1.6
48 | 1.6
49 |
50 |
51 |
52 | maven-assembly-plugin
53 |
54 |
55 | jar-with-dependencies
56 |
57 |
58 |
59 |
60 | simple-command
61 | package
62 |
63 | attached
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 | indri
73 | indri
74 | 0.1
75 |
76 |
77 | edu.gslis
78 | ir-utils
79 | 0.0.1-SNAPSHOT
80 |
81 |
82 | cc.twittertools
83 | twitter-tools-core
84 | 1.4.1
85 |
86 |
87 | cc.twittertools
88 | twitter-tools
89 | 1.3.0
90 |
91 |
92 | com.googlecode.json-simple
93 | json-simple
94 | 1.1
95 |
96 |
97 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/Cluster.java:
--------------------------------------------------------------------------------
1 | package edu.gslis.ttg.clusters;
2 |
3 | import java.util.Arrays;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | import edu.gslis.eval.Qrels;
8 | import edu.gslis.queries.GQuery;
9 |
10 | public class Cluster {
11 | private Set members;
12 |
13 | public Cluster() {
14 | members = new HashSet();
15 | }
16 |
17 | public Cluster(long member) {
18 | members = new HashSet();
19 | members.add(member);
20 | }
21 |
22 | public void add(long member) {
23 | members.add(member);
24 | }
25 |
26 | public void add(Set newMembers) {
27 | members.addAll(newMembers);
28 | }
29 |
30 | public Set getMembers() {
31 | return members;
32 | }
33 |
34 | public long getFirstMember() {
35 | return members.iterator().next();
36 | }
37 |
38 | public boolean hasMember(long member) {
39 | return members.contains(member);
40 | }
41 |
42 | public int getWeight(GQuery query, Qrels qrels) {
43 | // hack to change e.g. MB01 to 01
44 | String q = String.valueOf(Integer.parseInt(query.getTitle().substring(2, query.getTitle().length())));
45 |
46 | int weight = 0;
47 | for (long member : members) {
48 | if (qrels.isRel(q, String.valueOf(member))) {
49 | int level = qrels.getRelLevel(q, String.valueOf(member));
50 | weight += level;
51 | }
52 | }
53 | return weight;
54 | }
55 |
56 | @Override
57 | public String toString() {
58 | return Arrays.deepToString(members.toArray());
59 | }
60 |
61 | public int size() {
62 | return members.size();
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/Clusters.java:
--------------------------------------------------------------------------------
1 | package edu.gslis.ttg.clusters;
2 |
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Iterator;
6 | import java.util.Map;
7 | import java.util.Set;
8 |
9 | public class Clusters implements Iterable {
10 | private Set clusters;
11 | private Map clusterMemberLookup;
12 |
13 | public Clusters() {
14 | clusters = new HashSet();
15 | clusterMemberLookup = new HashMap();
16 | }
17 |
18 | public void add(Cluster cluster) {
19 | clusters.add(cluster);
20 | for (long member : cluster.getMembers()) {
21 | clusterMemberLookup.put(member, cluster);
22 | }
23 | }
24 |
25 | public Set getClusters() {
26 | return clusters;
27 | }
28 |
29 | public boolean hasCluster(Cluster cluster) {
30 | return clusters.contains(cluster);
31 | }
32 |
33 | public Cluster findCluster(long member) {
34 | try {
35 | return clusterMemberLookup.get(member);
36 | } catch (NullPointerException e) {
37 | return null;
38 | }
39 | }
40 |
41 | public Set getAllClusteredResults() {
42 | return clusterMemberLookup.keySet();
43 | }
44 |
45 | // Merge cluster 2 into cluster 1 and update the clusterMemberLookup
46 | // Note: only call this function if cluster 1 is already in the clusters set
47 | // (cluster 2 can be new or existing)
48 | public void mergeExistingClusters(Cluster c1, Cluster c2) {
49 | c1.add(c2.getMembers());
50 | clusters.remove(c1);
51 | try {
52 | clusters.remove(c2);
53 | } catch (Exception e) {
54 | System.err.println("Unable to remove cluster 2 from clusters. Might be a new cluster.");
55 | }
56 | clusters.add(c1);
57 |
58 | updateClusterMembership(c1);
59 | }
60 |
61 | // Merge two new clusters into the clusters set
62 | public void mergeNewClusters(Cluster c1, Cluster c2) {
63 | c1.add(c2.getMembers());
64 | clusters.add(c1);
65 |
66 | updateClusterMembership(c1);
67 | }
68 |
69 | public void mergeMembers(long m1, long m2) {
70 | Cluster c1 = findCluster(m1);
71 | Cluster c2 = findCluster(m2);
72 | if (c1 == null && c2 == null) {
73 | c1 = new Cluster(m1);
74 | c2 = new Cluster(m2);
75 | mergeNewClusters(c1, c2);
76 | } else if (c1 == null) { // c2 exists
77 | c1 = new Cluster(m1);
78 | mergeExistingClusters(c2, c1);
79 | } else { // c1 exists
80 | if (c2 == null) {
81 | c2 = new Cluster(m2);
82 | }
83 | mergeExistingClusters(c1, c2);
84 | }
85 | }
86 |
87 | public int size() {
88 | return clusters.size();
89 | }
90 |
91 | @Override
92 | public Iterator iterator() {
93 | return clusters.iterator();
94 | }
95 |
96 | @Override
97 | public String toString() {
98 | String output = "";
99 | output += "[";
100 | Iterator it = clusters.iterator();
101 | while (it.hasNext()) {
102 | Cluster cluster = it.next();
103 | output += cluster.toString();
104 | if (it.hasNext()) {
105 | output += ", ";
106 | }
107 | }
108 | output += "]";
109 | return output;
110 | }
111 |
112 | private void updateClusterMembership(Cluster cluster) {
113 | for (long member : cluster.getMembers()) {
114 | clusterMemberLookup.put(member, cluster);
115 | }
116 | }
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/src/edu/gslis/ttg/clusters/clusterers/SimpleJaccardClusterer.java:
--------------------------------------------------------------------------------
1 | package edu.gslis.ttg.clusters.clusterers;
2 |
3 | import java.util.Iterator;
4 | import java.util.List;
5 | import java.util.NavigableMap;
6 |
7 | import cc.twittertools.thrift.gen.TResult;
8 | import edu.gslis.ttg.clusters.Clusters;
9 | import edu.gslis.ttg.jaccard.JaccardStore;
10 |
11 | public class SimpleJaccardClusterer {
12 |
13 | private List results;
14 | private JaccardStore jaccardScores;
15 |
16 | public SimpleJaccardClusterer(List results) {
17 | this.results = results;
18 | this.jaccardScores = computeJaccardSimilarity();
19 | }
20 |
21 | public Clusters cluster(double threshold) {
22 | Clusters clusters = new Clusters();
23 |
24 | NavigableMap> thresholdPairs = jaccardScores.getDocsGreaterThanScore(threshold);
25 | Iterator pairsIt = thresholdPairs.keySet().iterator();
26 | while (pairsIt.hasNext()) { // for each pair of documents matching this jaccard score
27 | List docPairs = thresholdPairs.get(pairsIt.next());
28 | Iterator docPairIt = docPairs.iterator();
29 | while (docPairIt.hasNext()) { //
30 | long[] docs = docPairIt.next();
31 | clusters.mergeMembers(docs[0], docs[1]);
32 | }
33 | }
34 |
35 | return clusters;
36 | }
37 |
38 | public List getResults() {
39 | return results;
40 | }
41 |
42 | public void setResults(List results) {
43 | this.results = results;
44 | }
45 |
46 | private JaccardStore computeJaccardSimilarity() {
47 | // compute jaccard similarity for each pair of results
48 | JaccardStore scores = new JaccardStore();
49 | for (int j = 0; j < results.size(); j++) {
50 | TResult doc1 = results.get(j);
51 | for (int k = j + 1; k < results.size(); k++) {
52 | TResult doc2 = results.get(k);
53 |
54 | double jaccardSim = JaccardStore.computeJaccardSimilarity(doc1.getText(), doc2.getText());
55 | scores.setScore(doc1.getId(), doc2.getId(), jaccardSim);
56 | }
57 | }
58 |
59 | return scores;
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/src/edu/gslis/ttg/jaccard/JaccardStore.java:
--------------------------------------------------------------------------------
1 | package edu.gslis.ttg.jaccard;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Arrays;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.NavigableMap;
10 | import java.util.Set;
11 | import java.util.TreeMap;
12 |
public class JaccardStore {

	// canonical (min,max) doc-id pair -> jaccard score.
	// The key is a List<Long> because a long[] key (as in the original)
	// uses identity-based equals/hashCode: a stored score could never be
	// retrieved by getScore, which then NPE'd on unboxing the null.
	private Map<List<Long>, Double> scores;

	// jaccard score -> all doc-id pairs stored with that score, sorted by score
	private TreeMap<Double, List<long[]>> scoreLookup;

	public JaccardStore() {
		scores = new HashMap<List<Long>, Double>();
		scoreLookup = new TreeMap<Double, List<long[]>>();
	}

	/**
	 * Returns the stored score for the pair (argument order is irrelevant).
	 * Throws NullPointerException if no score was stored for this pair.
	 */
	public double getScore(long doc1, long doc2) {
		return scores.get(key(doc1, doc2));
	}

	/** Records the score for the pair (argument order is irrelevant). */
	public void setScore(long doc1, long doc2, double score) {
		scores.put(key(doc1, doc2), score);
		if (scoreLookup.get(score) == null) {
			scoreLookup.put(score, new ArrayList<long[]>());
		}
		scoreLookup.get(score).add(ordered(doc1, doc2));
	}

	public List<long[]> getDocsForScore(double score) {
		return scoreLookup.get(score);
	}

	/** Returns all (score -> pairs) entries with score >= the given score. */
	public NavigableMap<Double, List<long[]>> getDocsGreaterThanScore(double score) {
		return scoreLookup.tailMap(score, true);
	}

	/** Number of distinct pairs with a stored score. */
	public int size() {
		return scores.size();
	}

	// canonical array form of a pair: smaller id first
	private long[] ordered(long doc1, long doc2) {
		if (doc1 < doc2) {
			return new long[] { doc1, doc2 };
		}
		return new long[] { doc2, doc1 };
	}

	// canonical map key for a pair: smaller id first, with value-based equality
	private List<Long> key(long doc1, long doc2) {
		List<Long> key = new ArrayList<Long>(2);
		if (doc1 < doc2) {
			key.add(doc1);
			key.add(doc2);
		} else {
			key.add(doc2);
			key.add(doc1);
		}
		return key;
	}

	/**
	 * Jaccard similarity |A∩B| / |A∪B| of two term sets.
	 * Note: returns NaN (0/0) when both sets are empty.
	 */
	public static double computeJaccardSimilarity(Set<String> doc1, Set<String> doc2) {
		Set<String> intersection = new HashSet<String>(doc1);
		Set<String> union = new HashSet<String>(doc1);

		intersection.retainAll(doc2);
		union.addAll(doc2);

		return intersection.size() / (double) union.size();
	}

	/**
	 * Jaccard similarity of two documents' lowercased alphanumeric term sets.
	 */
	public static double computeJaccardSimilarity(String doc1, String doc2) {
		return computeJaccardSimilarity(termSet(doc1), termSet(doc2));
	}

	// lowercased set of alphanumeric tokens of a document, empties removed
	private static Set<String> termSet(String doc) {
		String[] terms = doc.toLowerCase().split("[^A-Za-z0-9]");
		List<String> termList = new ArrayList<String>(Arrays.asList(terms));
		termList.removeAll(Arrays.asList("", null));
		return new HashSet<String>(termList);
	}

}
84 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/src/edu/gslis/ttg/searchers/SimpleSearcher.java:
--------------------------------------------------------------------------------
1 | package edu.gslis.ttg.searchers;
2 |
3 | import java.util.HashMap;
4 | import java.util.Iterator;
5 | import java.util.List;
6 | import java.util.Map;
7 |
8 | import cc.twittertools.search.api.TrecSearchThriftClient;
9 | import cc.twittertools.thrift.gen.TResult;
10 | import edu.gslis.queries.GQuery;
11 | import edu.gslis.textrepresentation.FeatureVector;
12 |
13 | public class SimpleSearcher {
14 |
15 | private TrecSearchThriftClient client;
16 | private int maxResults;
17 |
18 | public SimpleSearcher(TrecSearchThriftClient client, int maxResults) {
19 | this.client = client;
20 | this.maxResults = maxResults;
21 | }
22 |
23 |
24 | public Map search(GQuery query) {
25 | // clean up query
26 | String queryText = query.getText();
27 | queryText = queryText.replaceAll("[,'\\.\\?]", " ");
28 | queryText = queryText.replaceAll(" ", " ").trim();
29 |
30 | // need to lowercase the query vector
31 | FeatureVector temp = new FeatureVector(null);
32 | Iterator qTerms = query.getFeatureVector().iterator();
33 | while(qTerms.hasNext()) {
34 | String term = qTerms.next();
35 | temp.addTerm(term.toLowerCase(), query.getFeatureVector().getFeatureWeight(term));
36 | }
37 | temp.normalize();;
38 | query.setFeatureVector(temp);
39 |
40 | System.err.println(query.getTitle()+": "+queryText);
41 |
42 | // perform search
43 | List results = null;
44 | try {
45 | results = client.search(queryText, Long.parseLong(query.getMetadata("querytweettime")), maxResults);
46 | } catch (Exception e) {
47 | System.err.println("Error searching.");
48 | System.exit(-1);
49 | }
50 |
51 | // set cutoff score heuristically
52 | double topScore = results.get(0).getRsv();
53 | double cutOffScore = topScore / 2;
54 |
55 | // record hits, removing duplicates
56 | int i = 1;
57 | Map seenMap = new HashMap();
58 | Iterator hitIterator = results.iterator();
59 | while(hitIterator.hasNext()) {
60 | TResult hit = hitIterator.next();
61 | if (hit.getRsv() < cutOffScore) {
62 | break;
63 | }
64 |
65 | long docId = hit.id;
66 | if (seenMap.containsKey(docId))
67 | continue;
68 | seenMap.put(docId, hit);
69 |
70 | if(i++ >= maxResults)
71 | break;
72 | }
73 |
74 | return seenMap;
75 | }
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/twitter-tools-ttgbaseline/topics/topics.ttg-training.json:
--------------------------------------------------------------------------------
1 | {
2 | "queries": [
3 | {
4 | "title": "MB03",
5 | "text": "Haiti Aristide return",
6 | "epoch": "1.297200733E9",
7 | "querytweettime": "35088534306033665",
8 | "model": [
9 | {
10 | "weight": 1.0,
11 | "feature": "Haiti"
12 | },
13 | {
14 | "weight": 1.0,
15 | "feature": "Aristide"
16 | },
17 | {
18 | "weight": 1.0,
19 | "feature": "return"
20 | }
21 | ]
22 | },
23 | {
24 | "title": "MB21",
25 | "text": "Emanuel residency court rulings",
26 | "epoch": "1.29627021E9",
27 | "querytweettime": "31185639047172097",
28 | "model": [
29 | {
30 | "weight": 1.0,
31 | "feature": "Emanuel"
32 | },
33 | {
34 | "weight": 1.0,
35 | "feature": "residency"
36 | },
37 | {
38 | "weight": 1.0,
39 | "feature": "court"
40 | },
41 | {
42 | "weight": 1.0,
43 | "feature": "rulings"
44 | }
45 | ]
46 | },
47 | {
48 | "title": "MB22",
49 | "text": "healthcare law unconstitutional",
50 | "epoch": "1.296598654E9",
51 | "querytweettime": "32563233118224385",
52 | "model": [
53 | {
54 | "weight": 1.0,
55 | "feature": "healthcare"
56 | },
57 | {
58 | "weight": 1.0,
59 | "feature": "law"
60 | },
61 | {
62 | "weight": 1.0,
63 | "feature": "unconstitutional"
64 | }
65 | ]
66 | },
67 | {
68 | "title": "MB26",
69 | "text": "US unemployment",
70 | "epoch": "1.296828651E9",
71 | "querytweettime": "33527910379814912",
72 | "model": [
73 | {
74 | "weight": 1.0,
75 | "feature": "US"
76 | },
77 | {
78 | "weight": 1.0,
79 | "feature": "unemployment"
80 | }
81 | ]
82 | },
83 | {
84 | "title": "MB42",
85 | "text": "Holland Iran envoy recall",
86 | "epoch": "1.297111633E9",
87 | "querytweettime": "34714824982134784",
88 | "model": [
89 | {
90 | "weight": 1.0,
91 | "feature": "Holland"
92 | },
93 | {
94 | "weight": 1.0,
95 | "feature": "Iran"
96 | },
97 | {
98 | "weight": 1.0,
99 | "feature": "envoy"
100 | },
101 | {
102 | "weight": 1.0,
103 | "feature": "recall"
104 | }
105 | ]
106 | },
107 | {
108 | "title": "MB51",
109 | "text": "British Government cuts",
110 | "epoch": "1.297209406E9",
111 | "querytweettime": "35124912364457984",
112 | "model": [
113 | {
114 | "weight": 1.0,
115 | "feature": "British"
116 | },
117 | {
118 | "weight": 1.0,
119 | "feature": "Government"
120 | },
121 | {
122 | "weight": 1.0,
123 | "feature": "cuts"
124 | }
125 | ]
126 | },
127 | {
128 | "title": "MB57",
129 | "text": "Chicago blizzard",
130 | "epoch": "1.296683586E9",
131 | "querytweettime": "32919462151720960",
132 | "model": [
133 | {
134 | "weight": 1.0,
135 | "feature": "Chicago"
136 | },
137 | {
138 | "weight": 1.0,
139 | "feature": "blizzard"
140 | }
141 | ]
142 | },
143 | {
144 | "title": "MB66",
145 | "text": "Journalists treatment in Egypt",
146 | "epoch": "1.296865923E9",
147 | "querytweettime": "33684239400566784",
148 | "model": [
149 | {
150 | "weight": 1.0,
151 | "feature": "Journalists"
152 | },
153 | {
154 | "weight": 1.0,
155 | "feature": "treatment"
156 | },
157 | {
158 | "weight": 1.0,
159 | "feature": "in"
160 | },
161 | {
162 | "weight": 1.0,
163 | "feature": "Egypt"
164 | }
165 | ]
166 | },
167 | {
168 | "title": "MB68",
169 | "text": "Charlie Sheen rehab",
170 | "epoch": "1.296591293E9",
171 | "querytweettime": "32532358276063232",
172 | "model": [
173 | {
174 | "weight": 1.0,
175 | "feature": "Charlie"
176 | },
177 | {
178 | "weight": 1.0,
179 | "feature": "Sheen"
180 | },
181 | {
182 | "weight": 1.0,
183 | "feature": "rehab"
184 | }
185 | ]
186 | },
187 | {
188 | "title": "MB88",
189 | "text": "Kings Speech awards",
190 | "epoch": "1.297126104E9",
191 | "querytweettime": "34775520600129536",
192 | "model": [
193 | {
194 | "weight": 1.0,
195 | "feature": "Kings"
196 | },
197 | {
198 | "weight": 1.0,
199 | "feature": "Speech"
200 | },
201 | {
202 | "weight": 1.0,
203 | "feature": "awards"
204 | }
205 | ]
206 | }
207 | ]
208 | }
--------------------------------------------------------------------------------