├── .gitignore ├── CLEF.md ├── CW09.md ├── CW12.md ├── Gov2.md ├── README.md ├── ec2-setup.md ├── eval ├── gdeval ├── statAP_MQ_eval_v3.pl └── trec_eval.9.0.tar.gz ├── resources └── CLEF │ ├── bg_sl.txt │ ├── de_sl.txt │ ├── es_sl.txt │ ├── fa_sl.txt │ ├── fi_sl.txt │ ├── fr_sl.txt │ ├── hu_sl.txt │ ├── it_sl.txt │ ├── nl_sl.txt │ ├── pt_sl.txt │ ├── ru_sl.txt │ └── sv_sl.txt ├── results └── CLEF │ ├── indri │ └── info.txt │ ├── lucene │ └── info.txt │ └── terrier │ └── info.txt ├── runs └── CLEF │ ├── indri │ └── info.txt │ ├── lucene │ └── info.txt │ └── terrier │ └── info.txt ├── systems ├── ATIRE │ ├── README.md │ ├── cw09.sh │ ├── cw12.sh │ ├── dotgov2.sh │ └── setup.sh ├── JASS │ ├── cw09.sh │ ├── cw12.sh │ ├── dotgov2.sh │ └── setup.sh ├── MG4J │ ├── README.md │ ├── cw12-bm25.sh │ ├── cw12-eval-pos.sh │ ├── cw12-eval.sh │ ├── cw12-index-pos.sh │ ├── cw12-index.sh │ ├── genqueries.sh │ ├── genqueriespos.sh │ ├── gensubsets.rb │ ├── gensubsetspos.rb │ ├── gov2-bm25.sh │ ├── gov2-eval-pos.sh │ ├── gov2-eval.sh │ ├── gov2-index-pos.sh │ ├── gov2-index.sh │ └── logback.xml ├── common.sh ├── galago │ ├── dotgov2.sh │ └── make_query_json.py ├── indri │ ├── clean.sh │ ├── dm.pl │ ├── dotgov2.sh │ ├── index-clef.sh │ ├── index-clef_ReadMe.txt │ ├── indexParaSP_bg │ ├── indexParaSP_de │ ├── indexParaSP_es │ ├── indexParaSP_fa │ ├── indexParaSP_fi │ ├── indexParaSP_fr │ ├── indexParaSP_hu │ ├── indexParaSP_it │ ├── indexParaSP_nl │ ├── indexParaSP_pt │ ├── indexParaSP_ru │ ├── indexParaSP_sv │ ├── queryParaLMSP_bg │ ├── queryParaLMSP_de │ ├── queryParaLMSP_es │ ├── queryParaLMSP_fa │ ├── queryParaLMSP_fi │ ├── queryParaLMSP_fr │ ├── queryParaLMSP_hu │ ├── queryParaLMSP_it │ ├── queryParaLMSP_nl │ ├── queryParaLMSP_pt │ ├── queryParaLMSP_ru │ ├── queryParaLMSP_sv │ └── query_LM.sh ├── lucene │ ├── clef.sh │ ├── clef │ │ ├── LICENSE.txt │ │ ├── README.md │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ ├── java │ │ │ │ └── it │ │ │ │ │ └── unipd │ │ │ │ │ └── dei │ │ │ │ 
│ └── ims │ │ │ │ │ └── lucene │ │ │ │ │ └── clef │ │ │ │ │ ├── AnalyzerFactory.java │ │ │ │ │ ├── App.java │ │ │ │ │ ├── applications │ │ │ │ │ ├── BatchRetrieval.java │ │ │ │ │ └── BuildIndex.java │ │ │ │ │ └── parser │ │ │ │ │ ├── ClefDocParser.java │ │ │ │ │ └── ClefQQParser.java │ │ │ └── resources │ │ │ │ ├── logback.xml │ │ │ │ └── lucene-clef.properties │ │ │ └── test │ │ │ ├── java │ │ │ └── it │ │ │ │ └── unipd │ │ │ │ └── dei │ │ │ │ └── ims │ │ │ │ └── lucene │ │ │ │ └── clef │ │ │ │ └── parser │ │ │ │ └── ClefQQParserTest.java │ │ │ └── resources │ │ │ └── topics │ │ │ ├── bg_topics.xml │ │ │ ├── de_topics.xml │ │ │ ├── es_topics.xml │ │ │ ├── fa_topics.xml │ │ │ ├── fi_topics.xml │ │ │ ├── fr_topics.xml │ │ │ ├── hu_topics.xml │ │ │ ├── it_topics.xml │ │ │ ├── nl_topics.xml │ │ │ ├── pt_topics.xml │ │ │ ├── ru_topics.xml │ │ │ └── sv_topics.xml │ ├── clef_experiments.sh │ ├── clef_runs │ ├── dotgov2.sh │ ├── ingester │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── luceneingester │ │ │ ├── Args.java │ │ │ ├── ForceMerge.java │ │ │ ├── IndexStats.java │ │ │ ├── IndexThreads.java │ │ │ ├── NoPositionsTextField.java │ │ │ ├── TrecDriver.java │ │ │ └── TrecIngester.java │ └── lib │ │ ├── lucene-analyzers-common-5.2.1.jar │ │ ├── lucene-backward-codecs-5.2.1.jar │ │ ├── lucene-benchmark-5.2.1.jar │ │ ├── lucene-core-5.2.1.jar │ │ └── lucene-queryparser-5.2.1.jar └── terrier │ ├── clef_experiments.sh │ ├── dotgov2-prox.sh │ ├── dotgov2-qe.sh │ ├── dotgov2-ranker.sh │ └── dotgov2.sh └── topics-and-qrels ├── CLEF ├── qrels │ ├── bg_qrels.txt │ ├── de_qrels.txt │ ├── es_qrels.txt │ ├── fa_qrels.txt │ ├── fi_qrels.txt │ ├── fr_qrels.txt │ ├── hu_qrels.txt │ ├── it_qrels.txt │ ├── nl_qrels.txt │ ├── pt_qrels.txt │ ├── ru_qrels.txt │ └── sv_qrels.txt └── topics │ ├── bg_topics.xml │ ├── de_topics.xml │ ├── es_topics.xml │ ├── fa_topics.xml │ ├── fi_topics.xml │ ├── fr_topics.xml │ ├── hu_topics.xml │ ├── it_topics.xml │ ├── nl_topics.xml │ ├── 
pt_topics.xml │ ├── ru_topics.xml │ └── sv_topics.xml ├── README.md ├── prels.web.1-50.txt ├── qrels.701-750.txt ├── qrels.751-800.txt ├── qrels.801-850.txt ├── qrels.web.101-150.txt ├── qrels.web.151-200.txt ├── qrels.web.201-250.txt ├── qrels.web.251-300.txt ├── qrels.web.51-100.txt ├── topics.701-750.txt ├── topics.751-800.txt ├── topics.801-850.txt ├── topics.web.1-50.txt ├── topics.web.101-150.txt ├── topics.web.151-200.txt ├── topics.web.201-250.txt ├── topics.web.251-300.txt └── topics.web.51-100.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | eval/trec_eval.9.0/ -------------------------------------------------------------------------------- /CW09.md: -------------------------------------------------------------------------------- 1 | # ClueWeb09 Category B Comparisons 2 | What follows is an initial comparison of selected information retrieval systems on the ClueWeb09 Category B collection using scripts provided by authors/leading contributors of those systems. The systems are listed in alphabetical order. 3 | 4 | ## Indexing 5 | Two metrics for indexing are reported below: the size of the generated index, and the time taken to generate that index. 6 | 7 | System | Type | Size | Time | Terms | Postings | Tokens | 8 | :-------|:------------------|--------:|----------------------:|------:|---------:|--------: 9 | ATIRE | Count | 33.3 GB | 1h 57m | | | | 10 | ATIRE | Count + Quantized | 43.2 GB | 3h 12m | | | | 11 | JASS | | 66.1 GB | ATIRE Quantized + 12m | | | | 12 | 13 | ###### ATIRE 14 | + The quantized index pre-calculates the BM25 scores at indexing time and stores these instead of term frequencies, more about the quantization in ATIRE can be found in [Crane et al. (2013)](http://dl.acm.org/citation.cfm?id=2507860). 15 | + The quantization is performed single threaded although easily parallelized. 16 | + Both indexes were not stemmed. 
17 | + Both indexes were pruned of SGML tags, which are used for search-time features that typically go unused. 18 | + Both indexes' postings lists are stored impact-ordered, with docids being compressed using variable-byte compression after being delta encoded. 19 | 20 | ## Retrieval 21 | Both retrieval efficiency (by query latency) and effectiveness (MAP@1000) were measured on four query sets: 1-50, 51-100, 101-150, and 151-200. 22 | 23 | ### Retrieval Models 24 | 25 | ###### ATIRE 26 | + ATIRE uses a modified version of BM25, described [here](http://www.cs.otago.ac.nz/homepages/andrew/papers/2012-1.pdf). 27 | + Searching was done using top-k search, also described in the above paper. 28 | + This is not early termination; all documents for all terms in the query still get scored. 29 | + BM25 parameters were set to the default for ATIRE, `k1=0.9 b=0.4`. 30 | + Only stopping of tags was performed; this has no effect on search. 31 | 32 | ### Retrieval Latency 33 | The table below shows the average search time across queries by query set. The search times were taken from the internal reporting of each system. 34 | 35 | System | Model | Index | Topics 1-50 | Topics 51-100 | Topics 101-150 | Topics 151-200 36 | :-------|:---------------|-------------------|------------:|--------------:|---------------:|--------------: 37 | ATIRE | BM25 | Count | | 651ms | 760ms | 452ms 38 | ATIRE | Quantized BM25 | Count + Quantized | | 182ms | 272ms | 179ms 39 | JASS | | | | 175ms | 234ms | 168ms 40 | JASS | 5M Postings | | | 63ms | 105ms | 64ms 41 | 42 | ### Retrieval Effectiveness 43 | The systems generated run files to be consumed by the `trec_eval` tool. Each system was evaluated on the top 1000 results for each query, and the table below shows the MAP scores for the systems. 
44 | 45 | System | Model | Index | Topics 1-50 | Topics 51-100 | Topics 101-150 | Topics 151-200 46 | :-------|:---------------|-------------------|------------:|--------------:|---------------:|--------------: 47 | ATIRE | BM25 | Count | | 0.1137 | 0.1082 | 0.0982 48 | ATIRE | Quantized BM25 | Count + Quantized | | 0.1154 | 0.1070 | 0.0998 49 | JASS | | | | 0.1154 | 0.1070 | 0.0998 50 | JASS | 5M Postings | | | 0.1151 | 0.1046 | 0.0973 51 | 52 | ##### Statistical Analysis 53 | 54 | **TODO:** Need to run statistical analyses. 55 | -------------------------------------------------------------------------------- /CW12.md: -------------------------------------------------------------------------------- 1 | # ClueWeb12 B13 Comparisons 2 | What follows is an initial comparison of selected information retrieval systems on the ClueWeb12 B13 collection using scripts provided by authors/leading contributors of those systems. The systems are listed in alphabetical order. 3 | 4 | ## Indexing 5 | Two metrics for indexing are reported below: the size of the generated index, and the time taken to generate that index. 6 | 7 | System | Type | Size | Time | Terms | Postings | Tokens | 8 | :-------|:------------------|--------:|----------------------:|------:|---------:|--------: 9 | ATIRE | Count | 42.4 GB | 3h 03m | | | | 10 | ATIRE | Count + Quantized | 53.4 GB | 4h 25m | | | | 11 | JASS | | 83.2 GB | ATIRE Quantized + 32m | | | | 12 | MG4J | Count | 17 GB | 2h 38m | 133M | 12.7G | | 13 | MG4J | Position | 58 GB | 3h 20m | 133M | 12.7G | 33.8G | 14 | 15 | ###### ATIRE 16 | + The quantized index pre-calculates the BM25 scores at indexing time and stores these instead of term frequencies, more about the quantization in ATIRE can be found in [Crane et al. (2013)](http://dl.acm.org/citation.cfm?id=2507860). 17 | + The quantization is performed single threaded although easily parallelized. 18 | + Both indexes were not stemmed. 
19 | + Both indexes were pruned of SGML tags, which are used for search-time features that typically go unused. 20 | + Both indexes' postings lists are stored impact-ordered, with docids being compressed using variable-byte compression after being delta encoded. 21 | 22 | ## Retrieval 23 | Both retrieval efficiency (by query latency) and effectiveness (MAP@1000) were measured on two query sets: 201-250, and 251-300. 24 | 25 | ### Retrieval Models 26 | 27 | ###### ATIRE 28 | + ATIRE uses a modified version of BM25, described [here](http://www.cs.otago.ac.nz/homepages/andrew/papers/2012-1.pdf). 29 | + Searching was done using top-k search, also described in the above paper. 30 | + This is not early termination; all documents for all terms in the query still get scored. 31 | + BM25 parameters were set to the default for ATIRE, `k1=0.9 b=0.4`. 32 | + Only stopping of tags was performed; this has no effect on search. 33 | 34 | ###### MG4J 35 | 36 | See the description for the [Gov2 runs](Gov2.md). 37 | 38 | ### Retrieval Latency 39 | The table below shows the average search time across queries by query set. The search times were taken from the internal reporting of each system. 40 | 41 | System | Model | Index | Topics 201-250 | Topics 251-300 42 | :-------|:---------------|-------------------|---------------:|--------------: 43 | ATIRE | BM25 | Count | 809ms | 788ms 44 | ATIRE | Quantized BM25 | Count + Quantized | 290ms | 296ms 45 | JASS | | | 222ms | 261ms 46 | JASS | 5M Postings | | 103ms | 88ms 47 | MG4J | BM25 | Count | 706ms | 570ms 48 | MG4J | Model B | Count | 60ms | 73ms 49 | MG4J | Model B+ | Position | 122ms | 258ms 50 | 51 | ### Retrieval Effectiveness 52 | The systems generated run files to be consumed by the `trec_eval` tool. Each system was evaluated on the top 1000 results for each query, and the table below shows the MAP scores for the systems. 
53 | 54 | System | Model | Index | Topics 201-250 | Topics 251-300 55 | :-------|:---------------|-------------------|---------------:|--------------: 56 | ATIRE | BM25 | Count | 0.0439 | 0.0196 57 | ATIRE | Quantized BM25 | Count + Quantized | 0.0429 | 0.0201 58 | JASS | | | 0.0429 | 0.0201 59 | JASS | 5M Postings | | 0.0393 | 0.0193 60 | MG4J | BM25 | Count | 0.0410 | 0.0207 61 | MG4J | Model B | Count | 0.0418 | 0.0206 62 | MG4J | Model B+ | Position | 0.0402 | 0.0166 63 | 64 | ##### Statistical Analysis 65 | 66 | **TODO:** Need to run statistical analyses. 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The Open-Source Information Retrieval Reproducibility Challenge 2 | =============================================================== 3 | 4 | There's a general consensus in the information retrieval community that open-source search engines help enhance dissemination of results and support reproducibility. There's also general agreement that reproducibility is "a good thing". This issue has received some attention recently, including a dedicated track at ECIR 2015. However, we as a community still have a long way to go. 5 | 6 | The goal of this project is to tackle the issue of reproducible *baselines*. This is a more difficult challenge than it seems. Just to provide two examples: Mühleisen et al. (2014) reported large differences in effectiveness across four systems that all purport to implement BM25. Trotman et al. (2014) pointed out that BM25 and query likelihood with Dirichlet smoothing can actually refer to at least half a dozen different variants; in some cases, differences in effectiveness are statistically significant. Given this state of affairs, how can we confidently report comparisons to "baselines" in our papers when even the baselines themselves are ill-defined? Indeed, Armstrong et al. 
(2009) point to the issue of weak baselines as the reason why ad hoc retrieval techniques haven't really been improving. 7 | 8 | This project started as part of the [SIGIR 2015 Workshop on Reproducibility, Inexplicability, and Generalizability of Results (RIGOR)](https://sites.google.com/site/sigirrigor/), where we will report our initial findings. 9 | 10 | Goals 11 | ----- 12 | 13 | The purpose of this exercise is to invite the *developers* of open-source search engines to provide reproducible baselines of their systems in a common environment on Amazon's EC2 so that the community can have a better understanding of the effectiveness and efficiency differences of various baseline implementations. All results will be archived for future reference by the community. This archive is specifically designed to address the following scenarios: 14 | 15 | 1. I want to evaluate my new technique X. As a baseline, I'll use open-source search engine Y. Or alternatively, I'm building on open-source search engine Y, so I need a baseline anyway. 16 | 17 | 1. How do I know what's a "reasonable" result for system Y on test collection Z? What settings should I use? (Which stopwords list? What retrieval model? What parameter settings? Etc.) How do I know if I've configured system Y correctly? 18 | 19 | 1. Correspondingly, as a reviewer of a paper that describes technique X, how do I know if the baseline is any good? Maybe the authors misconfigured system Y (inadvertently), thereby making their technique "look good" (i.e., it's a weak baseline). 20 | 21 | As a result of this exercise, researchers will be able to go to this resource, and for a number of open-source search engines, they'll learn how to reproduce (through extensive documentation) what the developers of those systems themselves consider to be a reasonable baseline. 22 | 23 | Similarly, reviewers of papers will be able to consult this resource to determine if the baseline the authors used is reasonable or somehow "faulty". 
24 | 25 | Another anticipated result of this exercise is that we'll gain a better understanding of why all these supposed "baselines" are different. We can imagine a system-by-feature matrix, where the features range from stemming algorithm to HTML cleaning technique. After this exercise, we'll have a partially-filled matrix, from which we'll be able to hopefully learn some generalizations, for example (completely hypothetical): HTML cleaning really makes a big difference, up to 10% in terms of NDCG; which stemming algorithm you use (Krovetz vs. Porter, etc.) doesn't really matter; etc. 26 | 27 | References 28 | ---------- 29 | 30 | T. Armstrong, A. Moffat, W. Webber, J. Zobel. Improvements That Don't Add Up: Ad-Hoc Retrieval Results Since 1998. CIKM 2009, pages 601-610. 31 | 32 | H. Mühleisen, T. Samar, J. Lin, and A. de Vries. Old Dogs Are Great at New Tricks: Column Stores for IR Prototyping. SIGIR 2014, pages 863-866. 33 | 34 | A. Trotman, A. Puurula, and B. Burgess, Improvements to BM25 and Language Models Examined. ADCS 2014. 35 | -------------------------------------------------------------------------------- /ec2-setup.md: -------------------------------------------------------------------------------- 1 | EC2 Setup 2 | ========= 3 | 4 | For the Gov2 experiments, we are currently running the `r3.4xlarge` instance, with 16 vCPUs and 122 GiB memory, Ubuntu Server 14.04 LTS (HVM). 
5 | 6 | After logging in, the instance is first prepped by installing common missing packages: 7 | 8 | ``` 9 | sudo apt-add-repository -y ppa:webupd8team/java 10 | sudo apt-get -y update 11 | sudo apt-get -y install oracle-java8-installer 12 | sudo apt-get -y install emacs24 13 | sudo apt-get -y install make gcc g++ 14 | sudo apt-get -y install git mercurial 15 | sudo apt-get -y install zlibc zlib1g zlib1g-dev 16 | sudo apt-get -y install maven 17 | ``` 18 | 19 | After that, the collection is mounted: 20 | 21 | ``` 22 | sudo mkdir /media/Gov2 23 | sudo mount /dev/xvdf /media/Gov2 24 | ``` 25 | 26 | The collection is held on a [standard](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html) EBS volume (i.e., magnetic). According to Amazon, this volume type will deliver 40-90 MiB/s maximum throughput. This should be sufficient for IR engines, but if any system is bumping up against this limit, we'll certainly revisit. 27 | 28 | So, you'll see: 29 | 30 | ``` 31 | $ ls /media/Gov2/data/ 32 | GX000 GX011 GX022 GX033 GX044 GX055 GX066 GX077 GX088 GX099 GX110 GX121 GX132 GX143 GX154 GX165 GX176 GX187 GX198 GX209 GX220 GX231 GX242 GX253 GX264 33 | GX001 GX012 GX023 GX034 GX045 GX056 GX067 GX078 GX089 GX100 GX111 GX122 GX133 GX144 GX155 GX166 GX177 GX188 GX199 GX210 GX221 GX232 GX243 GX254 GX265 34 | GX002 GX013 GX024 GX035 GX046 GX057 GX068 GX079 GX090 GX101 GX112 GX123 GX134 GX145 GX156 GX167 GX178 GX189 GX200 GX211 GX222 GX233 GX244 GX255 GX266 35 | GX003 GX014 GX025 GX036 GX047 GX058 GX069 GX080 GX091 GX102 GX113 GX124 GX135 GX146 GX157 GX168 GX179 GX190 GX201 GX212 GX223 GX234 GX245 GX256 GX267 36 | GX004 GX015 GX026 GX037 GX048 GX059 GX070 GX081 GX092 GX103 GX114 GX125 GX136 GX147 GX158 GX169 GX180 GX191 GX202 GX213 GX224 GX235 GX246 GX257 GX268 37 | GX005 GX016 GX027 GX038 GX049 GX060 GX071 GX082 GX093 GX104 GX115 GX126 GX137 GX148 GX159 GX170 GX181 GX192 GX203 GX214 GX225 GX236 GX247 GX258 GX269 38 | GX006 GX017 GX028 GX039 GX050 GX061 GX072 GX083 
GX094 GX105 GX116 GX127 GX138 GX149 GX160 GX171 GX182 GX193 GX204 GX215 GX226 GX237 GX248 GX259 GX270 39 | GX007 GX018 GX029 GX040 GX051 GX062 GX073 GX084 GX095 GX106 GX117 GX128 GX139 GX150 GX161 GX172 GX183 GX194 GX205 GX216 GX227 GX238 GX249 GX260 GX271 40 | GX008 GX019 GX030 GX041 GX052 GX063 GX074 GX085 GX096 GX107 GX118 GX129 GX140 GX151 GX162 GX173 GX184 GX195 GX206 GX217 GX228 GX239 GX250 GX261 GX272 41 | GX009 GX020 GX031 GX042 GX053 GX064 GX075 GX086 GX097 GX108 GX119 GX130 GX141 GX152 GX163 GX174 GX185 GX196 GX207 GX218 GX229 GX240 GX251 GX262 42 | GX010 GX021 GX032 GX043 GX054 GX065 GX076 GX087 GX098 GX109 GX120 GX131 GX142 GX153 GX164 GX175 GX186 GX197 GX208 GX219 GX230 GX241 GX252 GX263 43 | ``` 44 | 45 | Then, the workspace is mounted: 46 | 47 | ``` 48 | sudo mkdir /media/workspace 49 | sudo mount /dev/xvdg /media/workspace 50 | ``` 51 | 52 | The workspace will serve as the location for holding code, indexes, etc. It is a general purpose SSD EBS volume. This is where the IR-Reproducibility repo should reside. 
53 | 54 | -------------------------------------------------------------------------------- /eval/trec_eval.9.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/eval/trec_eval.9.0.tar.gz -------------------------------------------------------------------------------- /resources/CLEF/bg_sl.txt: -------------------------------------------------------------------------------- 1 | а 2 | автентичен 3 | аз 4 | ако 5 | ала 6 | бе 7 | без 8 | беше 9 | би 10 | бивш 11 | бивша 12 | бившо 13 | бил 14 | била 15 | били 16 | било 17 | благодаря 18 | близо 19 | бъдат 20 | бъде 21 | бяха 22 | в 23 | вас 24 | ваш 25 | ваша 26 | вероятно 27 | вече 28 | взема 29 | ви 30 | вие 31 | винаги 32 | внимава 33 | време 34 | все 35 | всеки 36 | всички 37 | всичко 38 | всяка 39 | във 40 | въпреки 41 | върху 42 | г 43 | ги 44 | главен 45 | главна 46 | главно 47 | глас 48 | го 49 | година 50 | години 51 | годишен 52 | д 53 | да 54 | дали 55 | два 56 | двама 57 | двамата 58 | две 59 | двете 60 | ден 61 | днес 62 | дни 63 | до 64 | добра 65 | добре 66 | добро 67 | добър 68 | докато 69 | докога 70 | дори 71 | досега 72 | доста 73 | друг 74 | друга 75 | други 76 | е 77 | евтин 78 | едва 79 | един 80 | една 81 | еднаква 82 | еднакви 83 | еднакъв 84 | едно 85 | екип 86 | ето 87 | живот 88 | за 89 | забавям 90 | зад 91 | заедно 92 | заради 93 | засега 94 | заспал 95 | затова 96 | защо 97 | защото 98 | и 99 | из 100 | или 101 | им 102 | има 103 | имат 104 | иска 105 | й 106 | каза 107 | как 108 | каква 109 | какво 110 | както 111 | какъв 112 | като 113 | кога 114 | когато 115 | което 116 | които 117 | кой 118 | който 119 | колко 120 | която 121 | къде 122 | където 123 | към 124 | лесен 125 | лесно 126 | ли 127 | лош 128 | м 129 | май 130 | малко 131 | ме 132 | между 133 | мек 134 | мен 135 | месец 136 | ми 137 | много 138 | мнозина 139 | мога 140 | могат 141 | може 
142 | мокър 143 | моля 144 | момента 145 | му 146 | н 147 | на 148 | над 149 | назад 150 | най 151 | направи 152 | напред 153 | например 154 | нас 155 | не 156 | него 157 | нещо 158 | нея 159 | ни 160 | ние 161 | никой 162 | нито 163 | нищо 164 | но 165 | нов 166 | нова 167 | нови 168 | новина 169 | някои 170 | някой 171 | няколко 172 | няма 173 | обаче 174 | около 175 | освен 176 | особено 177 | от 178 | отгоре 179 | отново 180 | още 181 | пак 182 | по 183 | повече 184 | повечето 185 | под 186 | поне 187 | поради 188 | после 189 | почти 190 | прави 191 | пред 192 | преди 193 | през 194 | при 195 | пък 196 | първата 197 | първи 198 | първо 199 | пъти 200 | равен 201 | равна 202 | с 203 | са 204 | сам 205 | само 206 | се 207 | сега 208 | си 209 | син 210 | скоро 211 | след 212 | следващ 213 | сме 214 | смях 215 | според 216 | сред 217 | срещу 218 | сте 219 | съм 220 | със 221 | също 222 | т 223 | тази 224 | така 225 | такива 226 | такъв 227 | там 228 | твой 229 | те 230 | тези 231 | ти 232 | т.н. 
233 | то 234 | това 235 | тогава 236 | този 237 | той 238 | толкова 239 | точно 240 | три 241 | трябва 242 | тук 243 | тъй 244 | тя 245 | тях 246 | у 247 | утре 248 | харесва 249 | хиляди 250 | ч 251 | часа 252 | че 253 | често 254 | чрез 255 | ще 256 | щом 257 | юмрук 258 | я 259 | як -------------------------------------------------------------------------------- /resources/CLEF/de_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | ab 3 | aber 4 | aber 5 | ach 6 | acht 7 | achte 8 | achten 9 | achter 10 | achtes 11 | ag 12 | alle 13 | allein 14 | allem 15 | allen 16 | aller 17 | allerdings 18 | alles 19 | allgemeinen 20 | als 21 | als 22 | also 23 | am 24 | an 25 | andere 26 | anderen 27 | andern 28 | anders 29 | au 30 | auch 31 | auch 32 | auf 33 | aus 34 | ausser 35 | außer 36 | ausserdem 37 | außerdem 38 | b 39 | bald 40 | bei 41 | beide 42 | beiden 43 | beim 44 | beispiel 45 | bekannt 46 | bereits 47 | besonders 48 | besser 49 | besten 50 | bin 51 | bis 52 | bisher 53 | bist 54 | c 55 | d 56 | da 57 | dabei 58 | dadurch 59 | dafür 60 | dagegen 61 | daher 62 | dahin 63 | dahinter 64 | damals 65 | damit 66 | danach 67 | daneben 68 | dank 69 | dann 70 | daran 71 | darauf 72 | daraus 73 | darf 74 | darfst 75 | darin 76 | darüber 77 | darum 78 | darunter 79 | das 80 | das 81 | dasein 82 | daselbst 83 | dass 84 | daß 85 | dasselbe 86 | davon 87 | davor 88 | dazu 89 | dazwischen 90 | dein 91 | deine 92 | deinem 93 | deiner 94 | dem 95 | dementsprechend 96 | demgegenüber 97 | demgemäss 98 | demgemäß 99 | demselben 100 | demzufolge 101 | den 102 | denen 103 | denn 104 | denn 105 | denselben 106 | der 107 | deren 108 | derjenige 109 | derjenigen 110 | dermassen 111 | dermaßen 112 | derselbe 113 | derselben 114 | des 115 | deshalb 116 | desselben 117 | dessen 118 | deswegen 119 | d.h 120 | dich 121 | die 122 | diejenige 123 | diejenigen 124 | dies 125 | diese 126 | dieselbe 127 | dieselben 128 | diesem 129 | diesen 130 | 
dieser 131 | dieses 132 | dir 133 | doch 134 | dort 135 | drei 136 | drin 137 | dritte 138 | dritten 139 | dritter 140 | drittes 141 | du 142 | durch 143 | durchaus 144 | dürfen 145 | dürft 146 | durfte 147 | durften 148 | e 149 | eben 150 | ebenso 151 | ehrlich 152 | ei 153 | ei, 154 | ei, 155 | eigen 156 | eigene 157 | eigenen 158 | eigener 159 | eigenes 160 | ein 161 | einander 162 | eine 163 | einem 164 | einen 165 | einer 166 | eines 167 | einige 168 | einigen 169 | einiger 170 | einiges 171 | einmal 172 | einmal 173 | eins 174 | elf 175 | en 176 | ende 177 | endlich 178 | entweder 179 | entweder 180 | er 181 | Ernst 182 | erst 183 | erste 184 | ersten 185 | erster 186 | erstes 187 | es 188 | etwa 189 | etwas 190 | euch 191 | f 192 | früher 193 | fünf 194 | fünfte 195 | fünften 196 | fünfter 197 | fünftes 198 | für 199 | g 200 | gab 201 | ganz 202 | ganze 203 | ganzen 204 | ganzer 205 | ganzes 206 | gar 207 | gedurft 208 | gegen 209 | gegenüber 210 | gehabt 211 | gehen 212 | geht 213 | gekannt 214 | gekonnt 215 | gemacht 216 | gemocht 217 | gemusst 218 | genug 219 | gerade 220 | gern 221 | gesagt 222 | gesagt 223 | geschweige 224 | gewesen 225 | gewollt 226 | geworden 227 | gibt 228 | ging 229 | gleich 230 | gott 231 | gross 232 | groß 233 | grosse 234 | große 235 | grossen 236 | großen 237 | grosser 238 | großer 239 | grosses 240 | großes 241 | gut 242 | gute 243 | guter 244 | gutes 245 | h 246 | habe 247 | haben 248 | habt 249 | hast 250 | hat 251 | hatte 252 | hätte 253 | hatten 254 | hätten 255 | heisst 256 | her 257 | heute 258 | hier 259 | hin 260 | hinter 261 | hoch 262 | i 263 | ich 264 | ihm 265 | ihn 266 | ihnen 267 | ihr 268 | ihre 269 | ihrem 270 | ihren 271 | ihrer 272 | ihres 273 | im 274 | im 275 | immer 276 | in 277 | in 278 | indem 279 | infolgedessen 280 | ins 281 | irgend 282 | ist 283 | j 284 | ja 285 | ja 286 | jahr 287 | jahre 288 | jahren 289 | je 290 | jede 291 | jedem 292 | jeden 293 | jeder 294 | jedermann 295 | jedermanns 296 | 
jedoch 297 | jemand 298 | jemandem 299 | jemanden 300 | jene 301 | jenem 302 | jenen 303 | jener 304 | jenes 305 | jetzt 306 | k 307 | kam 308 | kann 309 | kannst 310 | kaum 311 | kein 312 | keine 313 | keinem 314 | keinen 315 | keiner 316 | kleine 317 | kleinen 318 | kleiner 319 | kleines 320 | kommen 321 | kommt 322 | können 323 | könnt 324 | konnte 325 | könnte 326 | konnten 327 | kurz 328 | l 329 | lang 330 | lange 331 | lange 332 | leicht 333 | leide 334 | lieber 335 | los 336 | m 337 | machen 338 | macht 339 | machte 340 | mag 341 | magst 342 | mahn 343 | man 344 | manche 345 | manchem 346 | manchen 347 | mancher 348 | manches 349 | mann 350 | mehr 351 | mein 352 | meine 353 | meinem 354 | meinen 355 | meiner 356 | meines 357 | mensch 358 | menschen 359 | mich 360 | mir 361 | mit 362 | mittel 363 | mochte 364 | möchte 365 | mochten 366 | mögen 367 | möglich 368 | mögt 369 | morgen 370 | muss 371 | muß 372 | müssen 373 | musst 374 | müsst 375 | musste 376 | mussten 377 | n 378 | na 379 | nach 380 | nachdem 381 | nahm 382 | natürlich 383 | neben 384 | nein 385 | neue 386 | neuen 387 | neun 388 | neunte 389 | neunten 390 | neunter 391 | neuntes 392 | nicht 393 | nicht 394 | nichts 395 | nie 396 | niemand 397 | niemandem 398 | niemanden 399 | noch 400 | nun 401 | nun 402 | nur 403 | o 404 | ob 405 | ob 406 | oben 407 | oder 408 | oder 409 | offen 410 | oft 411 | oft 412 | ohne 413 | Ordnung 414 | p 415 | q 416 | r 417 | recht 418 | rechte 419 | rechten 420 | rechter 421 | rechtes 422 | richtig 423 | rund 424 | s 425 | sa 426 | sache 427 | sagt 428 | sagte 429 | sah 430 | satt 431 | schlecht 432 | Schluss 433 | schon 434 | sechs 435 | sechste 436 | sechsten 437 | sechster 438 | sechstes 439 | sehr 440 | sei 441 | sei 442 | seid 443 | seien 444 | sein 445 | seine 446 | seinem 447 | seinen 448 | seiner 449 | seines 450 | seit 451 | seitdem 452 | selbst 453 | selbst 454 | sich 455 | sie 456 | sieben 457 | siebente 458 | siebenten 459 | siebenter 460 | siebentes 461 | 
sind 462 | so 463 | solang 464 | solche 465 | solchem 466 | solchen 467 | solcher 468 | solches 469 | soll 470 | sollen 471 | sollte 472 | sollten 473 | sondern 474 | sonst 475 | sowie 476 | später 477 | statt 478 | t 479 | tag 480 | tage 481 | tagen 482 | tat 483 | teil 484 | tel 485 | tritt 486 | trotzdem 487 | tun 488 | u 489 | über 490 | überhaupt 491 | übrigens 492 | uhr 493 | um 494 | und 495 | und? 496 | uns 497 | unser 498 | unsere 499 | unserer 500 | unter 501 | v 502 | vergangenen 503 | viel 504 | viele 505 | vielem 506 | vielen 507 | vielleicht 508 | vier 509 | vierte 510 | vierten 511 | vierter 512 | viertes 513 | vom 514 | von 515 | vor 516 | w 517 | wahr? 518 | während 519 | währenddem 520 | währenddessen 521 | wann 522 | war 523 | wäre 524 | waren 525 | wart 526 | warum 527 | was 528 | wegen 529 | weil 530 | weit 531 | weiter 532 | weitere 533 | weiteren 534 | weiteres 535 | welche 536 | welchem 537 | welchen 538 | welcher 539 | welches 540 | wem 541 | wen 542 | wenig 543 | wenig 544 | wenige 545 | weniger 546 | weniges 547 | wenigstens 548 | wenn 549 | wenn 550 | wer 551 | werde 552 | werden 553 | werdet 554 | wessen 555 | wie 556 | wie 557 | wieder 558 | will 559 | willst 560 | wir 561 | wird 562 | wirklich 563 | wirst 564 | wo 565 | wohl 566 | wollen 567 | wollt 568 | wollte 569 | wollten 570 | worden 571 | wurde 572 | würde 573 | wurden 574 | würden 575 | x 576 | y 577 | z 578 | z.b 579 | zehn 580 | zehnte 581 | zehnten 582 | zehnter 583 | zehntes 584 | zeit 585 | zu 586 | zuerst 587 | zugleich 588 | zum 589 | zum 590 | zunächst 591 | zur 592 | zurück 593 | zusammen 594 | zwanzig 595 | zwar 596 | zwar 597 | zwei 598 | zweite 599 | zweiten 600 | zweiter 601 | zweites 602 | zwischen 603 | zwölf 604 | -------------------------------------------------------------------------------- /resources/CLEF/es_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | acuerdo 3 | adelante 4 | ademas 5 | además 6 | adrede 
7 | ahi 8 | ahí 9 | ahora 10 | al 11 | alli 12 | allí 13 | alrededor 14 | antano 15 | antaño 16 | ante 17 | antes 18 | apenas 19 | aproximadamente 20 | aquel 21 | aquél 22 | aquella 23 | aquélla 24 | aquellas 25 | aquéllas 26 | aquello 27 | aquellos 28 | aquéllos 29 | aqui 30 | aquí 31 | arribaabajo 32 | asi 33 | así 34 | aun 35 | aún 36 | aunque 37 | b 38 | bajo 39 | bastante 40 | bien 41 | breve 42 | c 43 | casi 44 | cerca 45 | claro 46 | como 47 | cómo 48 | con 49 | conmigo 50 | contigo 51 | contra 52 | cual 53 | cuál 54 | cuales 55 | cuáles 56 | cuando 57 | cuándo 58 | cuanta 59 | cuánta 60 | cuantas 61 | cuántas 62 | cuanto 63 | cuánto 64 | cuantos 65 | cuántos 66 | d 67 | de 68 | debajo 69 | del 70 | delante 71 | demasiado 72 | dentro 73 | deprisa 74 | desde 75 | despacio 76 | despues 77 | después 78 | detras 79 | detrás 80 | dia 81 | día 82 | dias 83 | días 84 | donde 85 | dónde 86 | dos 87 | durante 88 | e 89 | el 90 | él 91 | ella 92 | ellas 93 | ellos 94 | en 95 | encima 96 | enfrente 97 | enseguida 98 | entre 99 | es 100 | esa 101 | ésa 102 | esas 103 | ésas 104 | ese 105 | ése 106 | eso 107 | esos 108 | ésos 109 | esta 110 | está 111 | ésta 112 | estado 113 | estados 114 | estan 115 | están 116 | estar 117 | estas 118 | éstas 119 | este 120 | éste 121 | esto 122 | estos 123 | éstos 124 | ex 125 | excepto 126 | f 127 | final 128 | fue 129 | fuera 130 | fueron 131 | g 132 | general 133 | gran 134 | h 135 | ha 136 | habia 137 | había 138 | habla 139 | hablan 140 | hace 141 | hacia 142 | han 143 | hasta 144 | hay 145 | horas 146 | hoy 147 | i 148 | incluso 149 | informo 150 | informó 151 | j 152 | junto 153 | k 154 | l 155 | la 156 | lado 157 | las 158 | le 159 | lejos 160 | lo 161 | los 162 | luego 163 | m 164 | mal 165 | mas 166 | más 167 | mayor 168 | me 169 | medio 170 | mejor 171 | menos 172 | menudo 173 | mi 174 | mí 175 | mia 176 | mía 177 | mias 178 | mías 179 | mientras 180 | mio 181 | mío 182 | mios 183 | míos 184 | mis 185 | mismo 186 | mucho 187 
| muy 188 | n 189 | nada 190 | nadie 191 | ninguna 192 | no 193 | nos 194 | nosotras 195 | nosotros 196 | nuestra 197 | nuestras 198 | nuestro 199 | nuestros 200 | nueva 201 | nuevo 202 | nunca 203 | o 204 | os 205 | otra 206 | otros 207 | p 208 | pais 209 | paìs 210 | para 211 | parte 212 | pasado 213 | peor 214 | pero 215 | poco 216 | por 217 | porque 218 | pronto 219 | proximo 220 | próximo 221 | puede 222 | q 223 | qeu 224 | que 225 | qué 226 | quien 227 | quién 228 | quienes 229 | quiénes 230 | quiza 231 | quizá 232 | quizas 233 | quizás 234 | r 235 | raras 236 | repente 237 | s 238 | salvo 239 | se 240 | sé 241 | segun 242 | según 243 | ser 244 | sera 245 | será 246 | si 247 | sí 248 | sido 249 | siempre 250 | sin 251 | sobre 252 | solamente 253 | solo 254 | sólo 255 | son 256 | soyos 257 | su 258 | supuesto 259 | sus 260 | suya 261 | suyas 262 | suyo 263 | t 264 | tal 265 | tambien 266 | también 267 | tampoco 268 | tarde 269 | te 270 | temprano 271 | ti 272 | tiene 273 | todavia 274 | todavía 275 | todo 276 | todos 277 | tras 278 | tu 279 | tú 280 | tus 281 | tuya 282 | tuyas 283 | tuyo 284 | tuyos 285 | u 286 | un 287 | una 288 | unas 289 | uno 290 | unos 291 | usted 292 | ustedes 293 | v 294 | veces 295 | vez 296 | vosotras 297 | vosotros 298 | vuestra 299 | vuestras 300 | vuestro 301 | vuestros 302 | w 303 | x 304 | y 305 | ya 306 | yo 307 | z 308 | -------------------------------------------------------------------------------- /resources/CLEF/fa_sl.txt: -------------------------------------------------------------------------------- 1 | و 2 | در 3 | به 4 | از 5 | كه 6 | مي 7 | اين 8 | است 9 | را 10 | با 11 | هاي 12 | براي 13 | آن 14 | يك 15 | شود 16 | شده 17 | خود 18 | ها 19 | كرد 20 | شد 21 | اي 22 | تا 23 | كند 24 | بر 25 | بود 26 | گفت 27 | نيز 28 | وي 29 | هم 30 | كنند 31 | دارد 32 | ما 33 | كرده 34 | يا 35 | اما 36 | بايد 37 | دو 38 | اند 39 | هر 40 | خواهد 41 | او 42 | مورد 43 | آنها 44 | باشد 45 | ديگر 46 | مردم 47 | نمي 48 | بين 49 | پيش 50 | پس 
51 | اگر 52 | همه 53 | صورت 54 | يكي 55 | هستند 56 | بي 57 | من 58 | دهد 59 | هزار 60 | نيست 61 | استفاده 62 | داد 63 | داشته 64 | راه 65 | داشت 66 | چه 67 | همچنين 68 | كردند 69 | داده 70 | بوده 71 | دارند 72 | همين 73 | ميليون 74 | سوي 75 | شوند 76 | بيشتر 77 | بسيار 78 | روي 79 | گرفته 80 | هايي 81 | تواند 82 | اول 83 | نام 84 | هيچ 85 | چند 86 | جديد 87 | بيش 88 | شدن 89 | كردن 90 | كنيم 91 | نشان 92 | حتي 93 | اينكه 94 | ولی 95 | توسط 96 | چنين 97 | برخي 98 | نه 99 | ديروز 100 | دوم 101 | درباره 102 | بعد 103 | مختلف 104 | گيرد 105 | شما 106 | گفته 107 | آنان 108 | بار 109 | طور 110 | گرفت 111 | دهند 112 | گذاري 113 | بسياري 114 | طي 115 | بودند 116 | ميليارد 117 | بدون 118 | تمام 119 | كل 120 | تر 121 | براساس 122 | شدند 123 | ترين 124 | امروز 125 | باشند 126 | ندارد 127 | چون 128 | قابل 129 | گويد 130 | ديگري 131 | همان 132 | خواهند 133 | قبل 134 | آمده 135 | اكنون 136 | تحت 137 | طريق 138 | گيري 139 | جاي 140 | هنوز 141 | چرا 142 | البته 143 | كنيد 144 | سازي 145 | سوم 146 | كنم 147 | بلكه 148 | زير 149 | توانند 150 | ضمن 151 | فقط 152 | بودن 153 | حق 154 | آيد 155 | وقتي 156 | اش 157 | يابد 158 | نخستين 159 | مقابل 160 | خدمات 161 | امسال 162 | تاكنون 163 | مانند 164 | تازه 165 | آورد 166 | فكر 167 | آنچه 168 | نخست 169 | نشده 170 | شايد 171 | چهار 172 | جريان 173 | پنج 174 | ساخته 175 | زيرا 176 | نزديك 177 | برداري 178 | كسي 179 | ريزي 180 | رفت 181 | گردد 182 | مثل 183 | آمد 184 | ام 185 | بهترين 186 | دانست 187 | كمتر 188 | دادن 189 | تمامي 190 | جلوگيري 191 | بيشتري 192 | ايم 193 | ناشي 194 | چيزي 195 | آنكه 196 | بالا 197 | بنابراين 198 | ايشان 199 | بعضي 200 | دادند 201 | داشتند 202 | برخوردار 203 | نخواهد 204 | هنگام 205 | نبايد 206 | غير 207 | نبود 208 | ديده 209 | وگو 210 | داريم 211 | چگونه 212 | بندي 213 | خواست 214 | فوق 215 | ده 216 | نوعي 217 | هستيم 218 | ديگران 219 | همچنان 220 | سراسر 221 | ندارند 222 | گروهي 223 | سعي 224 | روزهاي 225 | آنجا 226 | يكديگر 227 | كردم 228 | بيست 229 | بروز 230 | سپس 231 | رفته 232 | آورده 233 | نمايد 234 | 
باشيم 235 | گويند 236 | زياد 237 | خويش 238 | همواره 239 | گذاشته 240 | شش 241 | نداشته 242 | شناسي 243 | خواهيم 244 | آباد 245 | داشتن 246 | نظير 247 | همچون 248 | باره 249 | نكرده 250 | شان 251 | سابق 252 | هفت 253 | دانند 254 | جايي 255 | بی 256 | جز 257 | زیرِ 258 | رویِ 259 | سریِ 260 | تویِ 261 | جلویِ 262 | پیشِ 263 | عقبِ 264 | بالایِ 265 | خارجِ 266 | وسطِ 267 | بیرونِ 268 | سویِ 269 | کنارِ 270 | پاعینِ 271 | نزدِ 272 | نزدیکِ 273 | دنبالِ 274 | حدودِ 275 | برابرِ 276 | طبقِ 277 | مانندِ 278 | ضدِّ 279 | هنگامِ 280 | برایِ 281 | مثلِ 282 | بارة 283 | اثرِ 284 | تولِ 285 | علّتِ 286 | سمتِ 287 | عنوانِ 288 | قصدِ 289 | روب 290 | جدا 291 | کی 292 | که 293 | چیست 294 | هست 295 | کجا 296 | کجاست 297 | کَی 298 | چطور 299 | کدام 300 | آیا 301 | مگر 302 | چندین 303 | یک 304 | چیزی 305 | دیگر 306 | کسی 307 | بعری 308 | هیچ 309 | چیز 310 | جا 311 | کس 312 | هرگز 313 | یا 314 | تنها 315 | بلکه 316 | خیاه 317 | بله 318 | بلی 319 | آره 320 | آری 321 | مرسی 322 | البتّه 323 | لطفاً 324 | ّه 325 | انکه 326 | وقتیکه 327 | همین 328 | پیش 329 | مدّتی 330 | هنگامی 331 | مان 332 | تان 333 | -------------------------------------------------------------------------------- /resources/CLEF/fr_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | à 3 | â 4 | abord 5 | afin 6 | ah 7 | ai 8 | aie 9 | ainsi 10 | allaient 11 | allo 12 | allô 13 | allons 14 | après 15 | assez 16 | attendu 17 | au 18 | aucun 19 | aucune 20 | aujourd 21 | aujourd'hui 22 | auquel 23 | aura 24 | auront 25 | aussi 26 | autre 27 | autres 28 | aux 29 | auxquelles 30 | auxquels 31 | avaient 32 | avais 33 | avait 34 | avant 35 | avec 36 | avoir 37 | ayant 38 | b 39 | bah 40 | beaucoup 41 | bien 42 | bigre 43 | boum 44 | bravo 45 | brrr 46 | c 47 | ça 48 | car 49 | ce 50 | ceci 51 | cela 52 | celle 53 | celle-ci 54 | celle-là 55 | celles 56 | celles-ci 57 | celles-là 58 | celui 59 | celui-ci 60 | celui-là 61 | cent 62 | cependant 63 | certain 64 | certaine 65 | 
certaines 66 | certains 67 | certes 68 | ces 69 | cet 70 | cette 71 | ceux 72 | ceux-ci 73 | ceux-là 74 | chacun 75 | chaque 76 | cher 77 | chère 78 | chères 79 | chers 80 | chez 81 | chiche 82 | chut 83 | ci 84 | cinq 85 | cinquantaine 86 | cinquante 87 | cinquantième 88 | cinquième 89 | clac 90 | clic 91 | combien 92 | comme 93 | comment 94 | compris 95 | concernant 96 | contre 97 | couic 98 | crac 99 | d 100 | da 101 | dans 102 | de 103 | debout 104 | dedans 105 | dehors 106 | delà 107 | depuis 108 | derrière 109 | des 110 | dès 111 | désormais 112 | desquelles 113 | desquels 114 | dessous 115 | dessus 116 | deux 117 | deuxième 118 | deuxièmement 119 | devant 120 | devers 121 | devra 122 | différent 123 | différente 124 | différentes 125 | différents 126 | dire 127 | divers 128 | diverse 129 | diverses 130 | dix 131 | dix-huit 132 | dixième 133 | dix-neuf 134 | dix-sept 135 | doit 136 | doivent 137 | donc 138 | dont 139 | douze 140 | douzième 141 | dring 142 | du 143 | duquel 144 | durant 145 | e 146 | effet 147 | eh 148 | elle 149 | elle-même 150 | elles 151 | elles-mêmes 152 | en 153 | encore 154 | entre 155 | envers 156 | environ 157 | es 158 | ès 159 | est 160 | et 161 | etant 162 | étaient 163 | étais 164 | était 165 | étant 166 | etc 167 | été 168 | etre 169 | être 170 | eu 171 | euh 172 | eux 173 | eux-mêmes 174 | excepté 175 | f 176 | façon 177 | fais 178 | faisaient 179 | faisant 180 | fait 181 | feront 182 | fi 183 | flac 184 | floc 185 | font 186 | g 187 | gens 188 | h 189 | ha 190 | hé 191 | hein 192 | hélas 193 | hem 194 | hep 195 | hi 196 | ho 197 | holà 198 | hop 199 | hormis 200 | hors 201 | hou 202 | houp 203 | hue 204 | hui 205 | huit 206 | huitième 207 | hum 208 | hurrah 209 | i 210 | il 211 | ils 212 | importe 213 | j 214 | je 215 | jusqu 216 | jusque 217 | k 218 | l 219 | la 220 | là 221 | laquelle 222 | las 223 | le 224 | lequel 225 | les 226 | lès 227 | lesquelles 228 | lesquels 229 | leur 230 | leurs 231 | longtemps 232 | lorsque 233 | 
lui 234 | lui-même 235 | m 236 | ma 237 | maint 238 | mais 239 | malgré 240 | me 241 | même 242 | mêmes 243 | merci 244 | mes 245 | mien 246 | mienne 247 | miennes 248 | miens 249 | mille 250 | mince 251 | moi 252 | moi-même 253 | moins 254 | mon 255 | moyennant 256 | n 257 | na 258 | ne 259 | néanmoins 260 | neuf 261 | neuvième 262 | ni 263 | nombreuses 264 | nombreux 265 | non 266 | nos 267 | notre 268 | nôtre 269 | nôtres 270 | nous 271 | nous-mêmes 272 | nul 273 | o 274 | o| 275 | ô 276 | oh 277 | ohé 278 | olé 279 | ollé 280 | on 281 | ont 282 | onze 283 | onzième 284 | ore 285 | ou 286 | où 287 | ouf 288 | ouias 289 | oust 290 | ouste 291 | outre 292 | p 293 | paf 294 | pan 295 | par 296 | parmi 297 | partant 298 | particulier 299 | particulière 300 | particulièrement 301 | pas 302 | passé 303 | pendant 304 | personne 305 | peu 306 | peut 307 | peuvent 308 | peux 309 | pff 310 | pfft 311 | pfut 312 | pif 313 | plein 314 | plouf 315 | plus 316 | plusieurs 317 | plutôt 318 | pouah 319 | pour 320 | pourquoi 321 | premier 322 | première 323 | premièrement 324 | près 325 | proche 326 | psitt 327 | puisque 328 | q 329 | qu 330 | quand 331 | quant 332 | quanta 333 | quant-à-soi 334 | quarante 335 | quatorze 336 | quatre 337 | quatre-vingt 338 | quatrième 339 | quatrièmement 340 | que 341 | quel 342 | quelconque 343 | quelle 344 | quelles 345 | quelque 346 | quelques 347 | quelqu'un 348 | quels 349 | qui 350 | quiconque 351 | quinze 352 | quoi 353 | quoique 354 | r 355 | revoici 356 | revoilà 357 | rien 358 | s 359 | sa 360 | sacrebleu 361 | sans 362 | sapristi 363 | sauf 364 | se 365 | seize 366 | selon 367 | sept 368 | septième 369 | sera 370 | seront 371 | ses 372 | si 373 | sien 374 | sienne 375 | siennes 376 | siens 377 | sinon 378 | six 379 | sixième 380 | soi 381 | soi-même 382 | soit 383 | soixante 384 | son 385 | sont 386 | sous 387 | stop 388 | suis 389 | suivant 390 | sur 391 | surtout 392 | t 393 | ta 394 | tac 395 | tant 396 | te 397 | té 398 | tel 399 | 
telle 400 | tellement 401 | telles 402 | tels 403 | tenant 404 | tes 405 | tic 406 | tien 407 | tienne 408 | tiennes 409 | tiens 410 | toc 411 | toi 412 | toi-même 413 | ton 414 | touchant 415 | toujours 416 | tous 417 | tout 418 | toute 419 | toutes 420 | treize 421 | trente 422 | très 423 | trois 424 | troisième 425 | troisièmement 426 | trop 427 | tsoin 428 | tsouin 429 | tu 430 | u 431 | un 432 | une 433 | unes 434 | uns 435 | v 436 | va 437 | vais 438 | vas 439 | vé 440 | vers 441 | via 442 | vif 443 | vifs 444 | vingt 445 | vivat 446 | vive 447 | vives 448 | vlan 449 | voici 450 | voilà 451 | vont 452 | vos 453 | votre 454 | vôtre 455 | vôtres 456 | vous 457 | vous-mêmes 458 | vu 459 | w 460 | x 461 | y 462 | z 463 | zut 464 | -------------------------------------------------------------------------------- /resources/CLEF/it_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | abbastanza 3 | accidenti 4 | ad 5 | adesso 6 | affinche 7 | agli 8 | ahime 9 | ahimè 10 | ai 11 | al 12 | alcuna 13 | alcuni 14 | alcuno 15 | all 16 | alla 17 | alle 18 | allo 19 | altri 20 | altrimenti 21 | altro 22 | altrui 23 | anche 24 | ancora 25 | anni 26 | anno 27 | ansa 28 | assai 29 | attesa 30 | avanti 31 | avendo 32 | avente 33 | aver 34 | avere 35 | avete 36 | aveva 37 | avuta 38 | avute 39 | avuti 40 | avuto 41 | basta 42 | bene 43 | benissimo 44 | berlusconi 45 | brava 46 | bravo 47 | c 48 | casa 49 | caso 50 | cento 51 | certa 52 | certe 53 | certi 54 | certo 55 | che 56 | chi 57 | chicchessia 58 | chiunque 59 | ci 60 | ciascuna 61 | ciascuno 62 | cima 63 | cio 64 | ciò 65 | cioe 66 | cioè 67 | circa 68 | citta 69 | città 70 | codesta 71 | codesti 72 | codesto 73 | cogli 74 | coi 75 | col 76 | colei 77 | coll 78 | coloro 79 | colui 80 | come 81 | con 82 | concernente 83 | consiglio 84 | contro 85 | cortesia 86 | cos 87 | cosa 88 | cosi 89 | così 90 | cui 91 | d 92 | da 93 | dagli 94 | dai 95 | dal 96 | dall 97 | dalla 98 | 
dalle 99 | dallo 100 | davanti 101 | degli 102 | dei 103 | del 104 | dell 105 | della 106 | delle 107 | dello 108 | dentro 109 | detto 110 | deve 111 | di 112 | dice 113 | dietro 114 | dire 115 | dirimpetto 116 | dopo 117 | dove 118 | dovra 119 | dovrà 120 | due 121 | dunque 122 | durante 123 | e 124 | è 125 | ecco 126 | ed 127 | egli 128 | ella 129 | eppure 130 | era 131 | erano 132 | esse 133 | essendo 134 | esser 135 | essere 136 | essi 137 | ex 138 | fa 139 | fare 140 | fatto 141 | favore 142 | fin 143 | finalmente 144 | finche 145 | fine 146 | fino 147 | forse 148 | fra 149 | fuori 150 | gia 151 | già 152 | giacche 153 | giorni 154 | giorno 155 | gli 156 | gliela 157 | gliele 158 | glieli 159 | glielo 160 | gliene 161 | governo 162 | grande 163 | grazie 164 | gruppo 165 | ha 166 | hai 167 | hanno 168 | ho 169 | i 170 | ieri 171 | il 172 | improvviso 173 | in 174 | infatti 175 | insieme 176 | intanto 177 | intorno 178 | invece 179 | io 180 | l 181 | la 182 | là 183 | lavoro 184 | le 185 | lei 186 | li 187 | lo 188 | lontano 189 | loro 190 | lui 191 | lungo 192 | ma 193 | macche 194 | magari 195 | mai 196 | male 197 | malgrado 198 | malissimo 199 | me 200 | medesimo 201 | mediante 202 | meglio 203 | meno 204 | mentre 205 | mesi 206 | mezzo 207 | mi 208 | mia 209 | mie 210 | miei 211 | mila 212 | miliardi 213 | milioni 214 | ministro 215 | mio 216 | moltissimo 217 | molto 218 | mondo 219 | nazionale 220 | ne 221 | negli 222 | nei 223 | nel 224 | nell 225 | nella 226 | nelle 227 | nello 228 | nemmeno 229 | neppure 230 | nessuna 231 | nessuno 232 | niente 233 | no 234 | noi 235 | non 236 | nondimeno 237 | nostra 238 | nostre 239 | nostri 240 | nostro 241 | nulla 242 | nuovo 243 | o 244 | od 245 | oggi 246 | ogni 247 | ognuna 248 | ognuno 249 | oltre 250 | oppure 251 | ora 252 | ore 253 | osi 254 | ossia 255 | paese 256 | parecchi 257 | parecchie 258 | parecchio 259 | parte 260 | partendo 261 | peccato 262 | peggio 263 | per 264 | perche 265 | perchè 266 | percio 
267 | perciò 268 | perfino 269 | pero 270 | però 271 | persone 272 | piedi 273 | pieno 274 | piglia 275 | piu 276 | più 277 | po 278 | pochissimo 279 | poco 280 | poi 281 | poiche 282 | press 283 | prima 284 | primo 285 | proprio 286 | puo 287 | può 288 | pure 289 | purtroppo 290 | qualche 291 | qualcuna 292 | qualcuno 293 | quale 294 | quali 295 | qualunque 296 | quando 297 | quanta 298 | quante 299 | quanti 300 | quanto 301 | quantunque 302 | quasi 303 | quattro 304 | quel 305 | quella 306 | quelli 307 | quello 308 | quest 309 | questa 310 | queste 311 | questi 312 | questo 313 | qui 314 | quindi 315 | riecco 316 | salvo 317 | sara 318 | sarà 319 | sarebbe 320 | scopo 321 | scorso 322 | se 323 | secondo 324 | seguente 325 | sei 326 | sempre 327 | senza 328 | si 329 | sia 330 | siamo 331 | siete 332 | solito 333 | solo 334 | sono 335 | sopra 336 | sotto 337 | sta 338 | staranno 339 | stata 340 | state 341 | stati 342 | stato 343 | stesso 344 | su 345 | sua 346 | successivo 347 | sue 348 | sugli 349 | sui 350 | sul 351 | sull 352 | sulla 353 | sulle 354 | sullo 355 | suo 356 | suoi 357 | tale 358 | talvolta 359 | tanto 360 | te 361 | tempo 362 | ti 363 | torino 364 | tra 365 | tranne 366 | tre 367 | troppo 368 | tu 369 | tua 370 | tue 371 | tuo 372 | tuoi 373 | tutta 374 | tuttavia 375 | tutte 376 | tutti 377 | tutto 378 | uguali 379 | un 380 | una 381 | uno 382 | uomo 383 | va 384 | vale 385 | varia 386 | varie 387 | vario 388 | verso 389 | vi 390 | via 391 | vicino 392 | visto 393 | vita 394 | voi 395 | volta 396 | vostra 397 | vostre 398 | vostri 399 | vostro 400 | -------------------------------------------------------------------------------- /resources/CLEF/nl_sl.txt: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | van 4 | ik 5 | te 6 | dat 7 | die 8 | in 9 | een 10 | hij 11 | het 12 | niet 13 | zijn 14 | is 15 | was 16 | op 17 | aan 18 | met 19 | als 20 | voor 21 | had 22 | er 23 | maar 24 | om 25 | hem 26 | 
dan 27 | zou 28 | of 29 | wat 30 | mijn 31 | men 32 | dit 33 | zo 34 | door 35 | over 36 | ze 37 | zich 38 | bij 39 | ook 40 | tot 41 | je 42 | mij 43 | uit 44 | der 45 | daar 46 | haar 47 | naar 48 | heb 49 | hoe 50 | heeft 51 | hebben 52 | deze 53 | u 54 | want 55 | nog 56 | zal 57 | me 58 | zij 59 | nu 60 | ge 61 | geen 62 | omdat 63 | iets 64 | worden 65 | toch 66 | al 67 | waren 68 | veel 69 | meer 70 | doen 71 | toen 72 | moet 73 | ben 74 | zonder 75 | kan 76 | hun 77 | dus 78 | alles 79 | onder 80 | ja 81 | eens 82 | hier 83 | wie 84 | werd 85 | altijd 86 | doch 87 | wordt 88 | wezen 89 | kunnen 90 | ons 91 | zelf 92 | tegen 93 | na 94 | reeds 95 | wil 96 | kon 97 | niets 98 | uw 99 | iemand 100 | geweest 101 | andere -------------------------------------------------------------------------------- /resources/CLEF/pt_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | à 3 | adeus 4 | agora 5 | aí 6 | ainda 7 | além 8 | algo 9 | algumas 10 | alguns 11 | ali 12 | ano 13 | anos 14 | antes 15 | ao 16 | aos 17 | apenas 18 | apoio 19 | após 20 | aquela 21 | aquelas 22 | aquele 23 | aqueles 24 | aqui 25 | aquilo 26 | área 27 | as 28 | às 29 | assim 30 | até 31 | atrás 32 | através 33 | baixo 34 | bastante 35 | bem 36 | bom 37 | breve 38 | cá 39 | cada 40 | catorze 41 | cedo 42 | cento 43 | certamente 44 | certeza 45 | cima 46 | cinco 47 | coisa 48 | com 49 | como 50 | conselho 51 | contra 52 | custa 53 | da 54 | dá 55 | dão 56 | daquela 57 | daquele 58 | dar 59 | das 60 | de 61 | debaixo 62 | demais 63 | dentro 64 | depois 65 | desde 66 | dessa 67 | desse 68 | desta 69 | deste 70 | deve 71 | deverá 72 | dez 73 | dezanove 74 | dezasseis 75 | dezassete 76 | dezoito 77 | dia 78 | diante 79 | diz 80 | dizem 81 | dizer 82 | do 83 | dois 84 | dos 85 | doze 86 | duas 87 | dúvida 88 | e 89 | é 90 | ela 91 | elas 92 | ele 93 | eles 94 | em 95 | embora 96 | entre 97 | era 98 | és 99 | essa 100 | essas 101 | esse 102 | esses 103 | 
esta 104 | está 105 | estar 106 | estas 107 | estás 108 | estava 109 | este 110 | estes 111 | esteve 112 | estive 113 | estivemos 114 | estiveram 115 | estiveste 116 | estivestes 117 | estou 118 | eu 119 | exemplo 120 | faço 121 | falta 122 | favor 123 | faz 124 | fazeis 125 | fazem 126 | fazemos 127 | fazer 128 | fazes 129 | fez 130 | fim 131 | final 132 | foi 133 | fomos 134 | for 135 | foram 136 | forma 137 | foste 138 | fostes 139 | fui 140 | geral 141 | grande 142 | grandes 143 | grupo 144 | há 145 | hoje 146 | horas 147 | isso 148 | isto 149 | já 150 | lá 151 | lado 152 | local 153 | logo 154 | longe 155 | lugar 156 | maior 157 | maioria 158 | mais 159 | mal 160 | mas 161 | máximo 162 | me 163 | meio 164 | menor 165 | menos 166 | mês 167 | meses 168 | meu 169 | meus 170 | mil 171 | minha 172 | minhas 173 | momento 174 | muito 175 | muitos 176 | na 177 | nada 178 | não 179 | naquela 180 | naquele 181 | nas 182 | nem 183 | nenhuma 184 | nessa 185 | nesse 186 | nesta 187 | neste 188 | nível 189 | no 190 | noite 191 | nome 192 | nos 193 | nós 194 | nossa 195 | nossas 196 | nosso 197 | nossos 198 | nova 199 | nove 200 | novo 201 | novos 202 | num 203 | numa 204 | número 205 | nunca 206 | o 207 | obra 208 | obrigada 209 | obrigado 210 | oitava 211 | oitavo 212 | oito 213 | onde 214 | ontem 215 | onze 216 | os 217 | ou 218 | outra 219 | outras 220 | outro 221 | outros 222 | para 223 | parece 224 | parte 225 | partir 226 | pela 227 | pelas 228 | pelo 229 | pelos 230 | perto 231 | pode 232 | pôde 233 | podem 234 | poder 235 | põe 236 | põem 237 | ponto 238 | pontos 239 | por 240 | porque 241 | porquê 242 | posição 243 | possível 244 | possivelmente 245 | posso 246 | pouca 247 | pouco 248 | primeira 249 | primeiro 250 | próprio 251 | próximo 252 | puderam 253 | qual 254 | quando 255 | quanto 256 | quarta 257 | quarto 258 | quatro 259 | que 260 | quê 261 | quem 262 | quer 263 | quero 264 | questão 265 | quinta 266 | quinto 267 | quinze 268 | relação 269 | sabe 270 | são 
271 | se 272 | segunda 273 | segundo 274 | sei 275 | seis 276 | sem 277 | sempre 278 | ser 279 | seria 280 | sete 281 | sétima 282 | sétimo 283 | seu 284 | seus 285 | sexta 286 | sexto 287 | sim 288 | sistema 289 | sob 290 | sobre 291 | sois 292 | somos 293 | sou 294 | sua 295 | suas 296 | tal 297 | talvez 298 | também 299 | tanto 300 | tão 301 | tarde 302 | te 303 | tem 304 | têm 305 | temos 306 | tendes 307 | tenho 308 | tens 309 | ter 310 | terceira 311 | terceiro 312 | teu 313 | teus 314 | teve 315 | tive 316 | tivemos 317 | tiveram 318 | tiveste 319 | tivestes 320 | toda 321 | todas 322 | todo 323 | todos 324 | trabalho 325 | três 326 | treze 327 | tu 328 | tua 329 | tuas 330 | tudo 331 | um 332 | uma 333 | umas 334 | uns 335 | vai 336 | vais 337 | vão 338 | vários 339 | vem 340 | vêm 341 | vens 342 | ver 343 | vez 344 | vezes 345 | viagem 346 | vindo 347 | vinte 348 | você 349 | vocês 350 | vos 351 | vós 352 | vossa 353 | vossas 354 | vosso 355 | vossos 356 | zero 357 | -------------------------------------------------------------------------------- /resources/CLEF/ru_sl.txt: -------------------------------------------------------------------------------- 1 | а 2 | е 3 | и 4 | ж 5 | м 6 | о 7 | на 8 | не 9 | ни 10 | об 11 | но 12 | он 13 | мне 14 | мои 15 | мож 16 | она 17 | они 18 | оно 19 | мной 20 | много 21 | многочисленное 22 | многочисленная 23 | многочисленные 24 | многочисленный 25 | мною 26 | мой 27 | мог 28 | могут 29 | можно 30 | может 31 | можхо 32 | мор 33 | моя 34 | моё 35 | мочь 36 | над 37 | нее 38 | оба 39 | нам 40 | нем 41 | нами 42 | ними 43 | мимо 44 | немного 45 | одной 46 | одного 47 | менее 48 | однажды 49 | однако 50 | меня 51 | нему 52 | меньше 53 | ней 54 | наверху 55 | него 56 | ниже 57 | мало 58 | надо 59 | один 60 | одиннадцать 61 | одиннадцатый 62 | назад 63 | наиболее 64 | недавно 65 | миллионов 66 | недалеко 67 | между 68 | низко 69 | меля 70 | нельзя 71 | нибудь 72 | непрерывно 73 | наконец 74 | никогда 75 | никуда 76 | нас 77 
| наш 78 | нет 79 | нею 80 | неё 81 | них 82 | мира 83 | наша 84 | наше 85 | наши 86 | ничего 87 | начала 88 | нередко 89 | несколько 90 | обычно 91 | опять 92 | около 93 | мы 94 | ну 95 | нх 96 | от 97 | отовсюду 98 | особенно 99 | нужно 100 | очень 101 | отсюда 102 | в 103 | во 104 | вон 105 | вниз 106 | внизу 107 | вокруг 108 | вот 109 | восемнадцать 110 | восемнадцатый 111 | восемь 112 | восьмой 113 | вверх 114 | вам 115 | вами 116 | важное 117 | важная 118 | важные 119 | важный 120 | вдали 121 | везде 122 | ведь 123 | вас 124 | ваш 125 | ваша 126 | ваше 127 | ваши 128 | впрочем 129 | весь 130 | вдруг 131 | вы 132 | все 133 | второй 134 | всем 135 | всеми 136 | времени 137 | время 138 | всему 139 | всего 140 | всегда 141 | всех 142 | всею 143 | всю 144 | вся 145 | всё 146 | всюду 147 | г 148 | год 149 | говорил 150 | говорит 151 | года 152 | году 153 | где 154 | да 155 | ее 156 | за 157 | из 158 | ли 159 | же 160 | им 161 | до 162 | по 163 | ими 164 | под 165 | иногда 166 | довольно 167 | именно 168 | долго 169 | позже 170 | более 171 | должно 172 | пожалуйста 173 | значит 174 | иметь 175 | больше 176 | пока 177 | ему 178 | имя 179 | пор 180 | пора 181 | потом 182 | потому 183 | после 184 | почему 185 | почти 186 | посреди 187 | ей 188 | два 189 | две 190 | двенадцать 191 | двенадцатый 192 | двадцать 193 | двадцатый 194 | двух 195 | его 196 | дел 197 | или 198 | без 199 | день 200 | занят 201 | занята 202 | занято 203 | заняты 204 | действительно 205 | давно 206 | девятнадцать 207 | девятнадцатый 208 | девять 209 | девятый 210 | даже 211 | алло 212 | жизнь 213 | далеко 214 | близко 215 | здесь 216 | дальше 217 | для 218 | лет 219 | зато 220 | даром 221 | первый 222 | перед 223 | затем 224 | зачем 225 | лишь 226 | десять 227 | десятый 228 | ею 229 | её 230 | их 231 | бы 232 | еще 233 | при 234 | был 235 | про 236 | процентов 237 | против 238 | просто 239 | бывает 240 | бывь 241 | если 242 | люди 243 | была 244 | были 245 | было 246 | будем 247 | будет 248 | 
будете 249 | будешь 250 | прекрасно 251 | буду 252 | будь 253 | будто 254 | будут 255 | ещё 256 | пятнадцать 257 | пятнадцатый 258 | друго 259 | другое 260 | другой 261 | другие 262 | другая 263 | других 264 | есть 265 | пять 266 | быть 267 | лучше 268 | пятый 269 | к 270 | ком 271 | конечно 272 | кому 273 | кого 274 | когда 275 | которой 276 | которого 277 | которая 278 | которые 279 | который 280 | которых 281 | кем 282 | каждое 283 | каждая 284 | каждые 285 | каждый 286 | кажется 287 | как 288 | какой 289 | какая 290 | кто 291 | кроме 292 | куда 293 | кругом 294 | с 295 | т 296 | у 297 | я 298 | та 299 | те 300 | уж 301 | со 302 | то 303 | том 304 | снова 305 | тому 306 | совсем 307 | того 308 | тогда 309 | тоже 310 | собой 311 | тобой 312 | собою 313 | тобою 314 | сначала 315 | только 316 | уметь 317 | тот 318 | тою 319 | хорошо 320 | хотеть 321 | хочешь 322 | хоть 323 | хотя 324 | свое 325 | свои 326 | твой 327 | своей 328 | своего 329 | своих 330 | свою 331 | твоя 332 | твоё 333 | раз 334 | уже 335 | сам 336 | там 337 | тем 338 | чем 339 | сама 340 | сами 341 | теми 342 | само 343 | рано 344 | самом 345 | самому 346 | самой 347 | самого 348 | семнадцать 349 | семнадцатый 350 | самим 351 | самими 352 | самих 353 | саму 354 | семь 355 | чему 356 | раньше 357 | сейчас 358 | чего 359 | сегодня 360 | себе 361 | тебе 362 | сеаой 363 | человек 364 | разве 365 | теперь 366 | себя 367 | тебя 368 | седьмой 369 | спасибо 370 | слишком 371 | так 372 | такое 373 | такой 374 | такие 375 | также 376 | такая 377 | сих 378 | тех 379 | чаще 380 | четвертый 381 | через 382 | часто 383 | шестой 384 | шестнадцать 385 | шестнадцатый 386 | шесть 387 | четыре 388 | четырнадцать 389 | четырнадцатый 390 | сколько 391 | сказал 392 | сказала 393 | сказать 394 | ту 395 | ты 396 | три 397 | эта 398 | эти 399 | что 400 | это 401 | чтоб 402 | этом 403 | этому 404 | этой 405 | этого 406 | чтобы 407 | этот 408 | стал 409 | туда 410 | этим 411 | этими 412 | рядом 413 | тринадцать 414 | 
тринадцатый 415 | этих 416 | третий 417 | тут 418 | эту 419 | суть 420 | чуть 421 | тысяч 422 | 423 | -------------------------------------------------------------------------------- /resources/CLEF/sv_sl.txt: -------------------------------------------------------------------------------- 1 | aderton 2 | adertonde 3 | adjö 4 | aldrig 5 | alla 6 | allas 7 | allt 8 | alltid 9 | alltså 10 | än 11 | andra 12 | andras 13 | annan 14 | annat 15 | ännu 16 | artonde 17 | artonn 18 | åtminstone 19 | att 20 | åtta 21 | åttio 22 | åttionde 23 | åttonde 24 | av 25 | även 26 | båda 27 | bådas 28 | bakom 29 | bara 30 | bäst 31 | bättre 32 | behöva 33 | behövas 34 | behövde 35 | behövt 36 | beslut 37 | beslutat 38 | beslutit 39 | bland 40 | blev 41 | bli 42 | blir 43 | blivit 44 | bort 45 | borta 46 | bra 47 | då 48 | dag 49 | dagar 50 | dagarna 51 | dagen 52 | där 53 | därför 54 | de 55 | del 56 | delen 57 | dem 58 | den 59 | deras 60 | dess 61 | det 62 | detta 63 | dig 64 | din 65 | dina 66 | dit 67 | ditt 68 | dock 69 | du 70 | efter 71 | eftersom 72 | elfte 73 | eller 74 | elva 75 | en 76 | enkel 77 | enkelt 78 | enkla 79 | enligt 80 | er 81 | era 82 | ert 83 | ett 84 | ettusen 85 | få 86 | fanns 87 | får 88 | fått 89 | fem 90 | femte 91 | femtio 92 | femtionde 93 | femton 94 | femtonde 95 | fick 96 | fin 97 | finnas 98 | finns 99 | fjärde 100 | fjorton 101 | fjortonde 102 | fler 103 | flera 104 | flesta 105 | följande 106 | för 107 | före 108 | förlåt 109 | förra 110 | första 111 | fram 112 | framför 113 | från 114 | fyra 115 | fyrtio 116 | fyrtionde 117 | gå 118 | gälla 119 | gäller 120 | gällt 121 | går 122 | gärna 123 | gått 124 | genast 125 | genom 126 | gick 127 | gjorde 128 | gjort 129 | god 130 | goda 131 | godare 132 | godast 133 | gör 134 | göra 135 | gott 136 | ha 137 | hade 138 | haft 139 | han 140 | hans 141 | har 142 | här 143 | heller 144 | hellre 145 | helst 146 | helt 147 | henne 148 | hennes 149 | hit 150 | hög 151 | höger 152 | högre 153 | högst 154 | hon 
155 | honom 156 | hundra 157 | hundraen 158 | hundraett 159 | hur 160 | i 161 | ibland 162 | idag 163 | igår 164 | igen 165 | imorgon 166 | in 167 | inför 168 | inga 169 | ingen 170 | ingenting 171 | inget 172 | innan 173 | inne 174 | inom 175 | inte 176 | inuti 177 | ja 178 | jag 179 | jämfört 180 | kan 181 | kanske 182 | knappast 183 | kom 184 | komma 185 | kommer 186 | kommit 187 | kr 188 | kunde 189 | kunna 190 | kunnat 191 | kvar 192 | länge 193 | längre 194 | långsam 195 | långsammare 196 | långsammast 197 | långsamt 198 | längst 199 | långt 200 | lätt 201 | lättare 202 | lättast 203 | legat 204 | ligga 205 | ligger 206 | lika 207 | likställd 208 | likställda 209 | lilla 210 | lite 211 | liten 212 | litet 213 | man 214 | många 215 | måste 216 | med 217 | mellan 218 | men 219 | mer 220 | mera 221 | mest 222 | mig 223 | min 224 | mina 225 | mindre 226 | minst 227 | mitt 228 | mittemot 229 | möjlig 230 | möjligen 231 | möjligt 232 | möjligtvis 233 | mot 234 | mycket 235 | någon 236 | någonting 237 | något 238 | några 239 | när 240 | nästa 241 | ned 242 | nederst 243 | nedersta 244 | nedre 245 | nej 246 | ner 247 | ni 248 | nio 249 | nionde 250 | nittio 251 | nittionde 252 | nitton 253 | nittonde 254 | nödvändig 255 | nödvändiga 256 | nödvändigt 257 | nödvändigtvis 258 | nog 259 | noll 260 | nr 261 | nu 262 | nummer 263 | och 264 | också 265 | ofta 266 | oftast 267 | olika 268 | olikt 269 | om 270 | oss 271 | över 272 | övermorgon 273 | överst 274 | övre 275 | på 276 | rakt 277 | rätt 278 | redan 279 | så 280 | sade 281 | säga 282 | säger 283 | sagt 284 | samma 285 | sämre 286 | sämst 287 | sedan 288 | senare 289 | senast 290 | sent 291 | sex 292 | sextio 293 | sextionde 294 | sexton 295 | sextonde 296 | sig 297 | sin 298 | sina 299 | sist 300 | sista 301 | siste 302 | sitt 303 | sjätte 304 | sju 305 | sjunde 306 | sjuttio 307 | sjuttionde 308 | sjutton 309 | sjuttonde 310 | ska 311 | skall 312 | skulle 313 | slutligen 314 | små 315 | smått 316 | snart 317 | som 
318 | stor 319 | stora 320 | större 321 | störst 322 | stort 323 | tack 324 | tidig 325 | tidigare 326 | tidigast 327 | tidigt 328 | till 329 | tills 330 | tillsammans 331 | tio 332 | tionde 333 | tjugo 334 | tjugoen 335 | tjugoett 336 | tjugonde 337 | tjugotre 338 | tjugotvå 339 | tjungo 340 | tolfte 341 | tolv 342 | tre 343 | tredje 344 | trettio 345 | trettionde 346 | tretton 347 | trettonde 348 | två 349 | tvåhundra 350 | under 351 | upp 352 | ur 353 | ursäkt 354 | ut 355 | utan 356 | utanför 357 | ute 358 | vad 359 | vänster 360 | vänstra 361 | var 362 | vår 363 | vara 364 | våra 365 | varför 366 | varifrån 367 | varit 368 | varken 369 | värre 370 | varsågod 371 | vart 372 | vårt 373 | vem 374 | vems 375 | verkligen 376 | vi 377 | vid 378 | vidare 379 | viktig 380 | viktigare 381 | viktigast 382 | viktigt 383 | vilka 384 | vilken 385 | vilket 386 | vill 387 | -------------------------------------------------------------------------------- /results/CLEF/indri/info.txt: -------------------------------------------------------------------------------- 1 | Results are not available on-line, you need to compute them with trec_eval. 
You can find the resulting runs at: https://github.com/gmdn/IR-reproducibiliy-grium/tree/master/results -------------------------------------------------------------------------------- /results/CLEF/lucene/info.txt: -------------------------------------------------------------------------------- 1 | You can find trec_eval results at: https://github.com/dibuccio/IR-Reproducibility/tree/master/results/CLEF/lucene -------------------------------------------------------------------------------- /results/CLEF/terrier/info.txt: -------------------------------------------------------------------------------- 1 | You can find trec_eval results at: https://github.com/mmaistro/IR-Reproducibility/tree/mmaistro/results/CLEF/terrier -------------------------------------------------------------------------------- /runs/CLEF/indri/info.txt: -------------------------------------------------------------------------------- 1 | You can find the resulting runs at: https://github.com/gmdn/IR-reproducibiliy-grium/tree/master/results -------------------------------------------------------------------------------- /runs/CLEF/lucene/info.txt: -------------------------------------------------------------------------------- 1 | You can find the resulting runs at: https://github.com/dibuccio/IR-Reproducibility/tree/master/runs/CLEF/lucene -------------------------------------------------------------------------------- /runs/CLEF/terrier/info.txt: -------------------------------------------------------------------------------- 1 | You can find the resulting runs at: https://github.com/mmaistro/IR-Reproducibility/tree/mmaistro/runs/CLEF/terrier -------------------------------------------------------------------------------- /systems/ATIRE/README.md: -------------------------------------------------------------------------------- 1 | ATIRE 2 | ===== 3 | 4 | The ATIRE makefile has a set of options that have been already preselected to provide good performance for indexing. 
5 | 6 | The script `dotgov2.sh` will clone the required repositories, build the system, index and search all the query sets. 7 | 8 | Indexing 9 | -------- 10 | 11 | The first two arguments to the indexer simply specify a progress output every `n` documents (`-N`, in this case one million), and to print statistics at the conclusion of the indexing. 12 | 13 | The next argument specifies the format of the documents. In this case it has been set to recursively find and parse documents according to the TREC format; treating everything between `` and ``, inclusive, as indexable content. The content between `` and `` contains the document id. The exception to this is terms that appear in `SGML` tags, of which only the tag itself is stored uppercased to allow for focussed retrieval. A term is defined as either a sequence of alpha characters, or a sequence of numeric characters, the definitions of which come from Unicode version 6.0. 14 | 15 | As the .gov2 collection is supposed to contain only ASCII data, the `-iscrub:an` option is specified. This option will replace the `NUL` character, and other non-ASCII characters with a space. This prevents malformed data getting into the index. No other content filtering other than this is applied. 16 | 17 | Finally, the indexer uses an s-stripping stemmer. 18 | 19 | The script generates two indexes using these options, the first is a quantized index, which pre-calculates the retrieval scores and stores them in the index rather than having to be calculated at search time. The `-Q` parameter identifies the ranking function, and the `-q` parameter stores the quantized values. The second index is missing these parameters as it is an unquantized index. 20 | 21 | Finally, there are a number of arguments to the indexer. These determine the locations for the recursive searching to take place. In the case of the gov2 script, each sub-folder of the gov2 collection is recursively searched for files matching the pattern `*.gz`. 
The number of arguments to the indexer determines one of the degrees of parallelism, the other is static. This combination has been shown empirically to perform better than other combinations. 22 | 23 | Searching 24 | --------- 25 | 26 | The search is performed to completion for both indexes, for all query sets. A run file is generated for each query set, with the top-1000 results presented, which can be evaluated using traditional TREC tools. 27 | 28 | The first search command is targeted at high efficiency. To do this it uses the quantized index (specified with `-findex `), which sets the ranking function internally. 29 | 30 | Counterintuitively, the top-k parameter is not used for the speed baseline, as our experiments suggest that at high values of `k` this has an adverse effect on speed. The second argument to the efficiency baseline is the `-M` flag, which tells the search program to load the entire index into memory at startup. 31 | 32 | The second search command is targeted at high effectiveness. It uses the default index filename (`index.aspt`), which is generated by the non-quantized indexer. 33 | 34 | The ranking function for the second search is left to the default BM25 implementation with ATIRE, with the default parameters of `k1=0.9, b=0.4`. 35 | 36 | The second parameter, `-Qr`, to the effectiveness search specifies to use Rocchio blind relevance feedback. There are optional parameters to this argument to specify the number of documents to analyse, and the number of terms to extract. These are left at the defaults of `17` documents, and `5` terms. 37 | 38 | The remainder of the arguments are shared among both searches. These arguments specify to print statistics (`-sa`), and set up the query type (`-QN:t` -- `` fields from a TREC topic file), the file containing the queries (`-q <filepath>`), and options related to generating the run file. 
Each run file is named `atire.<query set>.<speed|precision>.txt`, and the search statistics are redirected to `<query set>.<speed|precision>.search_stats.txt`. 39 | -------------------------------------------------------------------------------- /systems/ATIRE/cw09.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | source setup.sh 6 | 7 | CW09B_FILES=$(find $CW09B_LOCATION -mindepth 1 -maxdepth 1 -type d -printf '%p/*.warc.gz ') 8 | 9 | BASE_INDEX="stdbuf -oL ./bin/index -N1000000 -sa -rrwarcgz -iscrub:un -kt" 10 | ${BASE_INDEX} -findex cw09_index.aspt ${CW09B_FILES[@]} | tee cw09_indexing.txt 11 | ${BASE_INDEX} -QBM25 -q -findex cw09_quantized.aspt ${CW09B_FILES[@]} | tee cw09_quantized.indexing.txt 12 | 13 | for index in "cw09_index.aspt" "cw09_quantized.aspt" 14 | do 15 | for queries in "1-50" 16 | do 17 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 18 | qrel_file=../$TOPICS_QRELS/prels.web.${queries}.txt 19 | stat_file=${index}.${queries}.search_stats.txt 20 | run_file=atire.${index}.${queries}.txt 21 | eval_file=eval.${index}.${queries}.txt 22 | 23 | echo "Searching queries ${queries} on index ${index}" 24 | ./bin/atire -findex ${index} -sa -QN:q -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 25 | ../${SAP_EVAL} ${qrel_file} ${run_file} > ${eval_file} 26 | done 27 | done 28 | 29 | for index in "cw09_index.aspt" "cw09_quantized.aspt" 30 | do 31 | for queries in "51-100" "101-150" "151-200" 32 | do 33 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 34 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 35 | stat_file=${index}.${queries}.search_stats.txt 36 | run_file=atire.${index}.${queries}.txt 37 | eval_file=eval.${index}.${queries}.txt 38 | 39 | echo "Searching queries ${queries} on index ${index}" 40 | ./bin/atire -findex ${index} -sa -QN:q -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 41 | 
../${TREC_EVAL} ${qrel_file} ${run_file} > ${eval_file} 42 | ../${GD_EVAL} -c -traditional ${qrel_file} ${run_file} >> ${eval_file} 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /systems/ATIRE/cw12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ../common.sh 5 | source setup.sh 6 | 7 | set +o noglob 8 | CW12B_FILES=$(find $CW12B_LOCATION/ClueWeb* -mindepth 1 -maxdepth 1 -type d -printf '%p/*.gz ') 9 | set -o noglob 10 | 11 | BASE_INDEX="stdbuf -oL ./bin/index -N1000000 -sa -rrwarcgz -iscrub:un -kt" 12 | ${BASE_INDEX} -findex cw12_index.aspt ${CW12B_FILES[@]} | tee cw12_indexing.txt 13 | ${BASE_INDEX} -QBM25 -q -findex cw12_quantized.aspt ${CW12B_FILES[@]} | tee cw12_quantized.indexing.txt 14 | 15 | for index in "cw12_index.aspt" "cw12_quantized.aspt" 16 | do 17 | for queries in "201-250" "251-300" 18 | do 19 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 20 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 21 | stat_file=${index}.${queries}.search_stats.txt 22 | run_file=atire.${index}.${queries}.txt 23 | eval_file=eval.${index}.${queries}.txt 24 | 25 | echo "Searching queries ${queries} on index ${index}" 26 | ./bin/atire -findex ${index} -sa -QN:q -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 27 | ../${TREC_EVAL} ${qrel_file} ${run_file} > ${eval_file} 28 | ../${GD_EVAL} -c -traditional ${qrel_file} ${run_file} >> ${eval_file} 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /systems/ATIRE/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | source setup.sh 6 | 7 | GOV2_FILES=$(find $GOV2_LOCATION -mindepth 1 -maxdepth 1 -type d -name 'GX*' -printf '%p/*.gz ') 8 | 9 | BASE_INDEX="stdbuf -oL ./bin/index -N1000000 -sa -rrtrec -iscrub:an 
-ts -kt" 10 | ${BASE_INDEX} -findex dg2_index.aspt ${GOV2_FILES[@]} | tee dg2_indexing.txt 11 | ${BASE_INDEX} -QBM25 -q -findex dg2_quantized.aspt ${GOV2_FILES[@]} | tee dg2_quantized.indexing.txt 12 | 13 | for index in "dg2_index.aspt" "dg2_quantized.aspt" 14 | do 15 | for queries in "701-750" "751-800" "801-850" 16 | do 17 | query_file=../$TOPICS_QRELS/topics.${queries}.txt 18 | qrel_file=../$TOPICS_QRELS/qrels.${queries}.txt 19 | stat_file=${index}.${queries}.search_stats.txt 20 | run_file=atire.${index}.${queries}.txt 21 | eval_file=eval.${index}.${queries}.txt 22 | 23 | echo "Searching queries ${queries} on index ${index}" 24 | ./bin/atire -findex ${index} -sa -QN:t -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 25 | ../$TREC_EVAL ${qrel_file} ${run_file} > ${eval_file} 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /systems/ATIRE/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ ! -d "atire" ]]; then 4 | hg clone http://www.atire.org/hg/atire -r f3102a7a5848 5 | fi 6 | 7 | cd atire 8 | 9 | make clean all 10 | -------------------------------------------------------------------------------- /systems/JASS/cw09.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ../common.sh 4 | source setup.sh 5 | 6 | if [[ ! 
-f ../${ATIRE_DIR}/cw09_quantized.aspt ]]; then 7 | echo "Must have built an ATIRE quantized index" 8 | echo "Looked for: ../${ATIRE_DIR}/cw09_quantized.aspt" 9 | exit 10 | fi 11 | 12 | START=$(date +%s) 13 | ./atire_to_jass_index ../${ATIRE_DIR}/cw09_quantized.aspt -Q 14 | END=$(date +%s) 15 | 16 | echo "'Indexing' took:" $((END - START)) "seconds" 17 | 18 | for queries in "51-100" "101-150" "151-200" 19 | do 20 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 21 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 22 | stat_file=jass.${queries}.search_stats.txt 23 | run_file=${queries}.txt 24 | eval_file=eval.${queries}.txt 25 | 26 | ./trec2query/trec2query ${query_file} q -s s > ${queries}.txt 27 | 28 | echo "Searching queries ${queries} to 1B postings" 29 | ./jass ${queries}.txt 1000 1000000000 -d > comp.${stat_file} 30 | mv ranking.txt comp.jass.${run_file} 31 | ../$TREC_EVAL ${qrel_file} comp.jass.${run_file} > comp.${eval_file} 32 | 33 | echo "Searching queries ${queries} to 5M postings" 34 | ./jass ${queries}.txt 1000 5000000 -d > heur.${stat_file} 35 | mv ranking.txt heur.jass.${run_file} 36 | ../$TREC_EVAL ${qrel_file} heur.jass.${run_file} > heur.${eval_file} 37 | done 38 | -------------------------------------------------------------------------------- /systems/JASS/cw12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ../common.sh 4 | source setup.sh 5 | 6 | if [[ !
-f ../${ATIRE_DIR}/cw12_quantized.aspt ]]; then 7 | echo "Must have built an ATIRE quantized index" 8 | echo "Looked for: ../${ATIRE_DIR}/cw12_quantized.aspt" 9 | exit 10 | fi 11 | 12 | START=$(date +%s) 13 | ./atire_to_jass_index ../${ATIRE_DIR}/cw12_quantized.aspt -Q 14 | END=$(date +%s) 15 | 16 | echo "'Indexing' took:" $((END - START)) "seconds" 17 | 18 | for queries in "201-250" "251-300" 19 | do 20 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 21 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 22 | stat_file=jass.${queries}.search_stats.txt 23 | run_file=${queries}.txt 24 | eval_file=eval.${queries}.txt 25 | 26 | ./trec2query/trec2query ${query_file} q > ${queries}.txt 27 | 28 | echo "Searching queries ${queries} to 1B postings" 29 | ./jass ${queries}.txt 1000 1000000000 -d > comp.${stat_file} 30 | mv ranking.txt comp.jass.${run_file} 31 | ../$TREC_EVAL ${qrel_file} comp.jass.${run_file} > comp.${eval_file} 32 | 33 | echo "Searching queries ${queries} to 5m postings" 34 | ./jass ${queries}.txt 1000 5000000 -d > heur.${stat_file} 35 | mv ranking.txt heur.jass.${run_file} 36 | ../$TREC_EVAL ${qrel_file} heur.jass.${run_file} > heur.${eval_file} 37 | done 38 | -------------------------------------------------------------------------------- /systems/JASS/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ../common.sh 4 | source setup.sh 5 | 6 | if [[ !
-f ../${ATIRE_DIR}/dg2_quantized.aspt ]]; then 7 | echo "Must have built an ATIRE quantized index" 8 | echo "Looked for: ../${ATIRE_DIR}/dg2_quantized.aspt" 9 | exit 10 | fi 11 | 12 | START=$(date +%s) 13 | ./atire_to_jass_index ../${ATIRE_DIR}/dg2_quantized.aspt -Q 14 | END=$(date +%s) 15 | 16 | echo "'Indexing' took:" $((END - START)) "seconds" 17 | 18 | for queries in "701-750" "751-800" "801-850" 19 | do 20 | query_file=../$TOPICS_QRELS/topics.${queries}.txt 21 | qrel_file=../$TOPICS_QRELS/qrels.${queries}.txt 22 | stat_file=jass.${queries}.search_stats.txt 23 | run_file=${queries}.txt 24 | eval_file=eval.${queries}.txt 25 | 26 | ./trec2query/trec2query ${query_file} t -s s >| ${queries}.txt 27 | 28 | echo "Searching queries ${queries} to 1B postings" 29 | ./jass ${queries}.txt 1000 1000000000 -d >| comp.${stat_file} 30 | mv ranking.txt comp.jass.${run_file} 31 | ../$TREC_EVAL ${qrel_file} comp.jass.${run_file} >| comp.${eval_file} 32 | 33 | echo "Searching queries ${queries} to 2.5M postings" 34 | ./jass ${queries}.txt 1000 2500000 -d >| heur.${stat_file} 35 | mv ranking.txt heur.jass.${run_file} 36 | ../$TREC_EVAL ${qrel_file} heur.jass.${run_file} >| heur.${eval_file} 37 | done 38 | -------------------------------------------------------------------------------- /systems/JASS/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ATIRE_DIR="../ATIRE/atire" 4 | 5 | if [[ ! -d ${ATIRE_DIR} ]]; then 6 | echo "ATIRE is a prerequisite for JASS" 7 | exit 8 | fi 9 | 10 | if [[ !
-d JASS ]]; then 11 | git clone https://github.com/lintool/JASS.git 12 | git -C JASS checkout -q b27b319 13 | fi 14 | 15 | cd JASS 16 | make ATIRE_DIR=../${ATIRE_DIR} 17 | make -C trec2query ATIRE_DIR=../../${ATIRE_DIR} 18 | -------------------------------------------------------------------------------- /systems/MG4J/README.md: -------------------------------------------------------------------------------- 1 | Scripts 2 | ======= 3 | 4 | For each collection, there is an -index.sh and an -index-pos.sh script 5 | that will build a non-positional index and a positional index, respectively. 6 | The scripts will print on standard output the construction time. All 7 | scripts use parallel instances and log in a number of *.err files what 8 | is happening in each parallel instance. 9 | 10 | For each collection, there is an -eval.sh script that uses a non-positional 11 | index and Model B, an -eval-pos.sh script that uses a positional index and Model B+, 12 | and finally a -bm25.sh that performs a baseline BM25 run. Each script saves 13 | in eval.$queries.txt the results of evaluation and in time.$queries.txt the 14 | overall query time in milliseconds. 15 | 16 | Size 17 | ==== 18 | 19 | A non-positional index is formed by the .properties file, the .titles 20 | file, the .pointers[offsets] files, the .counts[offsets] files, the .sizes 21 | file and the .termmap file. A positional index in addition uses the 22 | .positions[offsets] files. 23 | 24 | Metadata 25 | ======== 26 | 27 | All metadata is contained in the .properties file (which is a standard, 28 | self-describing Java property file). 
29 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-bm25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Plain BM25 on disjunctive queries (mainly as a baseline) 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "201-250" "251-300" 13 | do 14 | topics=$TOPICS_QRELS/topics.web.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.web.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 20 | fgrep "<query>" $topics | sed 's/<.\?query>//g;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(a\|the\|in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(sed -e 's/[ ]\+/|/g' <titles.$queries.txt | awk "BEGIN {i = ${queries%-*} } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" ) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/cw12-text -T $WORK_DIR/cw12.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-eval-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n *positional* queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 
11 | 12 | for queries in "201-250" "251-300" 13 | do 14 | topics=$TOPICS_QRELS/topics.web.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.web.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 20 | fgrep "<query>" $topics | sed 's/<.\?query>//g;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U\\.S\\./US/g;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueriespos.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/cw12-text -T $WORK_DIR/cw12.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "201-250" "251-300" 13 | do 14 | topics=$TOPICS_QRELS/topics.web.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.web.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 
20 | fgrep "<query>" $topics | sed 's/<.\?query>//g;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueries.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/cw12-text -T $WORK_DIR/cw12.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-index-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #sudo apt-add-repository -y ppa:webupd8team/java 5 | #sudo apt-get -y update 6 | #sudo apt-get -y install oracle-java8-installer 7 | #sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! 
-f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/cw12.titles $WORK_DIR/cw12-text.* $WORK_DIR/cw12-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $CW12B_LOCATION -iname \*.gz -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.WarcDocumentSequence \ 40 | -z -f it.unimi.di.big.mg4j.document.HtmlDocumentFactory -p encoding=iso-8859-1 $WORK_DIR/cw12-$split.sequence $(cat $split) 41 | 42 | # Do not check version. Use BURL to sanitize non-conformant URLs. 43 | 44 | java -Xmx7512M -server -Dit.unimi.di.law.warc.io.version=false -Dit.unimi.di.law.warc.records.useburl=true \ 45 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/cw12-$split.sequence -t EnglishStemmer -I text $WORK_DIR/cw12-$split >$split.out 2>$split.err 46 | 47 | )& 48 | 49 | done 50 | 51 | wait) 52 | 53 | # Check that all instances have completed 54 | 55 | if (( $(find -iname cw12-split-\*-text.cluster.properties | wc -l) != 16 )); then 56 | echo "ERROR: Some instance did not complete correctly" 1>&2 57 | exit 1 58 | fi 59 | 60 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate $WORK_DIR/cw12-text \ 61 | $(find $WORK_DIR -iname cw12-split-\*-text@\*.sizes | sort | sed s/.sizes//) 62 | cat $(find $WORK_DIR -iname cw12-split-\*.titles | sort) >$WORK_DIR/cw12.titles 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/cw12-text.mwhc $WORK_DIR/cw12-text.terms 65 | 66 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/cw12-text.mwhc 
$WORK_DIR/cw12-text.termmap 67 | 68 | 69 | endtime=$(date +%s) 70 | 71 | echo "Indexing time: $((endtime-starttime))s" 72 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #sudo apt-add-repository -y ppa:webupd8team/java 5 | #sudo apt-get -y update 6 | #sudo apt-get -y install oracle-java8-installer 7 | #sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! -f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/cw12.titles $WORK_DIR/cw12-text.* $WORK_DIR/cw12-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $CW12B_LOCATION -iname \*.gz -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.WarcDocumentSequence \ 40 | -z -f it.unimi.di.big.mg4j.document.HtmlDocumentFactory -p encoding=iso-8859-1 $WORK_DIR/cw12-$split.sequence $(cat $split) 41 | 42 | # Do not check version. Use BURL to sanitize non-conformant URLs. 
43 | 44 | java -Xmx7512M -server -Dit.unimi.di.law.warc.io.version=false -Dit.unimi.di.law.warc.records.useburl=true \ 45 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/cw12-$split.sequence -t EnglishStemmer -I text -c COUNTS $WORK_DIR/cw12-$split >$split.out 2>$split.err 46 | 47 | )& 48 | 49 | done 50 | 51 | wait) 52 | 53 | # Check that all instances have completed 54 | 55 | if (( $(find -iname cw12-split-\*-text.cluster.properties | wc -l) != 16 )); then 56 | echo "ERROR: Some instance did not complete correctly" 1>&2 57 | exit 1 58 | fi 59 | 60 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate -c POSITIONS:NONE $WORK_DIR/cw12-text \ 61 | $(find $WORK_DIR -iname cw12-split-\*-text@\*.sizes | sort | sed s/.sizes//) 62 | cat $(find $WORK_DIR -iname cw12-split-\*.titles | sort) >$WORK_DIR/cw12.titles 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/cw12-text.mwhc $WORK_DIR/cw12-text.terms 65 | 66 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/cw12-text.mwhc $WORK_DIR/cw12-text.termmap 67 | 68 | 69 | endtime=$(date +%s) 70 | 71 | echo "Indexing time: $((endtime-starttime))s" 72 | -------------------------------------------------------------------------------- /systems/MG4J/genqueries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./gensubsets.rb | awk "BEGIN {i = $1 } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" 4 | -------------------------------------------------------------------------------- /systems/MG4J/genqueriespos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./gensubsetspos.rb | awk "BEGIN {i = $1 } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" 4 | -------------------------------------------------------------------------------- /systems/MG4J/gensubsets.rb: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | $stdin.each_line do |l| 4 | a = [] 5 | l.scan(/\w+/) do |w| 6 | if w == "us" then w = "\"u s\"" end 7 | a << w 8 | end 9 | 10 | andthen = [] 11 | (a.length+1).times do |n| 12 | disj = [] 13 | if n == 0 then next end 14 | (2**(a.length)).times do |x| 15 | if x == 0 then next end 16 | item = [] 17 | a.length.times do |index| 18 | if ( x & (2**index) != 0 ) then item << a[index] end 19 | end 20 | 21 | if item.length == n then disj << ( "(" + item.join(" & " ) + ")" ) end 22 | end 23 | andthen << disj.join( " | " ) 24 | end 25 | 26 | puts andthen.reverse.join( ", " ); 27 | end 28 | -------------------------------------------------------------------------------- /systems/MG4J/gensubsetspos.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | $stdin.each_line do |l| 4 | a = [] 5 | l.scan(/\w+/) do |w| 6 | if w == "US" then w = "\"u s\"" end 7 | a << w 8 | end 9 | 10 | andthen = [] 11 | (a.length+1).times do |n| 12 | window = [] 13 | conj = [] 14 | if n == 0 then next end 15 | (2**(a.length)).times do |x| 16 | if x == 0 then next end 17 | item = [] 18 | a.length.times do |index| 19 | if ( x & (2**index) != 0 ) then item << a[index] end 20 | end 21 | 22 | if item.length == n then 23 | if n > 1; then 24 | window << ( "(" + item.join(" & " ) + ")~" + (2*n).to_s ) 25 | end 26 | conj<< ( "(" + item.join(" & " ) + ")" ) 27 | end 28 | end 29 | andthen << conj.join( " | " ) 30 | if n > 1; then 31 | andthen << window.join( " | " ) 32 | end 33 | end 34 | 35 | puts andthen.reverse.join( ", " ); 36 | end 37 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-bm25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Plain BM25 on disjunctive queries (mainly as a baseline) 5 | 6 | source ../common.sh 7 
| 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "701-750" "751-800" "801-850" 13 | do 14 | topics=$TOPICS_QRELS/topics.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 20 | fgrep -A1 "<title>" $topics | sed 's/<title>//;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(sed -e 's/[ ]\+/|/g' <titles.$queries.txt | awk "BEGIN {i = ${queries%-*} } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" ) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/gov2-text -T $WORK_DIR/gov2.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-eval-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n *positional* queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "701-750" "751-800" "801-850" 13 | do 14 | topics=$TOPICS_QRELS/topics.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => US, etc.) 
20 | fgrep -A1 "<title>" $topics | sed 's/<title>//;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U\\.S\\./US/g;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueriespos.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/gov2-text -T $WORK_DIR/gov2.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "701-750" "751-800" "801-850" 13 | do 14 | topics=$TOPICS_QRELS/topics.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 
20 | fgrep -A1 "<title>" $topics | sed 's/<title>//;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueries.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/gov2-text -T $WORK_DIR/gov2.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-index-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | sudo apt-add-repository -y ppa:webupd8team/java 5 | sudo apt-get -y update 6 | sudo apt-get -y install oracle-java8-installer 7 | sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! 
-f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/gov2.titles $WORK_DIR/gov2-text.* $WORK_DIR/gov2-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $GOV2_LOCATION -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.TRECDocumentCollection \ 40 | -f HtmlDocumentFactory -p encoding=iso-8859-1 -z $WORK_DIR/gov2-$split.collection $(cat $split) 41 | 42 | java -Xmx7512M -server \ 43 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/gov2-$split.collection -t EnglishStemmer -I text $WORK_DIR/gov2-$split >$split.out 2>$split.err 44 | 45 | )& 46 | 47 | done 48 | 49 | wait) 50 | 51 | # Check that all instances have completed 52 | 53 | if (( $(find -iname gov2-split-\*-text.cluster.properties | wc -l) != 16 )); then 54 | echo "ERROR: Some instance did not complete correctly" 1>&2 55 | exit 1 56 | fi 57 | 58 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate $WORK_DIR/gov2-text \ 59 | $(find $WORK_DIR -iname gov2-split-\*-text@\*.sizes | sort | sed s/.sizes//) 60 | cat $(find $WORK_DIR -iname gov2-split-\*.titles | sort) >$WORK_DIR/gov2.titles 61 | 62 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.terms 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.termmap 65 | 66 | endtime=$(date +%s) 67 | 68 | echo "Indexing time: $((endtime-starttime))s" 69 | 
-------------------------------------------------------------------------------- /systems/MG4J/gov2-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #sudo apt-add-repository -y ppa:webupd8team/java 5 | #sudo apt-get -y update 6 | #sudo apt-get -y install oracle-java8-installer 7 | #sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! -f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/gov2.titles $WORK_DIR/gov2-text.* $WORK_DIR/gov2-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $GOV2_LOCATION -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.TRECDocumentCollection \ 40 | -z -f HtmlDocumentFactory -p encoding=iso-8859-1 $WORK_DIR/gov2-$split.collection $(cat $split) 41 | 42 | java -Xmx7512M -server \ 43 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/gov2-$split.collection -t EnglishStemmer -I text -c COUNTS $WORK_DIR/gov2-$split >$split.out 2>$split.err 44 | 45 | )& 46 | 47 | done 48 | 49 | wait) 50 | 51 | # Check that all instances have completed 52 | 53 | if (( $(find -iname gov2-split-\*-text.cluster.properties | wc -l) != 16 )); then 54 | echo "ERROR: Some instance did not complete correctly" 1>&2 55 | exit 1 56 | fi 57 | 58 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate -c POSITIONS:NONE $WORK_DIR/gov2-text \ 59 | $(find $WORK_DIR -iname 
gov2-split-\*-text@\*.sizes | sort | sed s/.sizes//) 60 | cat $(find $WORK_DIR -iname gov2-split-\*.titles | sort) >$WORK_DIR/gov2.titles 61 | 62 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.terms 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.termmap 65 | 66 | endtime=$(date +%s) 67 | 68 | echo "Indexing time: $((endtime-starttime))s" 69 | -------------------------------------------------------------------------------- /systems/MG4J/logback.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | <appender name="stderr" class="ch.qos.logback.core.ConsoleAppender"> 4 | <target>System.err</target> 5 | <encoder> 6 | <pattern>%d %r %p [%t] %logger{1} - %m%n</pattern> 7 | </encoder> 8 | </appender> 9 | <root level="INFO"> 10 | <appender-ref ref="stderr"/> 11 | </root> 12 | </configuration> 13 | -------------------------------------------------------------------------------- /systems/common.sh: -------------------------------------------------------------------------------- 1 | GOV2_LOCATION=/media/Gov2/data 2 | CW09B_LOCATION=/media/ClueWeb09b/ClueWeb09_English_1 3 | CW12B_LOCATION=/media/ClueWeb12-B13/DiskB 4 | 5 | TOPICS_QRELS=../../topics-and-qrels/ 6 | TREC_EVAL=../../eval/trec_eval.9.0/trec_eval 7 | SAP_EVAL=../../eval/statAP_MQ_eval_v3.pl 8 | GD_EVAL=../../eval/gdeval 9 | 10 | # define JAVA_HOME for those tools that need it 11 | export JAVA_HOME='/usr/lib/jvm/java-8-oracle/' 12 | 13 | # Build trec eval if it has not been 14 | if [[ ! -f ${TREC_EVAL} ]]; then 15 | tar xzf ../../eval/trec_eval.9.0.tar.gz -C ../../eval 16 | make -C ../../eval/trec_eval.9.0/ 17 | fi 18 | TREC_EVAL="${TREC_EVAL} -q" 19 | 20 | # Get statMAP eval tool for the 2009 queries 21 | if [[ ! 
-f ${SAP_EVAL} ]]; then 22 | curl http://trec.nist.gov/data/web/09/statAP_MQ_eval_v3.pl > ${SAP_EVAL} 23 | chmod +x ${SAP_EVAL} 24 | fi 25 | 26 | # Get the gdeval tool for ERR, nDCG 27 | # This is the latest version I could find 28 | if [[ ! -f ${GD_EVAL} ]]; then 29 | curl https://raw.githubusercontent.com/trec-web/trec-web-2014/master/src/eval/gdeval.pl > ${GD_EVAL} 30 | chmod +x ${GD_EVAL} 31 | fi 32 | -------------------------------------------------------------------------------- /systems/galago/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -efu 3 | 4 | source ../common.sh 5 | 6 | URL="https://sourceforge.net/projects/lemur/files/lemur/galago-3.8/galago-3.8-bin.tar.gz/download?use_mirror=autoselect" 7 | GALAGO_ARCHIVE="galago.tar.gz" 8 | GALAGO_DIR='galago-3.8-bin' 9 | TMPDIR="tmp/" 10 | GALAGO="${GALAGO_DIR}/bin/galago" 11 | 12 | if [[ ! -f ${GALAGO_ARCHIVE} ]]; then 13 | wget ${URL} -O ${GALAGO_ARCHIVE} 14 | fi 15 | 16 | if [[ ! -f ${GALAGO} ]]; then 17 | rm -rf ${GALAGO_DIR} 18 | tar -xf ${GALAGO_ARCHIVE} 19 | fi 20 | 21 | 22 | mkdir -p ${TMPDIR} 23 | export JAVA_OPTS="-Djava.io.tmpdir=${TMPDIR} -Xmx7g" 24 | chmod +x ${GALAGO} 25 | 26 | # build index if not already: 27 | INDEX_PATH=gov2.galago 28 | LOG_FILE=build_index.log 29 | 30 | if [[ ! 
-f ${INDEX_PATH}/buildManifest.json ]]; then 31 | ${GALAGO} build --server=true --mode=fork --distrib=16 --filetype=trecweb --nonStemmedPostings=false --stemmedPostings=true --stemmedCounts=true --corpus=false --inputPath=${GOV2_LOCATION} --indexPath=${INDEX_PATH} 1> >(tee ${LOG_FILE}.stdout) 2> >(tee ${LOG_FILE}.stderr) 32 | fi 33 | 34 | rm -rf ${TMPDIR} # remove any lingering temporary files 35 | 36 | for method in combine sdm; do 37 | for queries in "701-750" "751-800" "801-850"; do 38 | query_file=$TOPICS_QRELS/topics.${queries}.txt 39 | qrel_file=$TOPICS_QRELS/qrels.${queries}.txt 40 | query_json=q${queries}.${method}.json 41 | python2 make_query_json.py $method $query_file > $query_json # generate title queries 42 | run_file=galago${queries}.${method}.trecrun 43 | 44 | if [[ ! -f ${run_file} ]]; then 45 | ${GALAGO} timed-batch-search ${query_json} --repeats=1 --requested=1000 --index=${INDEX_PATH} --outputFile=${run_file} --timesFile=${run_file}.times 46 | fi 47 | $TREC_EVAL ${qrel_file} ${run_file} > galago${queries}.${method}.treceval 48 | done 49 | done 50 | -------------------------------------------------------------------------------- /systems/galago/make_query_json.py: -------------------------------------------------------------------------------- 1 | import sys, json 2 | 3 | operator = sys.argv[1] 4 | queries = [] 5 | inTopic = False 6 | number=None 7 | query = None 8 | 9 | def makeCombineQuery(query): 10 | terms = ['#dirichlet(%s)' % x for x in query.split()] 11 | return '#combine('+' '.join(terms)+')' 12 | 13 | def makeSDMQuery(query): 14 | return '#sdm('+' '.join(query.split())+')' 15 | 16 | with open(sys.argv[2]) as fp: 17 | for line in fp: 18 | 19 | if not inTopic: 20 | if line.startswith('<top>'): 21 | inTopic = True 22 | continue 23 | # inTopic=True 24 | if line.startswith('</top>'): 25 | if operator == 'combine': 26 | queries += [{ 'number': number, 'text': makeCombineQuery(query) }] 27 | elif operator=='sdm': 28 | queries += [{ 'number': 
number, 'text': makeSDMQuery(query) }] 29 | inTopic = False 30 | continue 31 | if line.startswith('<num>'): 32 | number = line.split('Number: ')[1].strip() 33 | if line.startswith('<title>'): 34 | query = line[len('<title>'):].strip().replace(".", "") 35 | if not query: 36 | query = next(fp).strip().replace(".","") 37 | #print line; 38 | 39 | queries_json = {'queries': queries} 40 | print json.dumps(queries_json) 41 | -------------------------------------------------------------------------------- /systems/indri/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remove all temporary, intermediate and result folders and files 4 | 5 | # 2015 July 22 - James Valenti/Jamie Callan Carnegie Mellon University 6 | 7 | # indri 8 | # DO NOT REMOVE INDEX (unless you really intend to) 9 | # 10 | #rm -f ./build_index.log.stderr 11 | #rm -f ./build_index.log.stdout 12 | #rm -f ./indri-5.3.tar.gz 13 | #rm -rf ./indri-5.3 14 | 15 | # queries 16 | # these are typically removed between query parameter adjustments and benchmarking runs 17 | rm -fr ./queries 18 | rm -fr ./query_results 19 | rm -f ./results 20 | rm -fr ./scores 21 | -------------------------------------------------------------------------------- /systems/indri/dm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # 4 | # Perl subroutine that generates Indri dependence model queries. 5 | # 6 | # Written by: Don Metzler (metzler@cs.umass.edu) 7 | # Last updated: 06/27/2005 8 | # 9 | # Feel free to distribute, edit, modify, or mangle this code as you see fit. If you make any interesting 10 | # changes please email me a copy. 11 | # 12 | # For more technical details, see: 13 | # 14 | # * Metzler, D. and Croft, W.B., "A Markov Random Field Model for Term Dependencies," ACM SIGIR 2005. 
15 | # 16 | # * Metzler, D., Strohman T., Turtle H., and Croft, W.B., "Indri at TREC 2004: Terabyte Track", TREC 2004. 17 | # 18 | # * http://ciir.cs.umass.edu/~metzler/ 19 | # 20 | # NOTES 21 | # 22 | # * this script assumes that the query string has already been parsed and that all characters 23 | # that are not compatible with Indri's query language have been removed. 24 | # 25 | # * it is not advisable to do a 'full dependence' variant on long strings because of the exponential 26 | # number of terms that will result. it is suggested that the 'sequential dependence' variant be 27 | # used for long strings. either that, or split up long strings into smaller cohesive chunks and 28 | # apply the 'full dependence' variant to each of the chunks. 29 | # 30 | # * the unordered features use a window size of 4 * number of terms within the phrase. this has been 31 | # found to work well across a wide range of collections and topics. however, this may need to be 32 | # modified on an individual basis. 33 | # 34 | 35 | # example usage 36 | #print formulate_query( "white house rose garden", "sd", 0.5, 0.25, 0.25 ) . "\n\n"; 37 | #print formulate_query( "white house rose garden", "fd", 0.8, 0.1, 0.1 ) . "\n\n"; 38 | 39 | my $file = $ARGV[0]; 40 | open my $info, $file or die "Could not open $file: $!"; 41 | 42 | while( my $line = <$info>) { 43 | print formulate_query( $line, "sd", 0.7, 0.2, 0.1 ) . "\n"; 44 | #last if $. == 2; 45 | } 46 | 47 | close $info; 48 | 49 | # 50 | # formulates a query based on query text and feature weights 51 | # 52 | # arguments: 53 | # * query - string containing original query terms separated by spaces 54 | # * type - string. "sd" for sequential dependence or "fd" for full dependence variant. defaults to "fd". 
55 | # * wt[0] - weight assigned to term features 56 | # * wt[1] - weight assigned to ordered (#1) features 57 | # * wt[2] - weight assigned to unordered (#uw) features 58 | # 59 | sub formulate_query { 60 | my ( $q, $type, @wt ) = @_; 61 | 62 | # trim whitespace from beginning and end of query string 63 | $q =~ s/^\s+|\s+$//g; 64 | 65 | my $queryT = "#combine( "; 66 | my $queryO = "#combine("; 67 | my $queryU = "#combine("; 68 | 69 | # generate term features (f_T) 70 | my @terms = split(/\s+/ , $q); 71 | my $term; 72 | foreach $term ( @terms ) { 73 | $queryT .= "$term "; 74 | } 75 | 76 | my $num_terms = @terms; 77 | 78 | # skip the rest of the processing if we're just 79 | # interested in term features or if we only have 1 term 80 | if( ( $wt[1] == 0.0 && $wt[2] == 0.0 ) || $num_terms == 1 ) { 81 | return $queryT . ")"; 82 | } 83 | 84 | # generate the rest of the features 85 | my $start = 1; 86 | if( $type eq "sd" ) { $start = 3; } 87 | for( my $i = $start ; $i < 2 ** $num_terms ; $i++ ) { 88 | my $bin = unpack("B*", pack("N", $i)); # create binary representation of i 89 | my $num_extracted = 0; 90 | my $extracted_terms = ""; 91 | 92 | # get query terms corresponding to 'on' bits 93 | for( my $j = 0 ; $j < $num_terms ; $j++ ) { 94 | my $bit = substr($bin, $j - $num_terms, 1); 95 | if( $bit eq "1" ) { 96 | $extracted_terms .= "$terms[$j] "; 97 | $num_extracted++; 98 | } 99 | } 100 | 101 | if( $num_extracted == 1 ) { next; } # skip these, since we already took care of the term features... 102 | if( $bin =~ /^0+11+[^1]*$/ ) { # words in contiguous phrase, ordered features (f_O) 103 | $queryO .= " #1( $extracted_terms) "; 104 | } 105 | $queryU .= " #uw" . 4*$num_extracted . 
"( $extracted_terms) "; # every subset of terms, unordered features (f_U) 106 | if( $type eq "sd" ) { $i *= 2; $i--; } 107 | } 108 | 109 | my $query = "#weight("; 110 | if( $wt[0] != 0.0 && $queryT ne "#combine( " ) { $query .= " $wt[0] $queryT)"; } 111 | if( $wt[1] != 0.0 && $queryO ne "#combine(" ) { $query .= " $wt[1] $queryO)"; } 112 | if( $wt[2] != 0.0 && $queryU ne "#combine(" ) { $query .= " $wt[2] $queryU)"; } 113 | 114 | if( $query eq "#weight(" ) { return ""; } # return "" if we couldn't formulate anything 115 | 116 | return $query . " )"; 117 | } 118 | -------------------------------------------------------------------------------- /systems/indri/index-clef.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #usage: ./index.sh language 4 | #language can be bg,de,es,fa,fi,fr,hu,it,nl,pt,ru,sv 5 | 6 | #set lemur bin directory 7 | lemurdir=/u/xiaojiex/ir/lemur/bin 8 | 9 | #set indexing parameter file directory 10 | indexParaDir=/u/xiaojiex/ir/excute/clefmono 11 | 12 | date 13 | echo "building index started" 14 | $lemurdir/IndriBuildIndex $indexParaDir/indexParaSP_$1 15 | echo "building index finished" 16 | date 17 | -------------------------------------------------------------------------------- /systems/indri/index-clef_ReadMe.txt: -------------------------------------------------------------------------------- 1 | Method Description: 2 | xiao jie liu(James Liu), the student of Professor Jian-Yun Nie, submits the CLEF baseline experiment results. James used Lemur/Indri as 3 | the basic retrieval platform and used a language model with Dirichlet Smoothing (mu=2000) as the retrieval model. James dealt with 12 languages 4 | for the topics and corpora. They are Hungarian(hu),Italian(it),Dutch(nl),Portuguese(pt),Russian(ru),Swedish(sv),German(de),Spanish(es),Finnish(fi), 5 | French(fr),Bulgarian(bg) and Persian(fa). 
6 | James Liu pre-processed the topics files and corpora by removing the stopwords and stemming the contents. Stopwords list files are provided 7 | by Maria Maistro (maistro@dei.unipd.it), you can download them from https://github.com/mmaistro/IR-Reproducibility/tree/mmaistro. The stemming methods 8 | used are from Professor Jacques.Savoy (Jacques.Savoy@unine.ch)(http://members.unine.ch/jacques.savoy/clef/index.html) for language Bulgarian; 9 | from Jonsafari (https://www.ling.ohio-state.edu/~jonsafari/persian_nlp.html) for language Persian; from Snowball (http://snowball.tartarus.org/) 10 | for other languages (Hungarian,Italian,Dutch,Portuguese,Russian,Swedish,German,Spanish,Finnish and French). 11 | In the final result, for each topic, there are 1000 documents returned. The format is the standard TREC format. 12 | 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_bg: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_bg/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SEGA2002_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/STANDART2002_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_de: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_de/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/FRANKFURTER1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | 
<path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SDA1994_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SPIEGEL1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SPIEGEL1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_es: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_es/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/EFE1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/EFE1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_fa: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_fa/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/HAMSHAHRI_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | </parameters> 9 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_fi: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_fi/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AAMULEHTI1994_UTF8_consdoc/</path> 6 | 
<class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AAMULEHTI1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_fr: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_fr/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/LEMONDE1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/LEMONDE1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ATS1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ATS1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_hu: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_hu/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/MAGYAR2002_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | </parameters> 9 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_it: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_it/</index> 4 | <corpus> 5 | 
<path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AGZ1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AGZ1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/LASTAMPA1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | </parameters> 17 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_nl: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_nl/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ALGEMEEN1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ALGEMEEN1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/NRC1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/NRC1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_pt: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_pt/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/FOLHA1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/FOLHA1995_UTF8_consdoc/</path> 10 | 
<class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/PUBLICO1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/PUBLICO1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_ru: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_ru/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/IZVESTIA1995_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | </parameters> 9 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_sv: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_sv/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/TT1994_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/TT1995_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/queryParaLMSP_ru: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_ru</index> 3 | <rule>method:dirichlet,mu:2000</rule> 4 | <count>1000</count> 5 | <query> 6 | <number>143-AH</number> 7 | <text>Конференц положен Женщин Пекин </text> 8 | </query> 9 | <query> 10 | <number>147-AH</number> 11 | <text>Нефтян авар Птиц </text> 12 | </query> 
13 | <query> 14 | <number>148-AH</number> 15 | <text>Разрушен озонов сло </text> 16 | </query> 17 | <query> 18 | <number>149-AH</number> 19 | <text>Виз Пап Римск ШриЛанк </text> 20 | </query> 21 | <query> 22 | <number>151-AH</number> 23 | <text>Чудес Древн Мир </text> 24 | </query> 25 | <query> 26 | <number>153-AH</number> 27 | <text>Олимпийск Игры Мир </text> 28 | </query> 29 | <query> 30 | <number>154-AH</number> 31 | <text>Свобод Слов Интернет </text> 32 | </query> 33 | <query> 34 | <number>155-AH</number> 35 | <text>Опасност Мобильн Телефон </text> 36 | </query> 37 | <query> 38 | <number>157-AH</number> 39 | <text>Победительниц Уимблдон </text> 40 | </query> 41 | <query> 42 | <number>163-AH</number> 43 | <text>Ограничен Куря </text> 44 | </query> 45 | <query> 46 | <number>164-AH</number> 47 | <text>Приговор Наркотик Европ </text> 48 | </query> 49 | <query> 50 | <number>168-AH</number> 51 | <text>Убийств Рабин </text> 52 | </query> 53 | <query> 54 | <number>169-AH</number> 55 | <text>Появлен устройств запис компактдиск </text> 56 | </query> 57 | <query> 58 | <number>172-AH</number> 59 | <text>Миров Рекорд Лёгко Атлетик 1995 </text> 60 | </query> 61 | <query> 62 | <number>176-AH</number> 63 | <text>ШумейкерЛев Юпитер </text> 64 | </query> 65 | <query> 66 | <number>177-AH</number> 67 | <text>Потреблен Молок Европ </text> 68 | </query> 69 | <query> 70 | <number>178-AH</number> 71 | <text>Отказ Несен Воен Служб </text> 72 | </query> 73 | <query> 74 | <number>179-AH</number> 75 | <text>Отставк Генеральн Секретар НАТО </text> 76 | </query> 77 | <query> 78 | <number>180-AH</number> 79 | <text>Банкротств Баринг Бразерс </text> 80 | </query> 81 | <query> 82 | <number>181-AH</number> 83 | <text>Французск Ядерн Испытан </text> 84 | </query> 85 | <query> 86 | <number>183-AH</number> 87 | <text>Ископа Остатк Динозавр Ази </text> 88 | </query> 89 | <query> 90 | <number>187-AH</number> 91 | <text>Ядерн Перевозк Герман </text> 92 | </query> 93 | <query> 94 | 
<number>192-AH</number> 95 | <text>Убийств Директор Российск Телекомпан </text> 96 | </query> 97 | <query> 98 | <number>193-AH</number> 99 | <text>ЕС Балтийск Государств </text> 100 | </query> 101 | <query> 102 | <number>197-AH</number> 103 | <text>Мирн Соглашен Дейтон </text> 104 | </query> 105 | <query> 106 | <number>198-AH</number> 107 | <text>Почётн Оскар Итальянск Режиссёр </text> 108 | </query> 109 | <query> 110 | <number>199-AH</number> 111 | <text>Эпидем Эбол Заир </text> 112 | </query> 113 | <query> 114 | <number>200-AH</number> 115 | <text>Наводнен Голланд Герман </text> 116 | </query> 117 | <query> 118 | <number>201-AH</number> 119 | <text>Домашн пожар </text> 120 | </query> 121 | <query> 122 | <number>202-AH</number> 123 | <text>Арест Ник Леесон </text> 124 | </query> 125 | <query> 126 | <number>203-AH</number> 127 | <text>Партизанск войн Восточн Тимор </text> 128 | </query> 129 | <query> 130 | <number>207-AH</number> 131 | <text>Травм причин фейерверк </text> 132 | </query> 133 | <query> 134 | <number>209-AH</number> 135 | <text>Победител Тур де Франс </text> 136 | </query> 137 | <query> 138 | <number>210-AH</number> 139 | <text>Кандидат Нобелевск прем </text> 140 | </query> 141 | <query> 142 | <number>211-AH</number> 143 | <text>Пограничн конфликт Пер Эквадор </text> 144 | </query> 145 | <query> 146 | <number>212-AH</number> 147 | <text>Спортсменк допинг </text> 148 | </query> 149 | <query> 150 | <number>213-AH</number> 151 | <text>Путешеств Пап </text> 152 | </query> 153 | <query> 154 | <number>214-AH</number> 155 | <text>Мультимиллиардер </text> 156 | </query> 157 | <query> 158 | <number>215-AH</number> 159 | <text>Повторн избран президент Пер </text> 160 | </query> 161 | <query> 162 | <number>216-AH</number> 163 | <text>Вдыхан кле подростк </text> 164 | </query> 165 | <query> 166 | <number>218-AH</number> 167 | <text>Андреотт маф </text> 168 | </query> 169 | <query> 170 | <number>220-AH</number> 171 | <text>Европейск автомоб Росс </text> 172 | 
</query> 173 | <query> 174 | <number>221-AH</number> 175 | <text>Олимпийск зимн игр 2002 </text> 176 | </query> 177 | <query> 178 | <number>224-AH</number> 179 | <text>Женщин соверша одиночн восхожден Эверест </text> 180 | </query> 181 | <query> 182 | <number>225-AH</number> 183 | <text>Атомн станц Соснов бор </text> 184 | </query> 185 | <query> 186 | <number>226-AH</number> 187 | <text>Изменен пол </text> 188 | </query> 189 | <query> 190 | <number>227-AH</number> 191 | <text>Алтайск амазонк </text> 192 | </query> 193 | <query> 194 | <number>228-AH</number> 195 | <text>Доисторическ искусств </text> 196 | </query> 197 | <query> 198 | <number>230-AH</number> 199 | <text>Стыковк Атлантис МИР </text> 200 | </query> 201 | <query> 202 | <number>231-AH</number> 203 | <text>Нов португальск премьерминистр </text> 204 | </query> 205 | <query> 206 | <number>232-AH</number> 207 | <text>Программ пенсион обеспечен Европ </text> 208 | </query> 209 | <query> 210 | <number>233-AH</number> 211 | <text>Парников эффект </text> 212 | </query> 213 | <query> 214 | <number>234-AH</number> 215 | <text>Глухот обществ </text> 216 | </query> 217 | <query> 218 | <number>235-AH</number> 219 | <text>Охот тюлен </text> 220 | </query> 221 | <query> 222 | <number>237-AH</number> 223 | <text>Панченлам </text> 224 | </query> 225 | <query> 226 | <number>238-AH</number> 227 | <text>Лед Дая </text> 228 | </query> 229 | <query> 230 | <number>239-AH</number> 231 | <text>Психическ здоров подростк </text> 232 | </query> 233 | <query> 234 | <number>241-AH</number> 235 | <text>Нов политическ парт </text> 236 | </query> 237 | <query> 238 | <number>242-AH</number> 239 | <text>Рекордн срок пребыван космос </text> 240 | </query> 241 | <query> 242 | <number>244-AH</number> 243 | <text>Футболист 1994 </text> 244 | </query> 245 | <query> 246 | <number>245-AH</number> 247 | <text>Кристофер Рив </text> 248 | </query> 249 | <query> 250 | <number>250-AH</number> 251 | <text>Бешенств люд </text> 252 | </query> 253 | 
<trecFormat>true</trecFormat> 254 | </parameters> 255 | -------------------------------------------------------------------------------- /systems/indri/query_LM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #usage: ./query_LM.sh language 4 | #language can be bg,de,es,fa,fi,fr,hu,it,nl,pt,ru,sv 5 | 6 | #set lemur bin directory 7 | lemurdir=/u/xiaojiex/ir/lemur/bin 8 | 9 | #set retrieval parameter file directory 10 | queryParaDir=/u/xiaojiex/ir/excute/clefmono 11 | 12 | #set retrieval result directory 13 | queryResultDir=$queryParaDir/baseline 14 | 15 | date 16 | echo "retrieval started" 17 | $lemurdir/IndriRunQuery $queryParaDir/queryParaLMSP_$1 > $queryResultDir/result_LMSP_file_$1 18 | echo "retrieval finished" 19 | date -------------------------------------------------------------------------------- /systems/lucene/clef.sh: -------------------------------------------------------------------------------- 1 | source ../common.sh 2 | 3 | if [ -z "$CLEF_LOCATION" ]; then 4 | echo "The location of the CLEF Test Collections should be specified"; 5 | else 6 | 7 | if [[ ! -f clef/target/lucene-clef-1.0-jar-with-dependencies.jar ]]; then 8 | echo "Compiling lucene-clef project..." 9 | cd clef 10 | mvn clean compile assembly:single 11 | cd .. 
12 | fi 13 | 14 | cd ../../ 15 | 16 | ROOT_PATH=$(pwd); 17 | 18 | mkdir -p $ROOT_PATH/runs 19 | 20 | SYSTEM_PATH=$ROOT_PATH/systems/lucene; 21 | 22 | cd $SYSTEM_PATH 23 | 24 | while read line;do 25 | # load indexing and retrieval options from the clef_runs file 26 | lang=$(echo "$line" | cut -d$'\t' -f1); 27 | use_stemmer=$(echo "$line" | cut -d$'\t' -f2); 28 | use_stoplist=$(echo "$line" | cut -d$'\t' -f3); 29 | model=$(echo "$line" | cut -d$'\t' -f4); 30 | 31 | sh $SYSTEM_PATH/clef_experiments.sh -l $lang -cp $CLEF_LOCATION -stm $use_stemmer -sl $use_stoplist -r $model 32 | 33 | done < $SYSTEM_PATH/clef_runs 34 | 35 | fi 36 | 37 | -------------------------------------------------------------------------------- /systems/lucene/clef/README.md: -------------------------------------------------------------------------------- 1 | # Apache Lucene - CLEF 2 | 3 | *lucene-clef* provides indexing and retrieval functionalities for the CLEF Test Collections through the 4 | [Apache Lucene](https://lucene.apache.org/core/) (version [5.2.1](https://lucene.apache.org/core/5_2_1/index.html)) 5 | library. 6 | 7 | ### Experiments on CLEF Test Collections 8 | 9 | The experiments can be replicated by the script [clef_experiments.sh](../clef_experiments.sh). 10 | An example of usage of the script is the following: 11 | 12 | `./clef_experiments.sh -l it -cp /media/CLEF/corpora -stm y -sl y -r BM25` 13 | 14 | where the meaning of the options is: 15 | - *-l* the language of the test collection (e.g. *it*) 16 | - *-cp* the path to the directory where the document corpora are stored (e.g. */media/CLEF/corpora*) 17 | - *-stm* enable (-stm y) or disable (-stm n) the stemmer 18 | - *-sl* enable (-sl y) or disable (-sl n) the use of the stoplist 19 | - *-r* the ranking model (e.g. 
BM25) 20 | 21 | The current version of *lucene-clef* supports the following models: 22 | - BM25 23 | 24 | The script [clef.sh](../clef.sh) iterates over the diverse set of options stored in the [clef_runs](../clef_runs) file 25 | (one combination of options per line) and calls the [clef_experiments.sh](../clef_experiments.sh) using each option set. 26 | The lines in the [clef_runs](../clef_runs) have the following format: 27 | 28 | `it y y BM25` 29 | 30 | where the options are separated by tabs; the first option refers to the language, the second to the stemmer, the third 31 | to the stoplist usage and the last one to the model. The last line of the [clef_runs](../clef_runs) should be empty. -------------------------------------------------------------------------------- /systems/lucene/clef/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" 3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 5 | <modelVersion>4.0.0</modelVersion> 6 | 7 | <groupId>it.unipd.dei.ims.lucene</groupId> 8 | <artifactId>lucene-clef</artifactId> 9 | <version>1.0</version> 10 | 11 | <properties> 12 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 13 | <maven.compiler.source>1.7</maven.compiler.source> 14 | <maven.compiler.target>1.7</maven.compiler.target> 15 | <org.apache.lucene-version>5.2.1</org.apache.lucene-version> 16 | <slf4j.version>1.6.4</slf4j.version> 17 | <logback.version>1.0.9</logback.version> 18 | </properties> 19 | 20 | <dependencies> 21 | 22 | <!-- Logging --> 23 | <dependency> 24 | <groupId>org.slf4j</groupId> 25 | <artifactId>slf4j-api</artifactId> 26 | <version>${slf4j.version}</version> 27 | </dependency> 28 | 29 | <dependency> 30 | <groupId>ch.qos.logback</groupId> 31 | 
<artifactId>logback-classic</artifactId> 32 | <version>${logback.version}</version> 33 | </dependency> 34 | 35 | <dependency> 36 | <groupId>ch.qos.logback</groupId> 37 | <artifactId>logback-core</artifactId> 38 | <version>${logback.version}</version> 39 | </dependency> 40 | 41 | <!-- Test --> 42 | <dependency> 43 | <groupId>junit</groupId> 44 | <artifactId>junit</artifactId> 45 | <version>4.12</version> 46 | </dependency> 47 | 48 | 49 | <!-- LUCENE DEPENDENCIES --> 50 | <dependency> 51 | <groupId>org.apache.lucene</groupId> 52 | <artifactId>lucene-core</artifactId> 53 | <version>${org.apache.lucene-version}</version> 54 | </dependency> 55 | 56 | <dependency> 57 | <groupId>org.apache.lucene</groupId> 58 | <artifactId>lucene-analyzers-common</artifactId> 59 | <version>${org.apache.lucene-version}</version> 60 | </dependency> 61 | 62 | <dependency> 63 | <groupId>org.apache.lucene</groupId> 64 | <artifactId>lucene-benchmark</artifactId> 65 | <version>${org.apache.lucene-version}</version> 66 | </dependency> 67 | 68 | </dependencies> 69 | 70 | <build> 71 | <plugins> 72 | <plugin> 73 | <artifactId>maven-compiler-plugin</artifactId> 74 | <configuration> 75 | <source>${maven.compiler.source}</source> 76 | <target>${maven.compiler.target}</target> 77 | <encoding>${project.build.sourceEncoding}</encoding> 78 | </configuration> 79 | </plugin> 80 | <plugin> 81 | <groupId>org.apache.maven.plugins</groupId> 82 | <artifactId>maven-resources-plugin</artifactId> 83 | <configuration> 84 | <encoding>${project.build.sourceEncoding}</encoding> 85 | </configuration> 86 | </plugin> 87 | <plugin> 88 | <artifactId>maven-assembly-plugin</artifactId> 89 | <configuration> 90 | <archive> 91 | <manifest> 92 | <mainClass>it.unipd.dei.ims.lucene.clef.App</mainClass> 93 | </manifest> 94 | </archive> 95 | <descriptorRefs> 96 | <descriptorRef>jar-with-dependencies</descriptorRef> 97 | </descriptorRefs> 98 | </configuration> 99 | <executions> 100 | <execution> 101 | <id>make-assembly</id> <!-- this 
is used for inheritance merges --> 102 | <phase>package</phase> <!-- bind to the packaging phase --> 103 | <goals> 104 | <goal>single</goal> 105 | </goals> 106 | </execution> 107 | </executions> 108 | </plugin> 109 | </plugins> 110 | </build> 111 | 112 | 113 | </project> -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/AnalyzerFactory.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import org.apache.lucene.analysis.Analyzer; 21 | import org.apache.lucene.analysis.bg.BulgarianAnalyzer; 22 | import org.apache.lucene.analysis.de.GermanAnalyzer; 23 | import org.apache.lucene.analysis.es.SpanishAnalyzer; 24 | import org.apache.lucene.analysis.fa.PersianAnalyzer; 25 | import org.apache.lucene.analysis.fi.FinnishAnalyzer; 26 | import org.apache.lucene.analysis.fr.FrenchAnalyzer; 27 | import org.apache.lucene.analysis.hu.HungarianAnalyzer; 28 | import org.apache.lucene.analysis.it.ItalianAnalyzer; 29 | import org.apache.lucene.analysis.nl.DutchAnalyzer; 30 | import org.apache.lucene.analysis.pt.PortugueseAnalyzer; 31 | import org.apache.lucene.analysis.ru.RussianAnalyzer; 32 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 33 | import org.apache.lucene.analysis.sv.SwedishAnalyzer; 34 | import org.apache.lucene.analysis.util.CharArraySet; 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | import java.io.File; 39 | import java.io.FileNotFoundException; 40 | import java.util.Scanner; 41 | 42 | /** 43 | * Factory for {@link org.apache.lucene.analysis.Analyzer}s and stopsets. 
44 | */ 45 | public class AnalyzerFactory { 46 | 47 | static Logger logger = LoggerFactory.getLogger(AnalyzerFactory.class); 48 | 49 | public static CharArraySet createStopset( 50 | String language, 51 | String stopsetType, 52 | String stopsetPath 53 | ) throws Exception { 54 | 55 | CharArraySet stopset = CharArraySet.EMPTY_SET; 56 | 57 | if (stopsetType.equalsIgnoreCase("CUSTOM")){ 58 | 59 | try { 60 | File f = new File(stopsetPath); 61 | stopset = new CharArraySet(0,true); 62 | Scanner sc = new Scanner(f); 63 | logger.debug("STOPLIST:"); 64 | while (sc.hasNextLine()) { 65 | String stopword = sc.nextLine().trim(); 66 | logger.debug("=> "+stopword); 67 | stopset.add(stopword); 68 | } 69 | logger.debug(""); 70 | sc.close(); 71 | 72 | } catch (FileNotFoundException e) { 73 | e.printStackTrace(); 74 | throw new Exception("FileNotFoundException when loading stopset"); 75 | } 76 | 77 | } else if (stopsetType.equalsIgnoreCase("DEFAULT")){ 78 | 79 | switch (language) { 80 | case "bg": 81 | stopset = BulgarianAnalyzer.getDefaultStopSet(); 82 | break; 83 | case "de": 84 | stopset = GermanAnalyzer.getDefaultStopSet(); 85 | break; 86 | case "es": 87 | stopset = SpanishAnalyzer.getDefaultStopSet(); 88 | break; 89 | case "fa": 90 | stopset = PersianAnalyzer.getDefaultStopSet(); 91 | break; 92 | case "fi": 93 | stopset = FinnishAnalyzer.getDefaultStopSet(); 94 | break; 95 | case "fr": 96 | stopset = FrenchAnalyzer.getDefaultStopSet(); 97 | break; 98 | case "hu": 99 | stopset = HungarianAnalyzer.getDefaultStopSet(); 100 | break; 101 | case "it": 102 | stopset = ItalianAnalyzer.getDefaultStopSet(); 103 | break; 104 | case "nl": 105 | stopset = DutchAnalyzer.getDefaultStopSet(); 106 | break; 107 | case "pt": 108 | stopset = PortugueseAnalyzer.getDefaultStopSet(); 109 | break; 110 | case "ru": 111 | stopset = RussianAnalyzer.getDefaultStopSet(); 112 | break; 113 | case "sv": 114 | stopset = SwedishAnalyzer.getDefaultStopSet(); 115 | break; 116 | default: 117 | throw new 
UnsupportedOperationException("Language not supported yet"); 118 | } 119 | 120 | } 121 | 122 | return stopset; 123 | } 124 | 125 | 126 | 127 | public static Analyzer createAnalyzer( 128 | String language, 129 | String stemmer, 130 | CharArraySet stopset 131 | ) { 132 | 133 | Analyzer analyzer; 134 | 135 | if (stemmer.equalsIgnoreCase("NONE")){ 136 | 137 | analyzer = new StandardAnalyzer(stopset); 138 | 139 | } else { // otherwise use language-specific analyzer 140 | 141 | switch (language) { 142 | case "bg": 143 | analyzer = new BulgarianAnalyzer(stopset); 144 | break; 145 | case "de": 146 | analyzer = new GermanAnalyzer(stopset); 147 | break; 148 | case "es": 149 | analyzer = new SpanishAnalyzer(stopset); 150 | break; 151 | case "fa": 152 | analyzer = new PersianAnalyzer(stopset); 153 | break; 154 | case "fi": 155 | analyzer = new FinnishAnalyzer(stopset); 156 | break; 157 | case "fr": 158 | analyzer = new FrenchAnalyzer(stopset); 159 | break; 160 | case "hu": 161 | analyzer = new HungarianAnalyzer(stopset); 162 | break; 163 | case "it": 164 | analyzer = new ItalianAnalyzer(stopset); 165 | break; 166 | case "nl": 167 | analyzer = new DutchAnalyzer(stopset); 168 | break; 169 | case "pt": 170 | analyzer = new PortugueseAnalyzer(stopset); 171 | break; 172 | case "ru": 173 | analyzer = new RussianAnalyzer(stopset); 174 | break; 175 | case "sv": 176 | analyzer = new SwedishAnalyzer(stopset); 177 | break; 178 | default: 179 | throw new UnsupportedOperationException("Language not supported yet"); 180 | } 181 | 182 | } 183 | 184 | return analyzer; 185 | 186 | } 187 | 188 | 189 | } 190 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/App.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license 
agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval; 21 | import it.unipd.dei.ims.lucene.clef.applications.BuildIndex; 22 | 23 | /** 24 | * Functionalities for CLEF Test collections indexing and batch retrieval. 25 | */ 26 | public class App { 27 | 28 | public static void main(String [] args){ 29 | 30 | if (args.length==1){ 31 | 32 | String option = args[0].toLowerCase(); 33 | switch (option){ 34 | case "-i" : 35 | BuildIndex.main(args); 36 | break; 37 | case "-r" : 38 | BatchRetrieval.main(args); 39 | break; 40 | default: 41 | System.out.println("Supported options:"); 42 | printHelp(); 43 | } 44 | 45 | } else { 46 | 47 | System.out.println("One of the following option should be used:"); 48 | printHelp(); 49 | 50 | } 51 | 52 | } 53 | 54 | private static void printHelp(){ 55 | 56 | System.out.println("-h for this help"); 57 | System.out.println("-i for indexing"); 58 | System.out.println("-r for batch retrieval"); 59 | } 60 | 61 | 62 | 63 | } 64 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/parser/ClefDocParser.java: -------------------------------------------------------------------------------- 1 | package 
it.unipd.dei.ims.lucene.clef.parser; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.benchmark.byTask.feeds.DocData; 21 | import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource; 22 | import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser; 23 | 24 | import java.io.IOException; 25 | 26 | /** 27 | * Parser for the CLEF test collections. 
28 | */ 29 | public class ClefDocParser extends TrecDocParser { 30 | 31 | 32 | @Override 33 | public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 34 | StringBuilder docBuf, ParsePathType pathType) throws IOException { 35 | int mark = 0; // that much is skipped 36 | docData.clear(); 37 | docData.setName(name); 38 | docData.setBody(stripTags(docBuf, mark).toString()); 39 | return docData; 40 | } 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/parser/ClefQQParser.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef.parser; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import it.unipd.dei.ims.lucene.clef.AnalyzerFactory; 21 | import org.apache.lucene.analysis.Analyzer; 22 | import org.apache.lucene.analysis.util.CharArraySet; 23 | import org.apache.lucene.benchmark.quality.QualityQuery; 24 | import org.apache.lucene.benchmark.quality.QualityQueryParser; 25 | import org.apache.lucene.queryparser.classic.ParseException; 26 | import org.apache.lucene.queryparser.classic.QueryParser; 27 | import org.apache.lucene.queryparser.classic.QueryParserBase; 28 | import org.apache.lucene.search.BooleanClause; 29 | import org.apache.lucene.search.BooleanQuery; 30 | import org.apache.lucene.search.Query; 31 | 32 | /** 33 | * Parser for {@link QualityQuery}. 34 | */ 35 | public class ClefQQParser implements QualityQueryParser { 36 | 37 | private String fieldToSearch; 38 | private String qqFields[]; 39 | private String language; 40 | private String stemmer; 41 | private CharArraySet stopset; 42 | 43 | ThreadLocal<QueryParser> queryParser = new ThreadLocal<>(); 44 | 45 | public ClefQQParser( 46 | String qqFields[], 47 | String fieldToSearch, 48 | String language, 49 | String stemmer, 50 | CharArraySet stopset) { 51 | this.qqFields = qqFields; 52 | this.fieldToSearch = fieldToSearch; 53 | this.language=language; 54 | this.stemmer=stemmer; 55 | this.stopset=stopset; 56 | } 57 | 58 | public ClefQQParser( 59 | String qqField, 60 | String fieldToSearch, 61 | String language, 62 | String stemmer, 63 | CharArraySet stopset 64 | ) { 65 | this(new String[] { qqField }, fieldToSearch, language, stemmer, stopset); 66 | } 67 | 68 | @Override 69 | public Query parse(QualityQuery qq) throws ParseException { 70 | QueryParser qp = queryParser.get(); 71 | if (qp==null) { 72 | Analyzer analyzer = AnalyzerFactory.createAnalyzer( 73 | language, 74 | stemmer, 75 | stopset 76 | ); 77 | qp = new QueryParser(fieldToSearch, analyzer); 78 | queryParser.set(qp); 79 | } 80 | BooleanQuery bq = new BooleanQuery(); 81 | for (int i = 0; i < qqFields.length; 
i++) 82 | bq.add(qp.parse(QueryParserBase.escape(qq.getValue(qqFields[i]))), BooleanClause.Occur.SHOULD); 83 | 84 | return bq; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | 4 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> 5 | <layout class="ch.qos.logback.classic.PatternLayout"> 6 | <Pattern> 7 | %d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n 8 | </Pattern> 9 | </layout> 10 | </appender> 11 | 12 | <logger name="it.unipd.dei.ims.lucene.clef" level="info" 13 | additivity="false"> 14 | <appender-ref ref="STDOUT" /> 15 | </logger> 16 | 17 | <root level="error"> 18 | <appender-ref ref="STDOUT" /> 19 | </root> 20 | 21 | </configuration> -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/resources/lucene-clef.properties: -------------------------------------------------------------------------------- 1 | #################### 2 | ### CLEF CORPORA ### 3 | #################### 4 | 5 | ## BULGARIAN ## 6 | bg.corpora=SEGA2002;STANDART2002 7 | SEGA2002.encoding=UTF-8 8 | STANDART2002.encoding=UTF-8 9 | bg.analyzerClass=org.apache.lucene.analysis.bg.BulgarianAnalyzer 10 | bg.corpus.size=69195 11 | 12 | 13 | ## GERMAN ## 14 | de.corpora=FRANKFURTER1994;SDA1994;SPIEGEL1994;SPIEGEL1995 15 | FRANKFURTER1994.encoding=ISO-8859-1 16 | SDA1994.encoding=ISO-8859-1 17 | SPIEGEL1994.encoding=ISO-8859-1 18 | SPIEGEL1995.encoding=ISO-8859-1 19 | de.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer 20 | de.corpus.size=225371 21 | 22 | 23 | ## SPANISH ## 24 | es.corpora=EFE1994;EFE1995 25 | EFE1994.encoding=ISO-8859-1 26 | EFE1995.encoding=ISO-8859-1 27 | es.analyzerClass=org.apache.lucene.analysis.es.SpanishAnalyzer 28 | 
es.corpus.size=454045 29 | 30 | 31 | ## PERSIAN ## 32 | fa.corpora=HAMSHAHRI 33 | HAMSHAHRI.encoding=UTF-8 34 | fa.analyzerClass=org.apache.lucene.analysis.fa.PersianAnalyzer 35 | fa.corpus.size=166774 36 | 37 | 38 | ## FINNISH ## 39 | fi.corpora=AAMULEHTI1994;AAMULEHTI1995 40 | AAMULEHTI1994.encoding=ISO-8859-1 41 | AAMULEHTI1995.encoding=ISO-8859-1 42 | fi.analyzerClass=org.apache.lucene.analysis.fi.FinnishAnalyzer 43 | fi.corpus.size=55344 44 | 45 | 46 | ## FRENCH ## 47 | fr.corpora=LEMONDE1994;LEMONDE1995;ATS1994;ATS1995 48 | LEMONDE1994.encoding=ISO-8859-1 49 | LEMONDE1995.encoding=ISO-8859-1 50 | ATS1994.encoding=ISO-8859-1 51 | ATS1995.encoding=ISO-8859-1 52 | fr.analyzerClass=org.apache.lucene.analysis.fr.FrenchAnalyzer 53 | fr.corpus.size=177452 54 | 55 | 56 | ## HUNGARIAN ## 57 | hu.corpora=MAGYAR2002 58 | MAGYAR2002.encoding=UTF-8 59 | hu.analyzerClass=org.apache.lucene.analysis.hu.HungarianAnalyzer 60 | hu.corpus.size=49530 61 | 62 | 63 | ## ITALIAN ## 64 | it.corpora=AGZ1994;AGZ1995;LASTAMPA1994 65 | AGZ1994.encoding=ISO-8859-1 66 | AGZ1995.encoding=ISO-8859-1 67 | LASTAMPA1994.encoding=US-ASCII 68 | it.analyzerClass=org.apache.lucene.analysis.it.ItalianAnalyzer 69 | it.corpus.size=157558 70 | 71 | 72 | ## DUTCH ## 73 | nl.corpora=ALGEMEEN1994;ALGEMEEN1995;NRC1994;NRC1995 74 | ALGEMEEN1994.encoding=ISO-8859-1 75 | ALGEMEEN1995.encoding=ISO-8859-1 76 | NRC1994.encoding=ISO-8859-1 77 | NRC1995.encoding=ISO-8859-1 78 | nl.analyzerClass=org.apache.lucene.analysis.nl.DutchAnalyzer 79 | nl.corpus.size=190604 80 | 81 | 82 | ## PORTUGUESE ## 83 | pt.corpora=FOLHA1994;FOLHA1995;PUBLICO1994;PUBLICO1995 84 | FOLHA1994.encoding=ISO-8859-1 85 | FOLHA1995.encoding=ISO-8859-1 86 | PUBLICO1994.encoding=ISO-8859-1 87 | PUBLICO1995.encoding=ISO-8859-1 88 | pt.analyzerClass=org.apache.lucene.analysis.pt.PortugueseAnalyzer 89 | pt.corpus.size=210734 90 | 91 | 92 | ## RUSSIAN ## 93 | ru.corpora=IZVESTIA1995 94 | IZVESTIA1995.encoding=UTF-8 95 | 
ru.analyzerClass=org.apache.lucene.analysis.ru.RussianAnalyzer 96 | ru.corpus.size=16716 97 | 98 | 99 | ## SWEDISH ## 100 | sv.corpora=TT1994;TT1995 101 | TT1994.encoding=UTF-8 102 | TT1995.encoding=UTF-8 103 | sv.analyzerClass=org.apache.lucene.analysis.sv.SwedishAnalyzer 104 | sv.corpus.size=142819 -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/java/it/unipd/dei/ims/lucene/clef/parser/ClefQQParserTest.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef.parser; 2 | 3 | import it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval; 4 | import org.apache.lucene.benchmark.quality.QualityQuery; 5 | import org.junit.Test; 6 | 7 | import java.io.File; 8 | 9 | /** 10 | * Test of the parser for the CLEF Topics. 11 | * 12 | * @author Emanuele Di Buccio 13 | */ 14 | public class ClefQQParserTest { 15 | 16 | public static String [] langs = { 17 | "bg", 18 | "de", 19 | "es", 20 | "fa", 21 | "fi", 22 | "fr", 23 | "hu", 24 | "it", 25 | "nl", 26 | "pt", 27 | "ru", 28 | "sv" 29 | }; 30 | 31 | public static String [] topicFields = { 32 | "title", 33 | "description" 34 | }; 35 | 36 | @Test 37 | public void testClefTopicParser(){ 38 | 39 | for (String lang : langs){ 40 | 41 | ClassLoader classLoader = getClass().getClassLoader(); 42 | File topicFile = new File(classLoader.getResource("topics/"+lang+"_topics.xml").getFile()); 43 | 44 | try { 45 | QualityQuery[] qqs = BatchRetrieval.getQualityQueries(topicFile.getAbsolutePath(), topicFields); 46 | System.out.println("LANGUAGE: "+lang); 47 | for (QualityQuery qq : qqs){ 48 | System.out.println(qq.getQueryID()); 49 | } 50 | 51 | } catch (Exception e) { 52 | e.printStackTrace(); 53 | } 54 | 55 | } 56 | 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/bg_topics.xml: 
-------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <topics> 3 | <topic> 4 | <identifier>251-AH</identifier> 5 | <title>Алтернативна медицина 6 | Намерете документи, които дискутират някакъв вид алтернативно или природно медицинско лечение, включително специфичните терапии, като акупунктура, хомеопатия, хиропраксис и др. 7 | Подходящите документи ще осигурят обща или специфична информация относно употребата на естествени или алтернативни медицински лечения и практики. 8 | 9 | 10 | 252-AH 11 | Пенсионните схеми в Европа 12 | Намерете документи, които дават информация за съвременните пенсионни системи и привилегии в европейска държава. 13 | Подходящите документи ще съдържат информация за съвременни пенсионни схеми и привилегии в отделните европейски страни. Интересуващите ни документи включват минималната и максималната възраст за пенсиониране и начина, по който пенсионният доход се изчислява. Документи, които дискутират бъдещи реформи в пенсионното дело, не са подходящи. 14 | 15 | 16 | 253-AH 17 | Държави, в които е разрешено смъртното наказание 18 | В кои държави или щати смъртното наказание се практикува или поне е разрешено от конституцията? 19 | Явно е, че документи, които експлицитно декларират, че конституцията на даден щат разрешава смъртно наказание, са подходящи. Документите, които споменават конкретни смъртни присъди също са подходящи, но при условие, че споменават и щата или страната експлицитно. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/de_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pestizide in Babykost 6 | Berichte über Pestizide in Babynahrung sind gesucht. 7 | Die Dokumente informieren über die Entdeckung von Pestiziden in Babynahrung. 
Sie berichten über die verschiedenen Marken, Supermärkte und Firmen, die Babykost mit Pestiziden angeboten haben. Sie berichten auch über Maßnahmen gegen die Verunreinigung von Babynahrung mit Pestiziden. 8 | 9 | 10 | 42-AH 11 | UN/US-Invasion Haitis 12 | UNO/USA entsenden Truppen nach Haiti. 13 | Die Dokumente berichten sowohl über die Diskussion über die Entscheidung zur Entsendung von Blauhelmtruppen der USA als auch über die Invasion selbst. Sie informieren auch über die unmittelbaren Folgen des Einmarschs. 14 | 15 | 16 | 43-AH 17 | El Niño und das Wetter 18 | Suche Berichte, die das Phänomen El Niño und seine Auswirkungen auf das Weltwetter (einschließlich seiner Folgen für Temperatur, Luftdruck, Niederschlag usw.) erklären. 19 | Relevante Dokumente informieren über die Auswirkungen von El Niño. Im Hinblick auf dieses Phänomen ist die Wechselwirkung von Weltmeeren und Erdatmosphäre von Interesse. Besonders wichtig ist die Rolle von El Niño im Südpazifik aufgrund dessen Einfluss auf das Weltklima. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/es_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pesticidas en alimentos para bebes 6 | Encontrar noticias sobre pesticidas en alimentos para bebes. 7 | Los documentos relevantes proporcionan información sobre el descubrimiento de pesticidas en alimentos para bebes. Se informa sobre diferentes marcas, supermercados y compañías que ofrecieron alimentos para bebes que contenian pesticidas. Se discuten también medidas contra la contaminación de alimentos para bebes con pesticidas. 8 | 9 | 10 | 42-AH 11 | Naciones Unidas y Estados Unidos invaden Haití 12 | Encontrar documentos sobre la invasión de Haití por los soldados de la ONU y de los Estados Unidos. 
13 | Los documentos comentan tanto la discusión sobre la decisión de la ONU de enviar las tropas americanas a Haití, como la invasión misma. Se habla también de sus consecuencias directas. 14 | 15 | 16 | 43-AH 17 | "El Niño" y el tiempo 18 | Encontrar noticias que expliquen el fenómeno de "El Niño" y su repercusión en el clima del planeta (incluidos los efectos que tiene sobre la temperatura, presión atmosférica, precipitaciones, etc.). 19 | Los documentos relevantes proporcionarán información sobre los efectos de "El Niño". Las interacciones entre los océanos y la atmósfera de la Tierra son interesantes en relación con este fenómeno. "El Niño" es especialmente importante en el Pacífico Sur debido a su influencia sobre el clima mundial. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/fa_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 551-AH 5 | تنيس جام ويمبلدون 6 | نام برندگان جام 7 | فاتحان مرد و زن جام ويمبلدون چه كساني بوده اند 8 | 9 | 10 | 552-AH 11 | بازار بورس تهران 12 | نمونه هايي از شاخص هاي بازار بورس تهران 13 | نوسانات شاخص هاي بازار بورس، سهام صدرنشين در اين بازار، مسايل و چالشهاي احتمالي كه بازار با آن مواجه بوده است 14 | 15 | 16 | 553-AH 17 | جام جهاني 2002 18 | برنده جام جهاني فوتبال در سال 2002 19 | آيا ايران به اين جام راه يافت؟ چه رتبه اي در آسيا و جام به دست آورد؟ تيمهاي راه يافته به مرحله نهايي مربوط به كدام كشورها بودند؟ 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/fi_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 92-AH 5 | Irakin vastaiset pakotteet 6 | Mihin toimenpiteisiin Irak on ryhtynyt poistaakseen YK:n talouspakotteet? Pakotteet astuivat voimaan seurauksena Irakin hyökkäyksestä Kuwaitiin vuonna 1990. 
7 | Dokumenttien tulee sisältää tietoa toimista, joilla Irak on yrittänyt poistaa pakotteet. Ainoastaan pakotteita koskevat selitykset tai retoriikka sanktioita vastaan eivät ole relevantteja. Kuwaitiin kohdistunutta hyökkäystä pahoittelevat viralliset anteeksipyynnöt Irakin taholta ovat myös relevantteja. 8 | 9 | 10 | 94-AH 11 | Solzhenitsynin paluu. 12 | Etsi dokumentteja jotka kertovat kirjallisuuden Nobel-palkinnon voittajan Solzhenitsynin paluusta Venäjälle. 13 | Relevanteissa dokumenteissa kerrotaan Solzhenitsynin Venäjälle paluun syistä ja ajankohdasta. Dokumenteissa voidaan myös keskustella syistä hänen maahanmuuttoonsa USA:han. 14 | 15 | 16 | 95-AH 17 | Palestiinan konflikti 18 | Etsi artikkeleita, jotka käsittelevät aseellisia konflikteja Palestiinalaisalueilla ja joissa osapuolena on siviiliväestöä. 19 | Relevanteissa dokumenteissa on tietoa äskeisistä tapahtumista israelilais-paletiinalaisessa konfliktissa. Raportit, joissa kerrotaan itsemurhaiskuista otetaan lukuun. 20 | 21 | 22 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/fr_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 251-AH 5 | Médecines douces 6 | Trouver des documents parlant de toutes sortes de traitements médicaux alternatifs ou naturels comprenant des thérapies spécifiques telles que l'acupuncture, l'homéopathie, la chiropractie, entre autres. 7 | Les documents pertinents devront fournir des informations générales ou spécifiques sur les traitements ou les pratiques des médecines dites douces, alternatives ou naturelles. 8 | 9 | 10 | 252-AH 11 | Régimes de retraite en Europe 12 | Trouver des documents donnant des informations sur les régimes et indemnités de retraite aujourd'hui dans n'importe quel pays européen. 13 | Les documents pertinents fourniront des informations sur les régimes et indemnités de retraite dans un pays européen donné. 
Les documents pertinents devront contenir l'âge requis (au minimum et au maximum) pour bénéficier de la retraite ainsi que le mode de calcul du montant de la pension. Les documents évoquant de futurs projets de réformes ne sont pas pertinents. 14 | 15 | 16 | 253-AH 17 | Pays appliquant la peine de mort 18 | Dans quels pays ou Etats la peine de mort est-elle toujours appliquée ou du moins autorisée par la Constitution ? 19 | Les documents pertinents doivent expliquer que la Constitution d'un Etat donné autorise la peine de mort. Les documents rapportant explicitement des condamnations à la peine capitale sont pertinents si l'Etat ou le pays concerné est clairement mentionné. 20 | 21 | 22 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/hu_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 251-AH 5 | Alternatív gyógyászat 6 | Keressünk cikkeket alternatív vagy természetes gyógymódokról ill. speciális terápiákról, mint például az akupunktúra, a homeopátia vagy a csontkovácsolás. 7 | A megfelelő cikkek a természetes vagy alternatív gyógyászati kezelésekről vagy praktikákról írnak általában vagy konkrétan. 8 | 9 | 10 | 252-AH 11 | Európai nyugdíjrendszerek 12 | Keressünk cikkeket, melyek valamely európai ország jelenlegi nyugdíjrendszeréről vagy nyugdíjasokra vonatkozó járulékairól szólnak. 13 | A megfelelő cikkek az egyes európai országok nyugdíjrendszeréről, illetve a nyugdíjaskorúakra vonatkozó járulékokról tartalmaznak információkat. Számunkra érdekes adat a nyugdíjba vonulás lehetséges alsó, illetve kötelező felső korhatára, valamint a nyugdíj mértékének kiszámítási módja is. A jövőbeni nyugdíjreformokra vonatkozó tervek nem érdekesek. 14 | 15 | 16 | 253-AH 17 | A halálbüntetés elterjedtsége 18 | Mely országokban vagy államokban alkalmazzák még a halálbüntetést ill. teszi lehetővé annak alkalmazását az alkotmány? 
19 | Nem csak azok a cikkek relevánsak, amelyekben tételesen szerepel, hogy az adott ország vagy állam alkotmánya lehetővé teszi a halálbüntetést, hanem azok is, amelyek olyan konkrét esetekről szólnak (az ország vagy állam nevének feltüntetésével), amelyekben halálbüntetést róttak ki. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/it_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pesticidi negli alimenti per bambini 6 | Trova documenti che parlano dei pesticidi negli alimenti per bambini. 7 | I documenti rilevanti forniscono informazioni sulla scoperta di pesticidi nei cibi per bambini. Riportano i diversi marchi, i supermercati e le ditte che hanno venduto alimenti per bambini con i pesticidi. Sono anche rilevanti i documenti che discutono le misure contro la contaminazione degli alimenti per bambini con i pesticidi. 8 | 9 | 10 | 42-AH 11 | Invasione ONU/USA di Haiti 12 | Reperisci documenti sull'invasione di Haiti da parte delle truppe ONU/USA. 13 | I documenti rilevanti contengono sia le discussioni relative alla decisione di spedire i caschi blu degli Stati Uniti d'America a Haiti, sia dell'invasione stessa. Sono rilevanti anche le conseguenze dirette dell'invasione. 14 | 15 | 16 | 44-AH 17 | Indurain vince il Tour 18 | Reazioni al quarto Tour de France vinto da Miguel Indurain. 19 | I documenti rilevanti commentano le reazioni alla quarta vittoria consecutiva di Miguel Indurain al Tour de France. Sono anche rilevanti i documenti che discutono dell'importanza di questo ciclista nel ciclismo mondiale dopo questa vittoria. 
20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/nl_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pesticide in babyvoeding 6 | Zoek naar documenten over pesticide in babyvoeding. 7 | Deze documenten geven informatie over ontdekkingen van pesticide in babyvoeding. Het gaat hierbij om producenten, merken en supermarkten die verontreinigde voeding hebben aangeboden. De informatie gaat ook over de maatregelen die tegen de verontreiniging van babyvoeding met pesticide zijn genomen. 8 | 9 | 10 | 42-AH 11 | UN/US invasie op Haïti 12 | UNO/USA sturen troepen naar Haïti. 13 | Deze documenten geven niet alleen informatie over de invasie maar ook over de discussies en beslissingen van de UN die ten grondslag lagen aan het sturen van Amerikaanse blauwhelmen naar Haïti. Ook wordt informatie gegeven over de directe gevolgen van de invasie. 14 | 15 | 16 | 43-AH 17 | El Niño en de voorspelling van het weer 18 | Zoek documenten die een verklaring geven voor het fenomeen El Niño en de weerslag daarvan op het weer op aarde (inclusief het effect op temperatuur, atmosferische druk, neerslag, enz.). 19 | Relevante documenten dienen informatie te bevatten over de gevolgen van het fenomeen El Niño. De wisselwerkingen tussen de oceanen en de aardatmosfeer die verband houden met het fenomeen El Niño komen eveneens in aanmerking. El Niño is vooral van belang in het zuiden van de Stille Oceaan omwille van de invloed die deze uitoefent op het klimaat op aarde. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/pt_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 251-AH 5 | Medicina alternativa 6 | Encontrar documentos sobre tratamentos que empreguem medicina natural ou alternativa. 
Aqui são incluídas terapias como a acupuntura, a hemopatia, a quiroprática, entre outras. 7 | Documentos relevantes devem fornecer informação, específica ou genérica, sobre o uso de tratamentos ou técnicas de medicina natural ou alternativa. 8 | 9 | 10 | 252-AH 11 | Sistemas de reforma e pensões na Europa 12 | Encontrar documentos sobre os esquemas de pensões e benefícios na reforma em qualquer país europeu. 13 | Documentos relevantes devem conter informação sobre os actuais esquemas de pensões em estados europeus individuais. Informação de interesse engloba as idades de reforma mínima e máxima, assim como a forma de calcular o valor das pensões de reforma. Planos de reformulação dos esquemas de reforma não são relevantes. 14 | 15 | 16 | 253-AH 17 | Países com pena de morte 18 | Em quais países ainda é usada, ou pelo menos constitucionalmente permitida, a pena de morte? 19 | Documentos que afirmem de forma explícita que a constituição de um dado estado permite a pena capital são obviamente relevantes. Documentos de relatos de sentenças de morte específicas também são relevantes caso o estado ou país em questão seja explicitamente mencionado. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/ru_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 143-AH 5 | Конференция по положению Женщин в Пекине 6 | Спорные позиции ряда делегаций поставили Всемирную Конференцию по положению Женщин в Пекине на гране провала. 7 | В релевантных документах должны обсуждаться любые из многочисленных проблем и разногласий, возникших в связи с Конференцией по положению Женщин в Пекине. Особый интерес представляют позиции представителей Ватикана, мусульманских общин и Китайской Коммунистической Партии. 
8 | 9 | 10 | 147-AH 11 | Нефтяные аварии и Птицы 12 | Найти документы, в которых описывается вред и урон, наносимый птицам в результате утечек нефти или загрязнения. 13 | Релевантны документы, в которых говорится об уроне, наносимом птицам в результате аварийных утечек нефти. Сообщения о вреде, наносимом выбросами трюмной воды или выбросами нефти не релевантны. 14 | 15 | 16 | 148-AH 17 | Разрушение озонового слоя 18 | Какие озоновые дыры не вызваны загрязнением воздуха? 19 | Не все повреждения озонового слоя вызваны загрязнением воздуха. В релевантных документах должно говориться об иных причинах появления дыр в озоновом слое. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/sv_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 91-AH 5 | Amnesty International i Latinamerika 6 | Rapporter från Amnesty International om mänskliga rättigheter i Latinamerika. 7 | Relevanta dokument bör ge läsarna information om Amnesty Internationals rapporter beträffande mänskliga rättigheter i Latinamerika eller om reaktioner på dessa rapporter. 8 | 9 | 10 | 92-AH 11 | FN-sanktioner mot Irak 12 | Vilka åtgärder har Irak vidtagit för att FN ska upphäva det ekonomiska embargo och de politiska sanktioner som landet belagts med efter invasionen av Kuwait 1990? 13 | Dokumenten måste innehålla olika åtgärder från Iraks sida för att försöka få sanktionerna upphävda. Enbart beskrivningar av sanktionerna eller retorik emot sanktionerna är inte relevanta. Beklaganden från officiellt irakiskt håll av invasionen av Kuwait är relevanta. 14 | 15 | 16 | 93-AH 17 | Eurofighter 18 | Sök efter dokument som rapporterar om projektet EFA eller projektet "Eurofighter". 19 | Relevanta dokument redogör för projektet EFA. Samarbetsländerna Tyskland, Storbritannien, Italien och Spanien har inlett ett "Eurofighter"-konsortium. 
Information om arbetsfördelning mellan de ingående försvarskoncernerna samt även kostnadsuppskattningar är av intresse. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef_experiments.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | source ../common.sh 4 | 5 | # This script allows experiments based on Apache Lucene and CLEF collections to be replicated. 6 | 7 | ## declare the langs array 8 | declare -a langs=("bg" "de" "es" "fa" "fi" "fr" "hu" "it" "nl" "pt" "ru" "sv"); 9 | 10 | ## declare the rank models array 11 | declare -a models=("BM25"); 12 | 13 | topicFields="title;description"; 14 | 15 | if [ "$1" == "-h" ]; then 16 | printf "Usage: `basename $0` \n\n" >&2 17 | printf "Input parameters: \n" >&2 18 | printf "'-l': the language expressed as ISO 639-1 (two-letter codes); e.g. nl for Dutch or it for Italian.\n" >&2 19 | echo "Valid languages for CLEF experiments are: ${langs[@]}"; 20 | printf "'-cp': the path where the document collections are stored. \n" >&2 21 | printf "'-stm': the stemmer specification; y for 'yes' or n for 'no'. \n" >&2 22 | printf "'-sl': the stopword list specification; y for 'yes' or n for 'no'. \n" >&2 23 | printf "'-r': the ranking model specification; e.g. BM25 for bm25. \n" >&2 24 | echo "Allowed models for CLEF experiments and Apache Lucene 5.2.1 are: ${models[@]}"; 25 | printf "'-v': verbose mode. 
\n" >&2 26 | exit 0 27 | else 28 | # check the input parameters 29 | if [ $# -lt 10 ]; then 30 | echo "You must specify all the input parameters: -l, -cp, -stm, -sl, -r" 1>&2 31 | exit 1 32 | else 33 | echo "Starting the execution."; 34 | if [ "$1" == "-l" ]; then 35 | lang=$2; 36 | tmp=false; 37 | for i in "${langs[@]}" 38 | do 39 | if [ "$i" == "$lang" ] ; then 40 | tmp=true; 41 | break; 42 | fi 43 | done 44 | 45 | if [ "$tmp" = false ]; then 46 | printf "first parameter must be the language -l expressed as ISO 639-1 (two-letter codes)\n"; 47 | echo "Valid languages for CLEF experiments are: ${langs[@]}"; 48 | exit 1 49 | fi 50 | 51 | else 52 | echo "The first parameter must be the language -l expressed as ISO 639-1 (two-letter codes)" 53 | exit 1 54 | fi 55 | 56 | if [ "$3" == "-cp" ]; then 57 | collection_path=$4; 58 | else 59 | echo "The second parameter must be the collection path -cp" 60 | exit 1 61 | fi 62 | 63 | if [ "$5" == "-stm" ]; then 64 | if [ "$6" == "y" ]; then 65 | stm=true; 66 | elif [ "$6" == "n" ]; then 67 | stm=false; 68 | else 69 | printf "The value %s is not valid for -stm \n" "$6" >&2 70 | printf "For the -stm parameter you must specify y for yes or n for no \n" >&2 71 | exit 1 72 | fi 73 | else 74 | echo "The third parameter must be stemmer specification -stm (y = yes; n = no)" 75 | exit 1 76 | fi 77 | 78 | if [ "$7" == "-sl" ]; then 79 | if [ "$8" == "y" ]; then 80 | sl=true; 81 | elif [ "$8" == "n" ]; then 82 | sl=false; 83 | else 84 | printf "The value %s is not valid for -sl \n" "$8" >&2 85 | printf "For the -sl parameter you must specify y for yes or n for no \n" >&2 86 | exit 1 87 | fi 88 | else 89 | echo "The fourth parameter must be stopword list specification -sl (y = yes; n = no)" 90 | exit 1 91 | fi 92 | 93 | if [ "$9" == "-r" ]; then 94 | rank_model=${10}; 95 | 96 | tmp=false; 97 | for i in "${models[@]}" 98 | do 99 | if [ "$i" == "$rank_model" ] ; then 100 | tmp=true; 101 | break; 102 | fi 103 | done 104 | 105 | if [ "$tmp" = 
false ]; then 106 | printf "The last parameter must be the ranking model.\n"; 107 | echo "Allowed models for CLEF experiments and Apache Lucene 5.2.1 are: ${models[@]}"; 108 | exit 1 109 | fi 110 | else 111 | echo "The fifth parameter must be the ranking model specification -r (e.g. BM25)" 112 | exit 1 113 | fi 114 | 115 | if [ "${11}" == "-v" ]; then 116 | verbose=true; 117 | else 118 | verbose=false; 119 | fi 120 | 121 | fi 122 | fi 123 | 124 | LUCENE_PATH=$(pwd); 125 | cd ../../ 126 | ROOT_PATH=$(pwd); 127 | cd $LUCENE_PATH 128 | 129 | ## path to the topics file 130 | topics=$ROOT_PATH/topics-and-qrels/CLEF/topics/"$lang"_topics.xml; 131 | 132 | ## path to the qrels file 133 | qrels=$ROOT_PATH/topics-and-qrels/CLEF/qrels/"$lang"_qrels.txt; 134 | 135 | ## create required folders 136 | 137 | mkdir -p $ROOT_PATH/results/CLEF/lucene/"$lang"; 138 | 139 | mkdir -p $ROOT_PATH/runs/CLEF/lucene/"$lang"; 140 | 141 | indexDir=$lang; 142 | 143 | if [ "$sl" = true ]; then 144 | stopsetType="CUSTOM"; 145 | stoplist=$ROOT_PATH/resources/CLEF/"$lang"_sl.txt; 146 | stoplistOpt="-Dstopset.type=$stopsetType -Dstopset.path=$stoplist"; 147 | indexDir="$indexDir"_Stopword; 148 | else 149 | stoplistOpt="-Dstopset.type=\"EMPTY\""; 150 | fi 151 | 152 | if [ "$stm" = true ]; then 153 | indexDir="$indexDir"_Stemmer; 154 | stemmer="DEFAULT"; 155 | else 156 | stemmer="NONE"; 157 | fi 158 | 159 | runTag="$indexDir"_"$rank_model"; 160 | 161 | runFile=$ROOT_PATH/runs/CLEF/lucene/"$lang"/"$runTag".txt; 162 | 163 | indexDir="${LUCENE_PATH}/indexes/$indexDir"; 164 | 165 | OPTIONS="-Dindex.path=$indexDir -Dcorpora.path=$collection_path -Dstemmer=$stemmer -Dlanguage=$lang $stoplistOpt"; 166 | 167 | 168 | ## do the index, if it does not exist 169 | if [ -d "$indexDir" ]; then 170 | if [ "$verbose" = true ]; then 171 | printf "The index already exists in %s \n" "$indexDir" >&2 172 | fi 173 | else 174 | java -jar $OPTIONS clef/target/lucene-clef-1.0-jar-with-dependencies.jar -i 175 | fi 176 | 177 
| 178 | if [ "$verbose" = true ]; then 179 | printf "Performing the retrieval with the ranking model %s \n" "$rank_model" >&2 180 | fi 181 | 182 | 183 | ## do the retrieval 184 | 185 | OPTIONS="$OPTIONS -Drun.model=$rank_model -Drun.path=$runFile -Drun.tag=$runTag -Dtopics.path=$topics -Dtopics.fields=$topicFields"; 186 | 187 | java -jar $OPTIONS clef/target/lucene-clef-1.0-jar-with-dependencies.jar -r 188 | 189 | ${TREC_EVAL} -q -c -M1000 $qrels $runFile>${ROOT_PATH}/results/CLEF/lucene/"$lang"/"$runTag".txt 190 | 191 | -------------------------------------------------------------------------------- /systems/lucene/clef_runs: -------------------------------------------------------------------------------- 1 | de y y BM25 2 | es y y BM25 3 | fa y y BM25 4 | fr y y BM25 5 | hu y y BM25 6 | it y y BM25 7 | nl y y BM25 8 | ru y y BM25 9 | sv y y BM25 10 | -------------------------------------------------------------------------------- /systems/lucene/dotgov2.sh: -------------------------------------------------------------------------------- 1 | source ../common.sh 2 | echo "Compiling ingester project..." 3 | cd ingester 4 | mvn clean compile assembly:single 5 | cd .. 6 | 7 | maxmemory="-Xmx15G" 8 | 9 | echo "Starting indexing..." 10 | #rm -rf gov2.lucene 11 | 12 | # Counts index 13 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.TrecIngester -dataDir $GOV2_LOCATION -indexPath gov2.lucene.cnt -threadCount 32 -docCountLimit -1 14 | 15 | #Force merge 16 | echo "Force merging..." 
17 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.ForceMerge gov2.lucene.cnt/index 18 | 19 | # Positional index 20 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.TrecIngester -dataDir $GOV2_LOCATION -indexPath gov2.lucene.pos -positions -threadCount 32 -docCountLimit -1 21 | 22 | echo "Force merging..." 23 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.ForceMerge gov2.lucene.pos/index 24 | 25 | 26 | for index in "cnt" "pos" 27 | do 28 | echo "Evaluation index ${index}" 29 | for queries in "701-750" "751-800" "801-850" 30 | do 31 | query_file=$TOPICS_QRELS/topics.${queries}.txt 32 | qrel_file=$TOPICS_QRELS/qrels.${queries}.txt 33 | run_file=submission_${queries}_${index}.txt 34 | stat_file=submission_${queries}_${index}.log 35 | eval_file=submission_${queries}_${index}.eval 36 | 37 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.TrecDriver ${query_file} ${qrel_file} ${run_file} gov2.lucene.${index}/index T > ${stat_file} 38 | 39 | ${TREC_EVAL} ${qrel_file} ${run_file} > ${eval_file} 40 | done 41 | done 42 | -------------------------------------------------------------------------------- 
/systems/lucene/ingester/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | ingester 4 | ingester 5 | 0.0.1-SNAPSHOT 6 | Lucene/Solr Benchmarks 7 | 8 | 9 | 10 | 11 | org.apache.maven.plugins 12 | maven-compiler-plugin 13 | 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | maven-assembly-plugin 20 | 21 | 22 | 23 | fully.qualified.MainClass 24 | 25 | 26 | 27 | jar-with-dependencies 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.apache.lucene 37 | lucene-benchmark 38 | 5.2.1 39 | 40 | 41 | org.apache.lucene 42 | lucene-backward-codecs 43 | 5.2.1 44 | 45 | 46 | org.apache.lucene 47 | lucene-queryparser 48 | 5.2.1 49 | 50 | 51 | org.apache.lucene 52 | lucene-analyzers-common 53 | 5.2.1 54 | 55 | 56 | org.apache.lucene 57 | lucene-core 58 | 5.2.1 59 | 60 | 61 | 62 | org.apache.solr 63 | solr-solrj 64 | 5.2.1 65 | 66 | 67 | commons-logging 68 | commons-logging 69 | 1.2 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/Args.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.util.ArrayList; 21 | import java.util.HashMap; 22 | import java.util.List; 23 | import java.util.Map; 24 | 25 | public class Args { 26 | private final String[] args; 27 | private final Map used = new HashMap(); 28 | 29 | public Args(String[] args) { 30 | this.args = args; 31 | } 32 | 33 | public String getString(String argName) { 34 | for(int upto=0;upto getStrings(String argName) { 48 | List values = new ArrayList(); 49 | for(int upto=0;upto numTotalDocs) { 172 | break; 173 | } 174 | if ((docCount % 100000) == 0) { 175 | System.out.println("Indexer: " + docCount + " docs... (" + (System.currentTimeMillis() - tStart)/1000.0 + " sec)"); 176 | } 177 | w.addDocument(doc); 178 | } 179 | 180 | } catch (Exception e) { 181 | failed.set(true); 182 | throw new RuntimeException(e); 183 | } finally { 184 | stopLatch.countDown(); 185 | } 186 | } 187 | } 188 | 189 | private static class IngestRatePrinter extends Thread { 190 | 191 | private final AtomicInteger count; 192 | private final AtomicBoolean stop; 193 | public IngestRatePrinter(AtomicInteger count, AtomicBoolean stop){ 194 | this.count = count; 195 | this.stop = stop; 196 | } 197 | 198 | @Override 199 | public void run() { 200 | long time = System.currentTimeMillis(); 201 | System.out.println("startIngest: " + time); 202 | final long start = time; 203 | int lastCount = count.get(); 204 | while(!stop.get()) { 205 | try { 206 | Thread.sleep(200); 207 | } catch(Exception ex) { 208 | } 209 | int numDocs = count.get(); 210 | 211 | double current = numDocs - lastCount; 212 | long now = System.currentTimeMillis(); 213 | double seconds = (now-time) / 1000.0d; 214 | System.out.println("ingest: " + (current / seconds) + " " + (now - start)); 215 | time = now; 216 | lastCount = numDocs; 217 | } 218 | } 219 | } 220 | } 221 | 
-------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/NoPositionsTextField.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | import org.apache.lucene.document.Field; 4 | import org.apache.lucene.document.FieldType; 5 | import org.apache.lucene.index.IndexOptions; 6 | 7 | /** A tokenized field with stored=false and without positions (only frequencies) */ 8 | 9 | public final class NoPositionsTextField extends Field { 10 | 11 | public static final FieldType TYPE_NOT_STORED = new FieldType(); 12 | 13 | static { 14 | TYPE_NOT_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS); 15 | TYPE_NOT_STORED.setTokenized(true); 16 | TYPE_NOT_STORED.freeze(); 17 | } 18 | 19 | public NoPositionsTextField(String name, String value) { 20 | super(name, value, TYPE_NOT_STORED); 21 | } 22 | } -------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/TrecDriver.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | import java.io.OutputStreamWriter; 4 | import java.io.PrintWriter; 5 | import java.nio.charset.Charset; 6 | import java.nio.charset.StandardCharsets; 7 | import java.nio.file.Files; 8 | import java.nio.file.Path; 9 | import java.nio.file.Paths; 10 | import java.util.HashSet; 11 | import java.util.Set; 12 | 13 | import org.apache.lucene.analysis.en.EnglishAnalyzer; 14 | import org.apache.lucene.benchmark.quality.Judge; 15 | import org.apache.lucene.benchmark.quality.QualityBenchmark; 16 | import org.apache.lucene.benchmark.quality.QualityQuery; 17 | import org.apache.lucene.benchmark.quality.QualityQueryParser; 18 | import org.apache.lucene.benchmark.quality.QualityStats; 19 | import org.apache.lucene.benchmark.quality.trec.QueryDriver; 20 | import 
org.apache.lucene.benchmark.quality.trec.TrecJudge; 21 | import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader; 22 | import org.apache.lucene.benchmark.quality.utils.SubmissionReport; 23 | import org.apache.lucene.index.DirectoryReader; 24 | import org.apache.lucene.index.IndexReader; 25 | import org.apache.lucene.queryparser.classic.ParseException; 26 | import org.apache.lucene.queryparser.classic.QueryParser; 27 | import org.apache.lucene.queryparser.classic.QueryParserBase; 28 | import org.apache.lucene.search.BooleanClause; 29 | import org.apache.lucene.search.BooleanQuery; 30 | import org.apache.lucene.search.IndexSearcher; 31 | import org.apache.lucene.search.Query; 32 | import org.apache.lucene.search.similarities.BM25Similarity; 33 | import org.apache.lucene.store.FSDirectory; 34 | 35 | public class TrecDriver extends QueryDriver { 36 | public static void main(String[] args) throws Exception { 37 | 38 | if (args.length < 4 || args.length > 5) { 39 | System.err.println("Usage: QueryDriver [querySpec]"); 40 | System.err.println("topicsFile: input file containing queries"); 41 | System.err.println("qrelsFile: input file containing relevance judgements"); 42 | System.err.println("submissionFile: output submission file for trec_eval"); 43 | System.err.println("indexDir: index directory"); 44 | System.err.println("querySpec: string composed of fields to use in query consisting of T=title,D=description,N=narrative:"); 45 | System.err.println("\texample: TD (query on Title + Description). The default is T (title only)"); 46 | System.exit(1); 47 | } 48 | 49 | Path topicsFile = Paths.get(args[0]); 50 | Path qrelsFile = Paths.get(args[1]); 51 | Path submissionFile = Paths.get(args[2]); 52 | SubmissionReport submitLog = new SubmissionReport(new PrintWriter(Files.newBufferedWriter(submissionFile, StandardCharsets.UTF_8)), "lucene"); 53 | FSDirectory dir = FSDirectory.open(Paths.get(args[3])); 54 | String fieldSpec = args.length == 5 ? 
args[4] : "T"; // default to Title-only if not specified. 55 | IndexReader reader = DirectoryReader.open(dir); 56 | IndexSearcher searcher = new IndexSearcher(reader); 57 | searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f)); 58 | 59 | int maxResults = 1000; 60 | String docNameField = "docname"; 61 | 62 | PrintWriter logger = new PrintWriter(new OutputStreamWriter(System.out, Charset.defaultCharset()), true); 63 | 64 | // use trec utilities to read trec topics into quality queries 65 | TrecTopicsReader qReader = new TrecTopicsReader(); 66 | QualityQuery qqs[] = qReader.readQueries(Files.newBufferedReader(topicsFile, StandardCharsets.UTF_8)); 67 | 68 | // prepare judge, with trec utilities that read from a QRels file 69 | Judge judge = new TrecJudge(Files.newBufferedReader(qrelsFile, StandardCharsets.UTF_8)); 70 | 71 | // validate topics & judgments match each other 72 | judge.validateData(qqs, logger); 73 | 74 | Set fieldSet = new HashSet<>(); 75 | if (fieldSpec.indexOf('T') >= 0) fieldSet.add("title"); 76 | if (fieldSpec.indexOf('D') >= 0) fieldSet.add("description"); 77 | if (fieldSpec.indexOf('N') >= 0) fieldSet.add("narrative"); 78 | 79 | // set the parsing of quality queries into Lucene queries. 80 | QualityQueryParser qqParser = new EnglishQQParser(fieldSet.toArray(new String[0]), "body"); 81 | 82 | // run the benchmark 83 | QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField); 84 | qrun.setMaxResults(maxResults); 85 | QualityStats stats[] = qrun.execute(judge, submitLog, logger); 86 | 87 | // print an avarage sum of the results 88 | QualityStats avg = QualityStats.average(stats); 89 | avg.log("SUMMARY", 2, logger, " "); 90 | reader.close(); 91 | dir.close(); 92 | } 93 | } 94 | 95 | class EnglishQQParser implements QualityQueryParser { 96 | 97 | private String qqNames[]; 98 | private String indexField; 99 | ThreadLocal queryParser = new ThreadLocal<>(); 100 | 101 | /** 102 | * Constructor of a simple qq parser. 
103 | * @param qqNames name-value pairs of quality query to use for creating the query 104 | * @param indexField corresponding index field 105 | */ 106 | public EnglishQQParser(String qqNames[], String indexField) { 107 | this.qqNames = qqNames; 108 | this.indexField = indexField; 109 | } 110 | 111 | /** 112 | * Constructor of a simple qq parser. 113 | * @param qqName name-value pair of quality query to use for creating the query 114 | * @param indexField corresponding index field 115 | */ 116 | public EnglishQQParser(String qqName, String indexField) { 117 | this(new String[] { qqName }, indexField); 118 | } 119 | 120 | /* (non-Javadoc) 121 | * @see org.apache.lucene.benchmark.quality.QualityQueryParser#parse(org.apache.lucene.benchmark.quality.QualityQuery) 122 | */ 123 | @Override 124 | public Query parse(QualityQuery qq) throws ParseException { 125 | QueryParser qp = queryParser.get(); 126 | if (qp==null) { 127 | qp = new QueryParser(indexField, new EnglishAnalyzer()); 128 | queryParser.set(qp); 129 | } 130 | BooleanQuery bq = new BooleanQuery(); 131 | for (int i = 0; i < qqNames.length; i++) 132 | bq.add(qp.parse(QueryParserBase.escape(qq.getValue(qqNames[i]))), BooleanClause.Occur.SHOULD); 133 | 134 | return bq; 135 | } 136 | 137 | } 138 | 139 | -------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/TrecIngester.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Paths; 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | import java.util.Properties; 25 | 26 | import org.apache.lucene.analysis.Analyzer; 27 | import org.apache.lucene.analysis.en.EnglishAnalyzer; 28 | import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource; 29 | import org.apache.lucene.benchmark.byTask.utils.Config; 30 | import org.apache.lucene.document.TextField; 31 | import org.apache.lucene.index.IndexWriter; 32 | import org.apache.lucene.index.IndexWriterConfig; 33 | import org.apache.lucene.index.MergePolicy; 34 | import org.apache.lucene.index.NoMergePolicy; 35 | import org.apache.lucene.search.similarities.BM25Similarity; 36 | import org.apache.lucene.store.*; 37 | import org.apache.lucene.util.*; 38 | 39 | public final class TrecIngester { 40 | private static TrecContentSource createTrecSource(String dataDir) { 41 | TrecContentSource tcs = new TrecContentSource(); 42 | Properties props = new Properties(); 43 | props.setProperty("print.props", "false"); 44 | props.setProperty("content.source.verbose", "false"); 45 | props.setProperty("content.source.excludeIteration", "true"); 46 | props.setProperty("docs.dir", dataDir); 47 | props.setProperty("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser"); 48 | props.setProperty("content.source.forever", "false"); 49 | tcs.setConfig(new Config(props)); 50 | try { 51 | tcs.resetInputs(); 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | } 55 | 
return tcs; 56 | } 57 | 58 | public static void main(String[] clArgs) throws Exception { 59 | Args args = new Args(clArgs); 60 | final String dirPath = args.getString("-indexPath") + "/index"; 61 | final String dataDir = args.getString("-dataDir"); 62 | final int docCountLimit = args.getInt("-docCountLimit"); // -1 means all docs from the source: 63 | final int numThreads = args.getInt("-threadCount"); 64 | final boolean verbose = args.getFlag("-verbose"); 65 | final boolean printDPS = args.getFlag("-printDPS"); 66 | final boolean doUpdate = args.getFlag("-update"); 67 | final boolean positions = args.getFlag("-positions"); 68 | final boolean forceMerge = args.getFlag("-forceMerge"); 69 | 70 | args.check(); 71 | 72 | final Analyzer a = new EnglishAnalyzer(); 73 | final TrecContentSource trecSource = createTrecSource(dataDir); 74 | final Directory dir = FSDirectory.open(Paths.get(dirPath)); 75 | 76 | System.out.println("Index path: " + dirPath); 77 | System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : ""+docCountLimit)); 78 | System.out.println("Threads: " + numThreads); 79 | System.out.println("Verbose: " + (verbose ? "yes" : "no")); 80 | System.out.println("Positions: " + (positions ? "yes" : "no")); 81 | System.out.println("Force merge: " + (forceMerge ? "yes" : "no")); 82 | 83 | if (verbose) { 84 | InfoStream.setDefault(new PrintStreamInfoStream(System.out)); 85 | } 86 | 87 | final IndexWriterConfig iwc = new IndexWriterConfig(a); 88 | 89 | if (doUpdate) { 90 | iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND); 91 | } else { 92 | iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); 93 | } 94 | if (forceMerge) { 95 | // TODO: Explore a merge policy that results in just one segment. NoMergePolicy seems 96 | // to result in large number of files, but possibly one 1 segment. 
97 | //iwc.setMergePolicy(NoMergePolicy.INSTANCE); 98 | } 99 | System.out.println("IW config=" + iwc); 100 | 101 | final IndexWriter w = new IndexWriter(dir, iwc); 102 | IndexThreads threads = new IndexThreads(w, positions, trecSource, numThreads, docCountLimit, printDPS); 103 | System.out.println("\nIndexer: start"); 104 | 105 | final long t0 = System.currentTimeMillis(); 106 | 107 | threads.start(); 108 | 109 | while (!threads.done()) { 110 | Thread.sleep(100); 111 | } 112 | threads.stop(); 113 | 114 | final long t1 = System.currentTimeMillis(); 115 | System.out.println("\nIndexer: indexing done (" + (t1-t0)/1000.0 + " sec); total " + w.maxDoc() + " docs"); 116 | if (!doUpdate && docCountLimit != -1 && w.maxDoc() != docCountLimit) { 117 | throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit); 118 | } 119 | if (threads.failed.get()) { 120 | throw new RuntimeException("exceptions during indexing"); 121 | } 122 | 123 | 124 | final long t2; 125 | t2 = System.currentTimeMillis(); 126 | 127 | final Map<String,String> commitData = new HashMap<>(); 128 | commitData.put("userData", "multi"); 129 | w.setCommitData(commitData); 130 | w.commit(); 131 | final long t3 = System.currentTimeMillis(); 132 | System.out.println("\nIndexer: commit multi (took " + (t3-t2)/1000.0 + " sec)"); 133 | 134 | 135 | if (forceMerge) { 136 | System.out.println("\nStarting the merge..."); 137 | long mergeStart = System.currentTimeMillis(); 138 | w.forceMerge(1); 139 | w.commit(); 140 | System.out.println("\nIndexer: merging took " + (System.currentTimeMillis() - mergeStart)/1000.0 + " sec"); 141 | } 142 | System.out.println("\nIndexer: at close: " + w.segString()); 143 | final long tCloseStart = System.currentTimeMillis(); 144 | w.close(); 145 | System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart)/1000.0 + " sec"); 146 | dir.close(); 147 | final long tFinal = System.currentTimeMillis(); 148 | System.out.println("\nIndexer: finished (" + 
(tFinal-t0)/1000.0 + " sec)"); 149 | System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed()); 150 | System.out.println("\nIndexer: " + (threads.getBytesIndexed()/1024./1024./1024./((tFinal-t0)/3600000.)) + " GB/hour plain text"); 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-analyzers-common-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-analyzers-common-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-backward-codecs-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-backward-codecs-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-benchmark-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-benchmark-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-core-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-core-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-queryparser-5.2.1.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-queryparser-5.2.1.jar -------------------------------------------------------------------------------- /systems/terrier/dotgov2-prox.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | 6 | INDEX="blocks" 7 | 8 | if [[ ! -f terrier-4.0.tar.gz ]]; then 9 | curl http://www.dcs.gla.ac.uk/~craigm/terrier-4.0.tar.gz> terrier-4.0.tar.gz 10 | fi 11 | tar -zxf terrier-4.0.tar.gz 12 | cd terrier-4.0 13 | 14 | bin/trec_setup.sh $GOV2_LOCATION 2>&1 | tee trec_setup.log 15 | #mv etc/collection.spec collection.spec && head collection.spec > etc/collection.spec 16 | 17 | OPTS="-i -j" 18 | if [ "$INDEX" == "classical" ]; 19 | then 20 | OPTS="-i" 21 | fi 22 | 23 | cat <<EOF >> etc/terrier.properties 24 | trec.collection.class=TRECWebCollection 25 | #indexer.meta.forward.keys=docno,url 26 | #indexer.meta.forward.keylens=26,256 27 | indexer.meta.forward.keys=docno 28 | indexer.meta.forward.keylens=26 29 | indexer.meta.reverse.keys= 30 | ignore.low.idf.terms=false 31 | 32 | #faster indexing with more memory 33 | memory.reserved=104857600 34 | EOF 35 | 36 | if [ "$INDEX" == "blocks" ]; 37 | then 38 | OPTS="$OPTS -Dblock.indexing=true" 39 | elif [[ "$INDEX" == "blocks_fields" ]]; then 40 | OPTS="$OPTS -Dblock.indexing=true -DFieldTags.process=TITLE,ELSE" 41 | fi 42 | 43 | JAVA_OPTIONS=-XX:-UseGCOverheadLimit TERRIER_HEAP_MEM=100g bin/trec_terrier.sh $OPTS 2>&1 | tee indexing.${INDEX}.log 44 | 45 | if [[ "$INDEX" == "blocks_fields" ]]; then 46 | perl -pi -e 's/FSADocumentIndex$/FSAFieldDocumentIndex/g' var/index/data.properties 47 | fi 48 | 49 | for RANKER in DPH_Prox; 50 | do 51 | ../dotgov2-ranker.sh $INDEX $RANKER 52 | done 53 | 54 | mv var ${INDEX}-var 55 | -------------------------------------------------------------------------------- /systems/terrier/dotgov2-qe.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | 6 | INDEX="classical" 7 | 8 | if [[ ! -f terrier-4.0.tar.gz ]]; then 9 | curl http://www.dcs.gla.ac.uk/~craigm/terrier-4.0.tar.gz> terrier-4.0.tar.gz 10 | fi 11 | tar -zxf terrier-4.0.tar.gz 12 | cd terrier-4.0 13 | 14 | bin/trec_setup.sh $GOV2_LOCATION 2>&1 | tee trec_setup.log 15 | #mv etc/collection.spec collection.spec && head collection.spec > etc/collection.spec 16 | 17 | OPTS="-i -j" 18 | if [ "$INDEX" == "classical" ]; 19 | then 20 | OPTS="-i" 21 | fi 22 | 23 | cat <<EOF >> etc/terrier.properties 24 | trec.collection.class=TRECWebCollection 25 | #indexer.meta.forward.keys=docno,url 26 | #indexer.meta.forward.keylens=26,256 27 | indexer.meta.forward.keys=docno 28 | indexer.meta.forward.keylens=26 29 | indexer.meta.reverse.keys= 30 | ignore.low.idf.terms=false 31 | 32 | #faster indexing with more memory 33 | memory.reserved=104857600 34 | EOF 35 | 36 | if [ "$INDEX" == "blocks" ]; 37 | then 38 | OPTS="$OPTS -Dblock.indexing=true" 39 | elif [[ "$INDEX" == "blocks_fields" ]]; then 40 | OPTS="$OPTS -Dblock.indexing=true -DFieldTags.process=TITLE,ELSE" 41 | fi 42 | 43 | JAVA_OPTIONS=-XX:-UseGCOverheadLimit TERRIER_HEAP_MEM=100g bin/trec_terrier.sh $OPTS 2>&1 | tee indexing.${INDEX}.log 44 | 45 | if [[ "$INDEX" == "blocks_fields" ]]; then 46 | perl -pi -e 's/FSADocumentIndex$/FSAFieldDocumentIndex/g' var/index/data.properties 47 | fi 48 | 49 | for RANKER in DPH_QE; 50 | do 51 | ../dotgov2-ranker.sh $INDEX $RANKER 52 | done 53 | 54 | mv var ${INDEX}-var 55 | -------------------------------------------------------------------------------- /systems/terrier/dotgov2-ranker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | pushd .. 
5 | source ../common.sh 6 | popd 7 | 8 | INDEX=$1 9 | RANKER=$2 10 | 11 | OPTIONS="" 12 | 13 | if [ "$RANKER" == "DPH" ]; 14 | then 15 | OPTIONS="$OPTIONS -Dtrec.model=DPH" 16 | elif [ "$RANKER" == "DPH_QE" ]; 17 | then 18 | OPTIONS="$OPTIONS -Dtrec.model=DPH" 19 | OPTIONS="$OPTIONS -Dquerying.default.controls=qe:on" 20 | if [ ! -e "var/index/data.direct.bf" ]; 21 | then 22 | TERRIER_HEAP_MEM=100g bin/trec_terrier.sh -id 23 | du -csh var/index/ 24 | fi 25 | elif [[ "$RANKER" == "BM25" ]]; then 26 | OPTIONS="$OPTIONS -Dtrec.model=BM25" 27 | elif [[ "$RANKER" == "DPH_Prox" ]]; then 28 | OPTIONS="$OPTIONS -Dtrec.model=DPH" 29 | OPTIONS="$OPTIONS -Dmatching.dsms=DFRDependenceScoreModifier" 30 | OPTIONS="$OPTIONS -Dproximity.dependency.type=SD" 31 | OPTIONS="$OPTIONS -Dproximity.ngram.length=5" 32 | #elif [[ "$RANKER" == "LTR" ]]; then 33 | # pwd 34 | # exec ../dotgov2-ltr-ranker.sh $INDEX $RANKER 35 | fi 36 | 37 | for queries in "701-750" "751-800" "801-850" 38 | do 39 | query_file=../$TOPICS_QRELS/topics.${queries}.txt 40 | qrel_file=../$TOPICS_QRELS/qrels.${queries}.txt 41 | stat_file=${INDEX}.${RANKER}.${queries}.search_stats.txt 42 | run_file=$PWD/${INDEX}.${RANKER}.terrier.${queries}.txt 43 | 44 | TERRIER_HEAP_MEM=26g bin/trec_terrier.sh -r -Dtrec.topics=$query_file -Dtrec.results.file=$run_file $OPTIONS > $stat_file 2>&1 45 | ../$TREC_EVAL ${qrel_file} ${run_file}| tee -a $stat_file 46 | 47 | #grep 'Total Time to Search' ${stat_file} | sed \$d 48 | done 49 | -------------------------------------------------------------------------------- /systems/terrier/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | 6 | INDEX="singlepass" 7 | 8 | if [[ ! 
-f terrier-4.0.tar.gz ]]; then 9 | curl http://www.dcs.gla.ac.uk/~craigm/terrier-4.0.tar.gz> terrier-4.0.tar.gz 10 | fi 11 | tar -zxf terrier-4.0.tar.gz 12 | cd terrier-4.0 13 | 14 | bin/trec_setup.sh $GOV2_LOCATION 2>&1 | tee trec_setup.log 15 | #mv etc/collection.spec collection.spec && head collection.spec > etc/collection.spec 16 | 17 | OPTS="-i -j" 18 | if [ "$INDEX" == "classical" ]; 19 | then 20 | OPTS="-i" 21 | fi 22 | 23 | cat <<EOF >> etc/terrier.properties 24 | trec.collection.class=TRECWebCollection 25 | #indexer.meta.forward.keys=docno,url 26 | #indexer.meta.forward.keylens=26,256 27 | indexer.meta.forward.keys=docno 28 | indexer.meta.forward.keylens=26 29 | indexer.meta.reverse.keys= 30 | ignore.low.idf.terms=false 31 | 32 | #faster indexing with more memory 33 | memory.reserved=104857600 34 | EOF 35 | 36 | if [ "$INDEX" == "blocks" ]; 37 | then 38 | OPTS="$OPTS -Dblock.indexing=true" 39 | elif [[ "$INDEX" == "blocks_fields" ]]; then 40 | OPTS="$OPTS -Dblock.indexing=true -DFieldTags.process=TITLE,ELSE" 41 | fi 42 | 43 | JAVA_OPTIONS=-XX:-UseGCOverheadLimit TERRIER_HEAP_MEM=100g bin/trec_terrier.sh $OPTS 2>&1 | tee indexing.${INDEX}.log 44 | 45 | if [[ "$INDEX" == "blocks_fields" ]]; then 46 | perl -pi -e 's/FSADocumentIndex$/FSAFieldDocumentIndex/g' var/index/data.properties 47 | fi 48 | 49 | for RANKER in DPH BM25; 50 | do 51 | ../dotgov2-ranker.sh $INDEX $RANKER 52 | done 53 | 54 | mv var ${INDEX}-var 55 | -------------------------------------------------------------------------------- /topics-and-qrels/README.md: -------------------------------------------------------------------------------- 1 | Gov2 2 | ==== 3 | 4 | + topics.701-750.txt: [Topics 701-750 (TREC 2004 Terabyte Track)](http://trec.nist.gov/data/terabyte/04/04topics.701-750.txt) 5 | + topics.751-800.txt: [Topics 751-800 (TREC 2005 Terabyte Track)](http://trec.nist.gov/data/terabyte/05/05.topics.751-800.txt) 6 | + topics.801-850.txt: [Topics 801-850 (TREC 2006 Terabyte 
Track)](http://trec.nist.gov/data/terabyte/06/06.topics.801-850.txt) 7 | + qrels.701-750.txt: [qrels for Topics 701-750 (TREC 2004 Terabyte Track)](http://trec.nist.gov/data/terabyte/04/04.qrels.12-Nov-04) 8 | + qrels.751-800.txt: [qrels for Topics 751-800 (TREC 2005 Terabyte Track)](http://trec.nist.gov/data/terabyte/05/05.adhoc_qrels) 9 | + qrels.801-850.txt: [qrels for Topics 801-850 (TREC 2006 Terabyte Track)](http://trec.nist.gov/data/terabyte/06/qrels.tb06.top50) 10 | 11 | ClueWeb09 12 | ========= 13 | 14 | + topics.web.1-50.txt: [Topics 1-50 (TREC 2009 Web Track)](http://trec.nist.gov/data/web/09/wt09.topics.full.xml) 15 | + topics.web.51-100.txt: [Topics 51-100 (TREC 2010 Web Track)](http://trec.nist.gov/data/web/10/wt2010-topics.xml) 16 | + topics.web.101-150.txt: [Topics 101-150 (TREC 2011 Web Track)](http://trec.nist.gov/data/web/11/full-topics.xml) 17 | + topics.web.151-200.txt: [Topics 151-200 (TREC 2012 Web Track)](http://trec.nist.gov/data/web/12/full-topics.xml) 18 | + qrels.web.1-50.txt: [adhoc prels for category B runs for Topics 1-50 (TREC 2009 Web Track)](http://trec.nist.gov/data/web/09/prels.catB.1-50.gz) 19 | + qrels.web.51-100.txt: [adhoc qrels for Topics 51-100 (TREC 2010 Web Track)](http://trec.nist.gov/data/web/10/10.adhoc-qrels.final) 20 | + qrels.web.101-150.txt: [adhoc qrels for Topics 101-150 (TREC 2011 Web Track)](http://trec.nist.gov/data/web/11/qrels.adhoc) 21 | + qrels.web.151-200.txt: [adhoc qrels for Topics 151-200 (TREC 2012 Web Track)](http://trec.nist.gov/data/web/12/qrels.adhoc) 22 | 23 | ClueWeb12 24 | ========= 25 | + topics.web.201-250.txt: [Topics 201-250 (TREC 2013 Web Track)](http://trec.nist.gov/data/web/2013/trec2013-topics.xml) 26 | + topics.web.251-300.txt: [Topics 251-300 (TREC 2014 Web Track)](http://trec.nist.gov/data/web/2014/trec2014-topics.xml) 27 | + qrels.web.201-250.txt: [one aspect per topic qrels for Topics 201-250 (TREC 2013 Web Track)](http://trec.nist.gov/data/web/2013/qrels.adhoc.txt) 28 | + 
qrels.web.251-300.txt: [one aspect per topic qrels for Topics 251-300 (TREC 2014 Web Track)](http://trec.nist.gov/data/web/2014/qrels.adhoc.txt) 29 | --------------------------------------------------------------------------------