├── .gitignore ├── CLEF.md ├── CW09.md ├── CW12.md ├── Gov2.md ├── README.md ├── ec2-setup.md ├── eval ├── gdeval ├── statAP_MQ_eval_v3.pl └── trec_eval.9.0.tar.gz ├── resources └── CLEF │ ├── bg_sl.txt │ ├── de_sl.txt │ ├── es_sl.txt │ ├── fa_sl.txt │ ├── fi_sl.txt │ ├── fr_sl.txt │ ├── hu_sl.txt │ ├── it_sl.txt │ ├── nl_sl.txt │ ├── pt_sl.txt │ ├── ru_sl.txt │ └── sv_sl.txt ├── results └── CLEF │ ├── indri │ └── info.txt │ ├── lucene │ └── info.txt │ └── terrier │ └── info.txt ├── runs └── CLEF │ ├── indri │ └── info.txt │ ├── lucene │ └── info.txt │ └── terrier │ └── info.txt ├── systems ├── ATIRE │ ├── README.md │ ├── cw09.sh │ ├── cw12.sh │ ├── dotgov2.sh │ └── setup.sh ├── JASS │ ├── cw09.sh │ ├── cw12.sh │ ├── dotgov2.sh │ └── setup.sh ├── MG4J │ ├── README.md │ ├── cw12-bm25.sh │ ├── cw12-eval-pos.sh │ ├── cw12-eval.sh │ ├── cw12-index-pos.sh │ ├── cw12-index.sh │ ├── genqueries.sh │ ├── genqueriespos.sh │ ├── gensubsets.rb │ ├── gensubsetspos.rb │ ├── gov2-bm25.sh │ ├── gov2-eval-pos.sh │ ├── gov2-eval.sh │ ├── gov2-index-pos.sh │ ├── gov2-index.sh │ └── logback.xml ├── common.sh ├── galago │ ├── dotgov2.sh │ └── make_query_json.py ├── indri │ ├── clean.sh │ ├── dm.pl │ ├── dotgov2.sh │ ├── index-clef.sh │ ├── index-clef_ReadMe.txt │ ├── indexParaSP_bg │ ├── indexParaSP_de │ ├── indexParaSP_es │ ├── indexParaSP_fa │ ├── indexParaSP_fi │ ├── indexParaSP_fr │ ├── indexParaSP_hu │ ├── indexParaSP_it │ ├── indexParaSP_nl │ ├── indexParaSP_pt │ ├── indexParaSP_ru │ ├── indexParaSP_sv │ ├── queryParaLMSP_bg │ ├── queryParaLMSP_de │ ├── queryParaLMSP_es │ ├── queryParaLMSP_fa │ ├── queryParaLMSP_fi │ ├── queryParaLMSP_fr │ ├── queryParaLMSP_hu │ ├── queryParaLMSP_it │ ├── queryParaLMSP_nl │ ├── queryParaLMSP_pt │ ├── queryParaLMSP_ru │ ├── queryParaLMSP_sv │ └── query_LM.sh ├── lucene │ ├── clef.sh │ ├── clef │ │ ├── LICENSE.txt │ │ ├── README.md │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ ├── java │ │ │ │ └── it │ │ │ │ │ └── unipd │ │ │ │ │ └── dei │ │ │ │ 
│ └── ims │ │ │ │ │ └── lucene │ │ │ │ │ └── clef │ │ │ │ │ ├── AnalyzerFactory.java │ │ │ │ │ ├── App.java │ │ │ │ │ ├── applications │ │ │ │ │ ├── BatchRetrieval.java │ │ │ │ │ └── BuildIndex.java │ │ │ │ │ └── parser │ │ │ │ │ ├── ClefDocParser.java │ │ │ │ │ └── ClefQQParser.java │ │ │ └── resources │ │ │ │ ├── logback.xml │ │ │ │ └── lucene-clef.properties │ │ │ └── test │ │ │ ├── java │ │ │ └── it │ │ │ │ └── unipd │ │ │ │ └── dei │ │ │ │ └── ims │ │ │ │ └── lucene │ │ │ │ └── clef │ │ │ │ └── parser │ │ │ │ └── ClefQQParserTest.java │ │ │ └── resources │ │ │ └── topics │ │ │ ├── bg_topics.xml │ │ │ ├── de_topics.xml │ │ │ ├── es_topics.xml │ │ │ ├── fa_topics.xml │ │ │ ├── fi_topics.xml │ │ │ ├── fr_topics.xml │ │ │ ├── hu_topics.xml │ │ │ ├── it_topics.xml │ │ │ ├── nl_topics.xml │ │ │ ├── pt_topics.xml │ │ │ ├── ru_topics.xml │ │ │ └── sv_topics.xml │ ├── clef_experiments.sh │ ├── clef_runs │ ├── dotgov2.sh │ ├── ingester │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── luceneingester │ │ │ ├── Args.java │ │ │ ├── ForceMerge.java │ │ │ ├── IndexStats.java │ │ │ ├── IndexThreads.java │ │ │ ├── NoPositionsTextField.java │ │ │ ├── TrecDriver.java │ │ │ └── TrecIngester.java │ └── lib │ │ ├── lucene-analyzers-common-5.2.1.jar │ │ ├── lucene-backward-codecs-5.2.1.jar │ │ ├── lucene-benchmark-5.2.1.jar │ │ ├── lucene-core-5.2.1.jar │ │ └── lucene-queryparser-5.2.1.jar └── terrier │ ├── clef_experiments.sh │ ├── dotgov2-prox.sh │ ├── dotgov2-qe.sh │ ├── dotgov2-ranker.sh │ └── dotgov2.sh └── topics-and-qrels ├── CLEF ├── qrels │ ├── bg_qrels.txt │ ├── de_qrels.txt │ ├── es_qrels.txt │ ├── fa_qrels.txt │ ├── fi_qrels.txt │ ├── fr_qrels.txt │ ├── hu_qrels.txt │ ├── it_qrels.txt │ ├── nl_qrels.txt │ ├── pt_qrels.txt │ ├── ru_qrels.txt │ └── sv_qrels.txt └── topics │ ├── bg_topics.xml │ ├── de_topics.xml │ ├── es_topics.xml │ ├── fa_topics.xml │ ├── fi_topics.xml │ ├── fr_topics.xml │ ├── hu_topics.xml │ ├── it_topics.xml │ ├── nl_topics.xml │ ├── 
pt_topics.xml │ ├── ru_topics.xml │ └── sv_topics.xml ├── README.md ├── prels.web.1-50.txt ├── qrels.701-750.txt ├── qrels.751-800.txt ├── qrels.801-850.txt ├── qrels.web.101-150.txt ├── qrels.web.151-200.txt ├── qrels.web.201-250.txt ├── qrels.web.251-300.txt ├── qrels.web.51-100.txt ├── topics.701-750.txt ├── topics.751-800.txt ├── topics.801-850.txt ├── topics.web.1-50.txt ├── topics.web.101-150.txt ├── topics.web.151-200.txt ├── topics.web.201-250.txt ├── topics.web.251-300.txt └── topics.web.51-100.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | eval/trec_eval.9.0/ -------------------------------------------------------------------------------- /CW09.md: -------------------------------------------------------------------------------- 1 | # ClueWeb09 Category B Comparisons 2 | What follows is an initial comparison of selected information retrieval systems on the ClueWeb09 Category B collection using scripts provided by authors/leading contributors of those systems. The systems are listed in alphabetical order. 3 | 4 | ## Indexing 5 | Two metrics for indexing are reported below: the size of the generated index, and the time taken to generate that index. 6 | 7 | System | Type | Size | Time | Terms | Postings | Tokens | 8 | :-------|:------------------|--------:|----------------------:|------:|---------:|--------: 9 | ATIRE | Count | 33.3 GB | 1h 57m | | | | 10 | ATIRE | Count + Quantized | 43.2 GB | 3h 12m | | | | 11 | JASS | | 66.1 GB | ATIRE Quantized + 12m | | | | 12 | 13 | ###### ATIRE 14 | + The quantized index pre-calculates the BM25 scores at indexing time and stores these instead of term frequencies, more about the quantization in ATIRE can be found in [Crane et al. (2013)](http://dl.acm.org/citation.cfm?id=2507860). 15 | + The quantization is performed single threaded although easily parallelized. 16 | + Both indexes were not stemmed. 
17 | + Both indexes were pruned of SGML tags, which are used for search-time features that typically go unused. 18 | + Both indexes' postings lists are stored impact-ordered, with docids being compressed using variable-byte compression after being delta encoded. 19 | 20 | ## Retrieval 21 | Both retrieval efficiency (by query latency) and effectiveness (MAP@1000) were measured on four query sets: 1-50, 51-100, 101-150, and 151-200. 22 | 23 | ### Retrieval Models 24 | 25 | ###### ATIRE 26 | + ATIRE uses a modified version of BM25, described [here](http://www.cs.otago.ac.nz/homepages/andrew/papers/2012-1.pdf). 27 | + Searching was done using top-k search, also described in the above paper. 28 | + This is not early termination; all documents for all terms in the query still get scored. 29 | + BM25 parameters were set to the default for ATIRE, `k1=0.9 b=0.4`. 30 | + Only stopping of tags was performed; this has no effect on search. 31 | 32 | ### Retrieval Latency 33 | The table below shows the average search time across queries by query set. The search times were taken from the internal reporting of each system. 34 | 35 | System | Model | Index | Topics 1-50 | Topics 51-100 | Topics 101-150 | Topics 151-200 36 | :-------|:---------------|-------------------|------------:|--------------:|---------------:|--------------: 37 | ATIRE | BM25 | Count | | 651ms | 760ms | 452ms 38 | ATIRE | Quantized BM25 | Count + Quantized | | 182ms | 272ms | 179ms 39 | JASS | | | | 175ms | 234ms | 168ms 40 | JASS | 5M Postings | | | 63ms | 105ms | 64ms 41 | 42 | ### Retrieval Effectiveness 43 | The systems generated run files to be consumed by the `trec_eval` tool. Each system was evaluated on the top 1000 results for each query, and the table below shows the MAP scores for the systems. 
44 | 45 | System | Model | Index | Topics 1-50 | Topics 51-100 | Topics 101-150 | Topics 151-200 46 | :-------|:---------------|-------------------|------------:|--------------:|---------------:|--------------: 47 | ATIRE | BM25 | Count | | 0.1137 | 0.1082 | 0.0982 48 | ATIRE | Quantized BM25 | Count + Quantized | | 0.1154 | 0.1070 | 0.0998 49 | JASS | | | | 0.1154 | 0.1070 | 0.0998 50 | JASS | 5M Postings | | | 0.1151 | 0.1046 | 0.0973 51 | 52 | ##### Statistical Analysis 53 | 54 | **TODO:** Need to run statistical analyses. 55 | -------------------------------------------------------------------------------- /CW12.md: -------------------------------------------------------------------------------- 1 | # ClueWeb12 B13 Comparisons 2 | What follows is an initial comparison of selected information retrieval systems on the ClueWeb12 B13 collection using scripts provided by authors/leading contributors of those systems. The systems are listed in alphabetical order. 3 | 4 | ## Indexing 5 | Two metrics for indexing are reported below: the size of the generated index, and the time taken to generate that index. 6 | 7 | System | Type | Size | Time | Terms | Postings | Tokens | 8 | :-------|:------------------|--------:|----------------------:|------:|---------:|--------: 9 | ATIRE | Count | 42.4 GB | 3h 03m | | | | 10 | ATIRE | Count + Quantized | 53.4 GB | 4h 25m | | | | 11 | JASS | | 83.2 GB | ATIRE Quantized + 32m | | | | 12 | MG4J | Count | 17 GB | 2h 38m | 133M | 12.7G | | 13 | MG4J | Position | 58 GB | 3h 20m | 133M | 12.7G | 33.8G | 14 | 15 | ###### ATIRE 16 | + The quantized index pre-calculates the BM25 scores at indexing time and stores these instead of term frequencies, more about the quantization in ATIRE can be found in [Crane et al. (2013)](http://dl.acm.org/citation.cfm?id=2507860). 17 | + The quantization is performed single threaded although easily parallelized. 18 | + Both indexes were not stemmed. 
19 | + Both indexes were pruned of SGML tags, which are used for search-time features that typically go unused. 20 | + Both indexes' postings lists are stored impact-ordered, with docids being compressed using variable-byte compression after being delta encoded. 21 | 22 | ## Retrieval 23 | Both retrieval efficiency (by query latency) and effectiveness (MAP@1000) were measured on two query sets: 201-250, and 251-300. 24 | 25 | ### Retrieval Models 26 | 27 | ###### ATIRE 28 | + ATIRE uses a modified version of BM25, described [here](http://www.cs.otago.ac.nz/homepages/andrew/papers/2012-1.pdf). 29 | + Searching was done using top-k search, also described in the above paper. 30 | + This is not early termination; all documents for all terms in the query still get scored. 31 | + BM25 parameters were set to the default for ATIRE, `k1=0.9 b=0.4`. 32 | + Only stopping of tags was performed; this has no effect on search. 33 | 34 | ###### MG4J 35 | 36 | See the description for the [Gov2 runs](Gov2.md). 37 | 38 | ### Retrieval Latency 39 | The table below shows the average search time across queries by query set. The search times were taken from the internal reporting of each system. 40 | 41 | System | Model | Index | Topics 201-250 | Topics 251-300 42 | :-------|:---------------|-------------------|---------------:|--------------: 43 | ATIRE | BM25 | Count | 809ms | 788ms 44 | ATIRE | Quantized BM25 | Count + Quantized | 290ms | 296ms 45 | JASS | | | 222ms | 261ms 46 | JASS | 5M Postings | | 103ms | 88ms 47 | MG4J | BM25 | Count | 706ms | 570ms 48 | MG4J | Model B | Count | 60ms | 73ms 49 | MG4J | Model B+ | Position | 122ms | 258ms 50 | 51 | ### Retrieval Effectiveness 52 | The systems generated run files to be consumed by the `trec_eval` tool. Each system was evaluated on the top 1000 results for each query, and the table below shows the MAP scores for the systems. 
53 | 54 | System | Model | Index | Topics 201-250 | Topics 251-300 55 | :-------|:---------------|-------------------|---------------:|--------------: 56 | ATIRE | BM25 | Count | 0.0439 | 0.0196 57 | ATIRE | Quantized BM25 | Count + Quantized | 0.0429 | 0.0201 58 | JASS | | | 0.0429 | 0.0201 59 | JASS | 5M Postings | | 0.0393 | 0.0193 60 | MG4J | BM25 | Count | 0.0410 | 0.0207 61 | MG4J | Model B | Count | 0.0418 | 0.0206 62 | MG4J | Model B+ | Position | 0.0402 | 0.0166 63 | 64 | ##### Statistical Analysis 65 | 66 | **TODO:** Need to run statistical analyses. 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The Open-Source Information Retrieval Reproducibility Challenge 2 | =============================================================== 3 | 4 | There's a general consensus in the information retrieval community that open-source search engines help enhance dissemination of results and support reproducibility. There's also general agreement that reproducibility is "a good thing". This issue has received some attention recently, including a dedicated track at ECIR 2015. However, we as a community still have a long way to go. 5 | 6 | The goal of this project is to tackle the issue of reproducible *baselines*. This is a more difficult challenge than it seems. Just to provide two examples: Mühleisen et al. (2014) reported large differences in effectiveness across four systems that all purport to implement BM25. Trotman et al. (2014) pointed out that BM25 and query likelihood with Dirichlet smoothing can actually refer to at least half a dozen different variants; in some cases, differences in effectiveness are statistically significant. Given this state of affairs, how can we confidently report comparisons to "baselines" in our papers when even the baselines themselves are ill-defined? Indeed, Armstrong et al. 
(2009) point to the issue of weak baselines as the reason why ad hoc retrieval techniques haven't really been improving. 7 | 8 | This project started as part of the [SIGIR 2015 Workshop on Reproducibility, Inexplicability, and Generalizability of Results (RIGOR)](https://sites.google.com/site/sigirrigor/), where we will report our initial findings. 9 | 10 | Goals 11 | ----- 12 | 13 | The purpose of this exercise is to invite the *developers* of open-source search engines to provide reproducible baselines of their systems in a common environment on Amazon's EC2 so that the community can have a better understanding of the effectiveness and efficiency differences of various baseline implementations. All results will be archived for future reference by the community. This archive is specifically designed to address the following scenarios: 14 | 15 | 1. I want to evaluate my new technique X. As a baseline, I'll use open-source search engine Y. Or alternatively, I'm building on open-source search engine Y, so I need a baseline anyway. 16 | 17 | 1. How do I know what's a "reasonable" result for system Y on test collection Z? What settings should I use? (Which stopwords list? What retrieval model? What parameter settings? Etc.) How do I know if I've configured system Y correctly? 18 | 19 | 1. Correspondingly, as a reviewer of a paper that describes technique X, how do I know if the baseline is any good? Maybe the authors misconfigured system Y (inadvertently), thereby making their technique "look good" (i.e., it's a weak baseline). 20 | 21 | As a result of this exercise, researchers will be able to go to this resource, and for a number of open-source search engines, they'll learn how to reproduce (through extensive documentation) what the developers of those systems themselves consider to be a reasonable baseline. 22 | 23 | Similarly, reviewers of papers will be able to consult this resource to determine if the baseline the authors used is reasonable or somehow "faulty". 
24 | 25 | Another anticipated result of this exercise is that we'll gain a better understanding of why all these supposed "baselines" are different. We can imagine a system-by-feature matrix, where the features range from stemming algorithm to HTML cleaning technique. After this exercise, we'll have a partially-filled matrix, from which we'll be able to hopefully learn some generalizations, for example (completely hypothetical): HTML cleaning really makes a big difference, up to 10% in terms of NDCG; which stemming algorithm you use (Krovetz vs. Porter, etc.) doesn't really matter; etc. 26 | 27 | References 28 | ---------- 29 | 30 | T. Armstrong, A. Moffat, W. Webber, J. Zobel. Improvements That Don't Add Up: Ad-Hoc Retrieval Results Since 1998. CIKM 2009, pages 601-610. 31 | 32 | H. Mühleisen, T. Samar, J. Lin, and A. de Vries. Old Dogs Are Great at New Tricks: Column Stores for IR Prototyping. SIGIR 2014, pages 863-866. 33 | 34 | A. Trotman, A. Puurula, and B. Burgess, Improvements to BM25 and Language Models Examined. ADCS 2014. 35 | -------------------------------------------------------------------------------- /ec2-setup.md: -------------------------------------------------------------------------------- 1 | EC2 Setup 2 | ========= 3 | 4 | For the Gov2 experiments, we are currently running the `r3.4xlarge` instance, with 16 vCPUs and 122 GiB memory, Ubuntu Server 14.04 LTS (HVM). 
5 | 6 | After logging in, the instance is first prepped by installing common missing packages: 7 | 8 | ``` 9 | sudo apt-add-repository -y ppa:webupd8team/java 10 | sudo apt-get -y update 11 | sudo apt-get -y install oracle-java8-installer 12 | sudo apt-get -y install emacs24 13 | sudo apt-get -y install make gcc g++ 14 | sudo apt-get -y install git mercurial 15 | sudo apt-get -y install zlibc zlib1g zlib1g-dev 16 | sudo apt-get -y install maven 17 | ``` 18 | 19 | After that, the collection is mounted: 20 | 21 | ``` 22 | sudo mkdir /media/Gov2 23 | sudo mount /dev/xvdf /media/Gov2 24 | ``` 25 | 26 | The collection is held on a [standard](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html) EBS volume (i.e., magnetic). According to Amazon, this volume type will deliver 40-90 MiB/s maximum throughput. This should be sufficient for IR engines, but if any system is bumping up against this limit, we'll certainly revisit. 27 | 28 | So, you'll see: 29 | 30 | ``` 31 | $ ls /media/Gov2/data/ 32 | GX000 GX011 GX022 GX033 GX044 GX055 GX066 GX077 GX088 GX099 GX110 GX121 GX132 GX143 GX154 GX165 GX176 GX187 GX198 GX209 GX220 GX231 GX242 GX253 GX264 33 | GX001 GX012 GX023 GX034 GX045 GX056 GX067 GX078 GX089 GX100 GX111 GX122 GX133 GX144 GX155 GX166 GX177 GX188 GX199 GX210 GX221 GX232 GX243 GX254 GX265 34 | GX002 GX013 GX024 GX035 GX046 GX057 GX068 GX079 GX090 GX101 GX112 GX123 GX134 GX145 GX156 GX167 GX178 GX189 GX200 GX211 GX222 GX233 GX244 GX255 GX266 35 | GX003 GX014 GX025 GX036 GX047 GX058 GX069 GX080 GX091 GX102 GX113 GX124 GX135 GX146 GX157 GX168 GX179 GX190 GX201 GX212 GX223 GX234 GX245 GX256 GX267 36 | GX004 GX015 GX026 GX037 GX048 GX059 GX070 GX081 GX092 GX103 GX114 GX125 GX136 GX147 GX158 GX169 GX180 GX191 GX202 GX213 GX224 GX235 GX246 GX257 GX268 37 | GX005 GX016 GX027 GX038 GX049 GX060 GX071 GX082 GX093 GX104 GX115 GX126 GX137 GX148 GX159 GX170 GX181 GX192 GX203 GX214 GX225 GX236 GX247 GX258 GX269 38 | GX006 GX017 GX028 GX039 GX050 GX061 GX072 GX083 
GX094 GX105 GX116 GX127 GX138 GX149 GX160 GX171 GX182 GX193 GX204 GX215 GX226 GX237 GX248 GX259 GX270 39 | GX007 GX018 GX029 GX040 GX051 GX062 GX073 GX084 GX095 GX106 GX117 GX128 GX139 GX150 GX161 GX172 GX183 GX194 GX205 GX216 GX227 GX238 GX249 GX260 GX271 40 | GX008 GX019 GX030 GX041 GX052 GX063 GX074 GX085 GX096 GX107 GX118 GX129 GX140 GX151 GX162 GX173 GX184 GX195 GX206 GX217 GX228 GX239 GX250 GX261 GX272 41 | GX009 GX020 GX031 GX042 GX053 GX064 GX075 GX086 GX097 GX108 GX119 GX130 GX141 GX152 GX163 GX174 GX185 GX196 GX207 GX218 GX229 GX240 GX251 GX262 42 | GX010 GX021 GX032 GX043 GX054 GX065 GX076 GX087 GX098 GX109 GX120 GX131 GX142 GX153 GX164 GX175 GX186 GX197 GX208 GX219 GX230 GX241 GX252 GX263 43 | ``` 44 | 45 | Then, the workspace is mounted: 46 | 47 | ``` 48 | sudo mkdir /media/workspace 49 | sudo mount /dev/xvdg /media/workspace 50 | ``` 51 | 52 | The workspace will serve as the location for holding code, indexes, etc. It is a general purpose SSD EBS volume. This is where the IR-Reproducibility repo should reside. 
53 | 54 | -------------------------------------------------------------------------------- /eval/trec_eval.9.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/eval/trec_eval.9.0.tar.gz -------------------------------------------------------------------------------- /resources/CLEF/bg_sl.txt: -------------------------------------------------------------------------------- 1 | а 2 | автентичен 3 | аз 4 | ако 5 | ала 6 | бе 7 | без 8 | беше 9 | би 10 | бивш 11 | бивша 12 | бившо 13 | бил 14 | била 15 | били 16 | било 17 | благодаря 18 | близо 19 | бъдат 20 | бъде 21 | бяха 22 | в 23 | вас 24 | ваш 25 | ваша 26 | вероятно 27 | вече 28 | взема 29 | ви 30 | вие 31 | винаги 32 | внимава 33 | време 34 | все 35 | всеки 36 | всички 37 | всичко 38 | всяка 39 | във 40 | въпреки 41 | върху 42 | г 43 | ги 44 | главен 45 | главна 46 | главно 47 | глас 48 | го 49 | година 50 | години 51 | годишен 52 | д 53 | да 54 | дали 55 | два 56 | двама 57 | двамата 58 | две 59 | двете 60 | ден 61 | днес 62 | дни 63 | до 64 | добра 65 | добре 66 | добро 67 | добър 68 | докато 69 | докога 70 | дори 71 | досега 72 | доста 73 | друг 74 | друга 75 | други 76 | е 77 | евтин 78 | едва 79 | един 80 | една 81 | еднаква 82 | еднакви 83 | еднакъв 84 | едно 85 | екип 86 | ето 87 | живот 88 | за 89 | забавям 90 | зад 91 | заедно 92 | заради 93 | засега 94 | заспал 95 | затова 96 | защо 97 | защото 98 | и 99 | из 100 | или 101 | им 102 | има 103 | имат 104 | иска 105 | й 106 | каза 107 | как 108 | каква 109 | какво 110 | както 111 | какъв 112 | като 113 | кога 114 | когато 115 | което 116 | които 117 | кой 118 | който 119 | колко 120 | която 121 | къде 122 | където 123 | към 124 | лесен 125 | лесно 126 | ли 127 | лош 128 | м 129 | май 130 | малко 131 | ме 132 | между 133 | мек 134 | мен 135 | месец 136 | ми 137 | много 138 | мнозина 139 | мога 140 | могат 141 | може 
142 | мокър 143 | моля 144 | момента 145 | му 146 | н 147 | на 148 | над 149 | назад 150 | най 151 | направи 152 | напред 153 | например 154 | нас 155 | не 156 | него 157 | нещо 158 | нея 159 | ни 160 | ние 161 | никой 162 | нито 163 | нищо 164 | но 165 | нов 166 | нова 167 | нови 168 | новина 169 | някои 170 | някой 171 | няколко 172 | няма 173 | обаче 174 | около 175 | освен 176 | особено 177 | от 178 | отгоре 179 | отново 180 | още 181 | пак 182 | по 183 | повече 184 | повечето 185 | под 186 | поне 187 | поради 188 | после 189 | почти 190 | прави 191 | пред 192 | преди 193 | през 194 | при 195 | пък 196 | първата 197 | първи 198 | първо 199 | пъти 200 | равен 201 | равна 202 | с 203 | са 204 | сам 205 | само 206 | се 207 | сега 208 | си 209 | син 210 | скоро 211 | след 212 | следващ 213 | сме 214 | смях 215 | според 216 | сред 217 | срещу 218 | сте 219 | съм 220 | със 221 | също 222 | т 223 | тази 224 | така 225 | такива 226 | такъв 227 | там 228 | твой 229 | те 230 | тези 231 | ти 232 | т.н. 
233 | то 234 | това 235 | тогава 236 | този 237 | той 238 | толкова 239 | точно 240 | три 241 | трябва 242 | тук 243 | тъй 244 | тя 245 | тях 246 | у 247 | утре 248 | харесва 249 | хиляди 250 | ч 251 | часа 252 | че 253 | често 254 | чрез 255 | ще 256 | щом 257 | юмрук 258 | я 259 | як -------------------------------------------------------------------------------- /resources/CLEF/de_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | ab 3 | aber 4 | aber 5 | ach 6 | acht 7 | achte 8 | achten 9 | achter 10 | achtes 11 | ag 12 | alle 13 | allein 14 | allem 15 | allen 16 | aller 17 | allerdings 18 | alles 19 | allgemeinen 20 | als 21 | als 22 | also 23 | am 24 | an 25 | andere 26 | anderen 27 | andern 28 | anders 29 | au 30 | auch 31 | auch 32 | auf 33 | aus 34 | ausser 35 | außer 36 | ausserdem 37 | außerdem 38 | b 39 | bald 40 | bei 41 | beide 42 | beiden 43 | beim 44 | beispiel 45 | bekannt 46 | bereits 47 | besonders 48 | besser 49 | besten 50 | bin 51 | bis 52 | bisher 53 | bist 54 | c 55 | d 56 | da 57 | dabei 58 | dadurch 59 | dafür 60 | dagegen 61 | daher 62 | dahin 63 | dahinter 64 | damals 65 | damit 66 | danach 67 | daneben 68 | dank 69 | dann 70 | daran 71 | darauf 72 | daraus 73 | darf 74 | darfst 75 | darin 76 | darüber 77 | darum 78 | darunter 79 | das 80 | das 81 | dasein 82 | daselbst 83 | dass 84 | daß 85 | dasselbe 86 | davon 87 | davor 88 | dazu 89 | dazwischen 90 | dein 91 | deine 92 | deinem 93 | deiner 94 | dem 95 | dementsprechend 96 | demgegenüber 97 | demgemäss 98 | demgemäß 99 | demselben 100 | demzufolge 101 | den 102 | denen 103 | denn 104 | denn 105 | denselben 106 | der 107 | deren 108 | derjenige 109 | derjenigen 110 | dermassen 111 | dermaßen 112 | derselbe 113 | derselben 114 | des 115 | deshalb 116 | desselben 117 | dessen 118 | deswegen 119 | d.h 120 | dich 121 | die 122 | diejenige 123 | diejenigen 124 | dies 125 | diese 126 | dieselbe 127 | dieselben 128 | diesem 129 | diesen 130 | 
dieser 131 | dieses 132 | dir 133 | doch 134 | dort 135 | drei 136 | drin 137 | dritte 138 | dritten 139 | dritter 140 | drittes 141 | du 142 | durch 143 | durchaus 144 | dürfen 145 | dürft 146 | durfte 147 | durften 148 | e 149 | eben 150 | ebenso 151 | ehrlich 152 | ei 153 | ei, 154 | ei, 155 | eigen 156 | eigene 157 | eigenen 158 | eigener 159 | eigenes 160 | ein 161 | einander 162 | eine 163 | einem 164 | einen 165 | einer 166 | eines 167 | einige 168 | einigen 169 | einiger 170 | einiges 171 | einmal 172 | einmal 173 | eins 174 | elf 175 | en 176 | ende 177 | endlich 178 | entweder 179 | entweder 180 | er 181 | Ernst 182 | erst 183 | erste 184 | ersten 185 | erster 186 | erstes 187 | es 188 | etwa 189 | etwas 190 | euch 191 | f 192 | früher 193 | fünf 194 | fünfte 195 | fünften 196 | fünfter 197 | fünftes 198 | für 199 | g 200 | gab 201 | ganz 202 | ganze 203 | ganzen 204 | ganzer 205 | ganzes 206 | gar 207 | gedurft 208 | gegen 209 | gegenüber 210 | gehabt 211 | gehen 212 | geht 213 | gekannt 214 | gekonnt 215 | gemacht 216 | gemocht 217 | gemusst 218 | genug 219 | gerade 220 | gern 221 | gesagt 222 | gesagt 223 | geschweige 224 | gewesen 225 | gewollt 226 | geworden 227 | gibt 228 | ging 229 | gleich 230 | gott 231 | gross 232 | groß 233 | grosse 234 | große 235 | grossen 236 | großen 237 | grosser 238 | großer 239 | grosses 240 | großes 241 | gut 242 | gute 243 | guter 244 | gutes 245 | h 246 | habe 247 | haben 248 | habt 249 | hast 250 | hat 251 | hatte 252 | hätte 253 | hatten 254 | hätten 255 | heisst 256 | her 257 | heute 258 | hier 259 | hin 260 | hinter 261 | hoch 262 | i 263 | ich 264 | ihm 265 | ihn 266 | ihnen 267 | ihr 268 | ihre 269 | ihrem 270 | ihren 271 | ihrer 272 | ihres 273 | im 274 | im 275 | immer 276 | in 277 | in 278 | indem 279 | infolgedessen 280 | ins 281 | irgend 282 | ist 283 | j 284 | ja 285 | ja 286 | jahr 287 | jahre 288 | jahren 289 | je 290 | jede 291 | jedem 292 | jeden 293 | jeder 294 | jedermann 295 | jedermanns 296 | 
jedoch 297 | jemand 298 | jemandem 299 | jemanden 300 | jene 301 | jenem 302 | jenen 303 | jener 304 | jenes 305 | jetzt 306 | k 307 | kam 308 | kann 309 | kannst 310 | kaum 311 | kein 312 | keine 313 | keinem 314 | keinen 315 | keiner 316 | kleine 317 | kleinen 318 | kleiner 319 | kleines 320 | kommen 321 | kommt 322 | können 323 | könnt 324 | konnte 325 | könnte 326 | konnten 327 | kurz 328 | l 329 | lang 330 | lange 331 | lange 332 | leicht 333 | leide 334 | lieber 335 | los 336 | m 337 | machen 338 | macht 339 | machte 340 | mag 341 | magst 342 | mahn 343 | man 344 | manche 345 | manchem 346 | manchen 347 | mancher 348 | manches 349 | mann 350 | mehr 351 | mein 352 | meine 353 | meinem 354 | meinen 355 | meiner 356 | meines 357 | mensch 358 | menschen 359 | mich 360 | mir 361 | mit 362 | mittel 363 | mochte 364 | möchte 365 | mochten 366 | mögen 367 | möglich 368 | mögt 369 | morgen 370 | muss 371 | muß 372 | müssen 373 | musst 374 | müsst 375 | musste 376 | mussten 377 | n 378 | na 379 | nach 380 | nachdem 381 | nahm 382 | natürlich 383 | neben 384 | nein 385 | neue 386 | neuen 387 | neun 388 | neunte 389 | neunten 390 | neunter 391 | neuntes 392 | nicht 393 | nicht 394 | nichts 395 | nie 396 | niemand 397 | niemandem 398 | niemanden 399 | noch 400 | nun 401 | nun 402 | nur 403 | o 404 | ob 405 | ob 406 | oben 407 | oder 408 | oder 409 | offen 410 | oft 411 | oft 412 | ohne 413 | Ordnung 414 | p 415 | q 416 | r 417 | recht 418 | rechte 419 | rechten 420 | rechter 421 | rechtes 422 | richtig 423 | rund 424 | s 425 | sa 426 | sache 427 | sagt 428 | sagte 429 | sah 430 | satt 431 | schlecht 432 | Schluss 433 | schon 434 | sechs 435 | sechste 436 | sechsten 437 | sechster 438 | sechstes 439 | sehr 440 | sei 441 | sei 442 | seid 443 | seien 444 | sein 445 | seine 446 | seinem 447 | seinen 448 | seiner 449 | seines 450 | seit 451 | seitdem 452 | selbst 453 | selbst 454 | sich 455 | sie 456 | sieben 457 | siebente 458 | siebenten 459 | siebenter 460 | siebentes 461 | 
sind 462 | so 463 | solang 464 | solche 465 | solchem 466 | solchen 467 | solcher 468 | solches 469 | soll 470 | sollen 471 | sollte 472 | sollten 473 | sondern 474 | sonst 475 | sowie 476 | später 477 | statt 478 | t 479 | tag 480 | tage 481 | tagen 482 | tat 483 | teil 484 | tel 485 | tritt 486 | trotzdem 487 | tun 488 | u 489 | über 490 | überhaupt 491 | übrigens 492 | uhr 493 | um 494 | und 495 | und? 496 | uns 497 | unser 498 | unsere 499 | unserer 500 | unter 501 | v 502 | vergangenen 503 | viel 504 | viele 505 | vielem 506 | vielen 507 | vielleicht 508 | vier 509 | vierte 510 | vierten 511 | vierter 512 | viertes 513 | vom 514 | von 515 | vor 516 | w 517 | wahr? 518 | während 519 | währenddem 520 | währenddessen 521 | wann 522 | war 523 | wäre 524 | waren 525 | wart 526 | warum 527 | was 528 | wegen 529 | weil 530 | weit 531 | weiter 532 | weitere 533 | weiteren 534 | weiteres 535 | welche 536 | welchem 537 | welchen 538 | welcher 539 | welches 540 | wem 541 | wen 542 | wenig 543 | wenig 544 | wenige 545 | weniger 546 | weniges 547 | wenigstens 548 | wenn 549 | wenn 550 | wer 551 | werde 552 | werden 553 | werdet 554 | wessen 555 | wie 556 | wie 557 | wieder 558 | will 559 | willst 560 | wir 561 | wird 562 | wirklich 563 | wirst 564 | wo 565 | wohl 566 | wollen 567 | wollt 568 | wollte 569 | wollten 570 | worden 571 | wurde 572 | würde 573 | wurden 574 | würden 575 | x 576 | y 577 | z 578 | z.b 579 | zehn 580 | zehnte 581 | zehnten 582 | zehnter 583 | zehntes 584 | zeit 585 | zu 586 | zuerst 587 | zugleich 588 | zum 589 | zum 590 | zunächst 591 | zur 592 | zurück 593 | zusammen 594 | zwanzig 595 | zwar 596 | zwar 597 | zwei 598 | zweite 599 | zweiten 600 | zweiter 601 | zweites 602 | zwischen 603 | zwölf 604 | -------------------------------------------------------------------------------- /resources/CLEF/es_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | acuerdo 3 | adelante 4 | ademas 5 | además 6 | adrede 
7 | ahi 8 | ahí 9 | ahora 10 | al 11 | alli 12 | allí 13 | alrededor 14 | antano 15 | antaño 16 | ante 17 | antes 18 | apenas 19 | aproximadamente 20 | aquel 21 | aquél 22 | aquella 23 | aquélla 24 | aquellas 25 | aquéllas 26 | aquello 27 | aquellos 28 | aquéllos 29 | aqui 30 | aquí 31 | arribaabajo 32 | asi 33 | así 34 | aun 35 | aún 36 | aunque 37 | b 38 | bajo 39 | bastante 40 | bien 41 | breve 42 | c 43 | casi 44 | cerca 45 | claro 46 | como 47 | cómo 48 | con 49 | conmigo 50 | contigo 51 | contra 52 | cual 53 | cuál 54 | cuales 55 | cuáles 56 | cuando 57 | cuándo 58 | cuanta 59 | cuánta 60 | cuantas 61 | cuántas 62 | cuanto 63 | cuánto 64 | cuantos 65 | cuántos 66 | d 67 | de 68 | debajo 69 | del 70 | delante 71 | demasiado 72 | dentro 73 | deprisa 74 | desde 75 | despacio 76 | despues 77 | después 78 | detras 79 | detrás 80 | dia 81 | día 82 | dias 83 | días 84 | donde 85 | dónde 86 | dos 87 | durante 88 | e 89 | el 90 | él 91 | ella 92 | ellas 93 | ellos 94 | en 95 | encima 96 | enfrente 97 | enseguida 98 | entre 99 | es 100 | esa 101 | ésa 102 | esas 103 | ésas 104 | ese 105 | ése 106 | eso 107 | esos 108 | ésos 109 | esta 110 | está 111 | ésta 112 | estado 113 | estados 114 | estan 115 | están 116 | estar 117 | estas 118 | éstas 119 | este 120 | éste 121 | esto 122 | estos 123 | éstos 124 | ex 125 | excepto 126 | f 127 | final 128 | fue 129 | fuera 130 | fueron 131 | g 132 | general 133 | gran 134 | h 135 | ha 136 | habia 137 | había 138 | habla 139 | hablan 140 | hace 141 | hacia 142 | han 143 | hasta 144 | hay 145 | horas 146 | hoy 147 | i 148 | incluso 149 | informo 150 | informó 151 | j 152 | junto 153 | k 154 | l 155 | la 156 | lado 157 | las 158 | le 159 | lejos 160 | lo 161 | los 162 | luego 163 | m 164 | mal 165 | mas 166 | más 167 | mayor 168 | me 169 | medio 170 | mejor 171 | menos 172 | menudo 173 | mi 174 | mí 175 | mia 176 | mía 177 | mias 178 | mías 179 | mientras 180 | mio 181 | mío 182 | mios 183 | míos 184 | mis 185 | mismo 186 | mucho 187 
| muy 188 | n 189 | nada 190 | nadie 191 | ninguna 192 | no 193 | nos 194 | nosotras 195 | nosotros 196 | nuestra 197 | nuestras 198 | nuestro 199 | nuestros 200 | nueva 201 | nuevo 202 | nunca 203 | o 204 | os 205 | otra 206 | otros 207 | p 208 | pais 209 | paìs 210 | para 211 | parte 212 | pasado 213 | peor 214 | pero 215 | poco 216 | por 217 | porque 218 | pronto 219 | proximo 220 | próximo 221 | puede 222 | q 223 | qeu 224 | que 225 | qué 226 | quien 227 | quién 228 | quienes 229 | quiénes 230 | quiza 231 | quizá 232 | quizas 233 | quizás 234 | r 235 | raras 236 | repente 237 | s 238 | salvo 239 | se 240 | sé 241 | segun 242 | según 243 | ser 244 | sera 245 | será 246 | si 247 | sí 248 | sido 249 | siempre 250 | sin 251 | sobre 252 | solamente 253 | solo 254 | sólo 255 | son 256 | soyos 257 | su 258 | supuesto 259 | sus 260 | suya 261 | suyas 262 | suyo 263 | t 264 | tal 265 | tambien 266 | también 267 | tampoco 268 | tarde 269 | te 270 | temprano 271 | ti 272 | tiene 273 | todavia 274 | todavía 275 | todo 276 | todos 277 | tras 278 | tu 279 | tú 280 | tus 281 | tuya 282 | tuyas 283 | tuyo 284 | tuyos 285 | u 286 | un 287 | una 288 | unas 289 | uno 290 | unos 291 | usted 292 | ustedes 293 | v 294 | veces 295 | vez 296 | vosotras 297 | vosotros 298 | vuestra 299 | vuestras 300 | vuestro 301 | vuestros 302 | w 303 | x 304 | y 305 | ya 306 | yo 307 | z 308 | -------------------------------------------------------------------------------- /resources/CLEF/fa_sl.txt: -------------------------------------------------------------------------------- 1 | و 2 | در 3 | به 4 | از 5 | كه 6 | مي 7 | اين 8 | است 9 | را 10 | با 11 | هاي 12 | براي 13 | آن 14 | يك 15 | شود 16 | شده 17 | خود 18 | ها 19 | كرد 20 | شد 21 | اي 22 | تا 23 | كند 24 | بر 25 | بود 26 | گفت 27 | نيز 28 | وي 29 | هم 30 | كنند 31 | دارد 32 | ما 33 | كرده 34 | يا 35 | اما 36 | بايد 37 | دو 38 | اند 39 | هر 40 | خواهد 41 | او 42 | مورد 43 | آنها 44 | باشد 45 | ديگر 46 | مردم 47 | نمي 48 | بين 49 | پيش 50 | پس 
51 | اگر 52 | همه 53 | صورت 54 | يكي 55 | هستند 56 | بي 57 | من 58 | دهد 59 | هزار 60 | نيست 61 | استفاده 62 | داد 63 | داشته 64 | راه 65 | داشت 66 | چه 67 | همچنين 68 | كردند 69 | داده 70 | بوده 71 | دارند 72 | همين 73 | ميليون 74 | سوي 75 | شوند 76 | بيشتر 77 | بسيار 78 | روي 79 | گرفته 80 | هايي 81 | تواند 82 | اول 83 | نام 84 | هيچ 85 | چند 86 | جديد 87 | بيش 88 | شدن 89 | كردن 90 | كنيم 91 | نشان 92 | حتي 93 | اينكه 94 | ولی 95 | توسط 96 | چنين 97 | برخي 98 | نه 99 | ديروز 100 | دوم 101 | درباره 102 | بعد 103 | مختلف 104 | گيرد 105 | شما 106 | گفته 107 | آنان 108 | بار 109 | طور 110 | گرفت 111 | دهند 112 | گذاري 113 | بسياري 114 | طي 115 | بودند 116 | ميليارد 117 | بدون 118 | تمام 119 | كل 120 | تر 121 | براساس 122 | شدند 123 | ترين 124 | امروز 125 | باشند 126 | ندارد 127 | چون 128 | قابل 129 | گويد 130 | ديگري 131 | همان 132 | خواهند 133 | قبل 134 | آمده 135 | اكنون 136 | تحت 137 | طريق 138 | گيري 139 | جاي 140 | هنوز 141 | چرا 142 | البته 143 | كنيد 144 | سازي 145 | سوم 146 | كنم 147 | بلكه 148 | زير 149 | توانند 150 | ضمن 151 | فقط 152 | بودن 153 | حق 154 | آيد 155 | وقتي 156 | اش 157 | يابد 158 | نخستين 159 | مقابل 160 | خدمات 161 | امسال 162 | تاكنون 163 | مانند 164 | تازه 165 | آورد 166 | فكر 167 | آنچه 168 | نخست 169 | نشده 170 | شايد 171 | چهار 172 | جريان 173 | پنج 174 | ساخته 175 | زيرا 176 | نزديك 177 | برداري 178 | كسي 179 | ريزي 180 | رفت 181 | گردد 182 | مثل 183 | آمد 184 | ام 185 | بهترين 186 | دانست 187 | كمتر 188 | دادن 189 | تمامي 190 | جلوگيري 191 | بيشتري 192 | ايم 193 | ناشي 194 | چيزي 195 | آنكه 196 | بالا 197 | بنابراين 198 | ايشان 199 | بعضي 200 | دادند 201 | داشتند 202 | برخوردار 203 | نخواهد 204 | هنگام 205 | نبايد 206 | غير 207 | نبود 208 | ديده 209 | وگو 210 | داريم 211 | چگونه 212 | بندي 213 | خواست 214 | فوق 215 | ده 216 | نوعي 217 | هستيم 218 | ديگران 219 | همچنان 220 | سراسر 221 | ندارند 222 | گروهي 223 | سعي 224 | روزهاي 225 | آنجا 226 | يكديگر 227 | كردم 228 | بيست 229 | بروز 230 | سپس 231 | رفته 232 | آورده 233 | نمايد 234 | 
باشيم 235 | گويند 236 | زياد 237 | خويش 238 | همواره 239 | گذاشته 240 | شش 241 | نداشته 242 | شناسي 243 | خواهيم 244 | آباد 245 | داشتن 246 | نظير 247 | همچون 248 | باره 249 | نكرده 250 | شان 251 | سابق 252 | هفت 253 | دانند 254 | جايي 255 | بی 256 | جز 257 | زیرِ 258 | رویِ 259 | سریِ 260 | تویِ 261 | جلویِ 262 | پیشِ 263 | عقبِ 264 | بالایِ 265 | خارجِ 266 | وسطِ 267 | بیرونِ 268 | سویِ 269 | کنارِ 270 | پاعینِ 271 | نزدِ 272 | نزدیکِ 273 | دنبالِ 274 | حدودِ 275 | برابرِ 276 | طبقِ 277 | مانندِ 278 | ضدِّ 279 | هنگامِ 280 | برایِ 281 | مثلِ 282 | بارة 283 | اثرِ 284 | تولِ 285 | علّتِ 286 | سمتِ 287 | عنوانِ 288 | قصدِ 289 | روب 290 | جدا 291 | کی 292 | که 293 | چیست 294 | هست 295 | کجا 296 | کجاست 297 | کَی 298 | چطور 299 | کدام 300 | آیا 301 | مگر 302 | چندین 303 | یک 304 | چیزی 305 | دیگر 306 | کسی 307 | بعری 308 | هیچ 309 | چیز 310 | جا 311 | کس 312 | هرگز 313 | یا 314 | تنها 315 | بلکه 316 | خیاه 317 | بله 318 | بلی 319 | آره 320 | آری 321 | مرسی 322 | البتّه 323 | لطفاً 324 | ّه 325 | انکه 326 | وقتیکه 327 | همین 328 | پیش 329 | مدّتی 330 | هنگامی 331 | مان 332 | تان 333 | -------------------------------------------------------------------------------- /resources/CLEF/fr_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | à 3 | â 4 | abord 5 | afin 6 | ah 7 | ai 8 | aie 9 | ainsi 10 | allaient 11 | allo 12 | allô 13 | allons 14 | après 15 | assez 16 | attendu 17 | au 18 | aucun 19 | aucune 20 | aujourd 21 | aujourd'hui 22 | auquel 23 | aura 24 | auront 25 | aussi 26 | autre 27 | autres 28 | aux 29 | auxquelles 30 | auxquels 31 | avaient 32 | avais 33 | avait 34 | avant 35 | avec 36 | avoir 37 | ayant 38 | b 39 | bah 40 | beaucoup 41 | bien 42 | bigre 43 | boum 44 | bravo 45 | brrr 46 | c 47 | ça 48 | car 49 | ce 50 | ceci 51 | cela 52 | celle 53 | celle-ci 54 | celle-là 55 | celles 56 | celles-ci 57 | celles-là 58 | celui 59 | celui-ci 60 | celui-là 61 | cent 62 | cependant 63 | certain 64 | certaine 65 | 
certaines 66 | certains 67 | certes 68 | ces 69 | cet 70 | cette 71 | ceux 72 | ceux-ci 73 | ceux-là 74 | chacun 75 | chaque 76 | cher 77 | chère 78 | chères 79 | chers 80 | chez 81 | chiche 82 | chut 83 | ci 84 | cinq 85 | cinquantaine 86 | cinquante 87 | cinquantième 88 | cinquième 89 | clac 90 | clic 91 | combien 92 | comme 93 | comment 94 | compris 95 | concernant 96 | contre 97 | couic 98 | crac 99 | d 100 | da 101 | dans 102 | de 103 | debout 104 | dedans 105 | dehors 106 | delà 107 | depuis 108 | derrière 109 | des 110 | dès 111 | désormais 112 | desquelles 113 | desquels 114 | dessous 115 | dessus 116 | deux 117 | deuxième 118 | deuxièmement 119 | devant 120 | devers 121 | devra 122 | différent 123 | différente 124 | différentes 125 | différents 126 | dire 127 | divers 128 | diverse 129 | diverses 130 | dix 131 | dix-huit 132 | dixième 133 | dix-neuf 134 | dix-sept 135 | doit 136 | doivent 137 | donc 138 | dont 139 | douze 140 | douzième 141 | dring 142 | du 143 | duquel 144 | durant 145 | e 146 | effet 147 | eh 148 | elle 149 | elle-même 150 | elles 151 | elles-mêmes 152 | en 153 | encore 154 | entre 155 | envers 156 | environ 157 | es 158 | ès 159 | est 160 | et 161 | etant 162 | étaient 163 | étais 164 | était 165 | étant 166 | etc 167 | été 168 | etre 169 | être 170 | eu 171 | euh 172 | eux 173 | eux-mêmes 174 | excepté 175 | f 176 | façon 177 | fais 178 | faisaient 179 | faisant 180 | fait 181 | feront 182 | fi 183 | flac 184 | floc 185 | font 186 | g 187 | gens 188 | h 189 | ha 190 | hé 191 | hein 192 | hélas 193 | hem 194 | hep 195 | hi 196 | ho 197 | holà 198 | hop 199 | hormis 200 | hors 201 | hou 202 | houp 203 | hue 204 | hui 205 | huit 206 | huitième 207 | hum 208 | hurrah 209 | i 210 | il 211 | ils 212 | importe 213 | j 214 | je 215 | jusqu 216 | jusque 217 | k 218 | l 219 | la 220 | là 221 | laquelle 222 | las 223 | le 224 | lequel 225 | les 226 | lès 227 | lesquelles 228 | lesquels 229 | leur 230 | leurs 231 | longtemps 232 | lorsque 233 | 
lui 234 | lui-même 235 | m 236 | ma 237 | maint 238 | mais 239 | malgré 240 | me 241 | même 242 | mêmes 243 | merci 244 | mes 245 | mien 246 | mienne 247 | miennes 248 | miens 249 | mille 250 | mince 251 | moi 252 | moi-même 253 | moins 254 | mon 255 | moyennant 256 | n 257 | na 258 | ne 259 | néanmoins 260 | neuf 261 | neuvième 262 | ni 263 | nombreuses 264 | nombreux 265 | non 266 | nos 267 | notre 268 | nôtre 269 | nôtres 270 | nous 271 | nous-mêmes 272 | nul 273 | o 274 | o| 275 | ô 276 | oh 277 | ohé 278 | olé 279 | ollé 280 | on 281 | ont 282 | onze 283 | onzième 284 | ore 285 | ou 286 | où 287 | ouf 288 | ouias 289 | oust 290 | ouste 291 | outre 292 | p 293 | paf 294 | pan 295 | par 296 | parmi 297 | partant 298 | particulier 299 | particulière 300 | particulièrement 301 | pas 302 | passé 303 | pendant 304 | personne 305 | peu 306 | peut 307 | peuvent 308 | peux 309 | pff 310 | pfft 311 | pfut 312 | pif 313 | plein 314 | plouf 315 | plus 316 | plusieurs 317 | plutôt 318 | pouah 319 | pour 320 | pourquoi 321 | premier 322 | première 323 | premièrement 324 | près 325 | proche 326 | psitt 327 | puisque 328 | q 329 | qu 330 | quand 331 | quant 332 | quanta 333 | quant-à-soi 334 | quarante 335 | quatorze 336 | quatre 337 | quatre-vingt 338 | quatrième 339 | quatrièmement 340 | que 341 | quel 342 | quelconque 343 | quelle 344 | quelles 345 | quelque 346 | quelques 347 | quelqu'un 348 | quels 349 | qui 350 | quiconque 351 | quinze 352 | quoi 353 | quoique 354 | r 355 | revoici 356 | revoilà 357 | rien 358 | s 359 | sa 360 | sacrebleu 361 | sans 362 | sapristi 363 | sauf 364 | se 365 | seize 366 | selon 367 | sept 368 | septième 369 | sera 370 | seront 371 | ses 372 | si 373 | sien 374 | sienne 375 | siennes 376 | siens 377 | sinon 378 | six 379 | sixième 380 | soi 381 | soi-même 382 | soit 383 | soixante 384 | son 385 | sont 386 | sous 387 | stop 388 | suis 389 | suivant 390 | sur 391 | surtout 392 | t 393 | ta 394 | tac 395 | tant 396 | te 397 | té 398 | tel 399 | 
telle 400 | tellement 401 | telles 402 | tels 403 | tenant 404 | tes 405 | tic 406 | tien 407 | tienne 408 | tiennes 409 | tiens 410 | toc 411 | toi 412 | toi-même 413 | ton 414 | touchant 415 | toujours 416 | tous 417 | tout 418 | toute 419 | toutes 420 | treize 421 | trente 422 | très 423 | trois 424 | troisième 425 | troisièmement 426 | trop 427 | tsoin 428 | tsouin 429 | tu 430 | u 431 | un 432 | une 433 | unes 434 | uns 435 | v 436 | va 437 | vais 438 | vas 439 | vé 440 | vers 441 | via 442 | vif 443 | vifs 444 | vingt 445 | vivat 446 | vive 447 | vives 448 | vlan 449 | voici 450 | voilà 451 | vont 452 | vos 453 | votre 454 | vôtre 455 | vôtres 456 | vous 457 | vous-mêmes 458 | vu 459 | w 460 | x 461 | y 462 | z 463 | zut 464 | -------------------------------------------------------------------------------- /resources/CLEF/it_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | abbastanza 3 | accidenti 4 | ad 5 | adesso 6 | affinche 7 | agli 8 | ahime 9 | ahimè 10 | ai 11 | al 12 | alcuna 13 | alcuni 14 | alcuno 15 | all 16 | alla 17 | alle 18 | allo 19 | altri 20 | altrimenti 21 | altro 22 | altrui 23 | anche 24 | ancora 25 | anni 26 | anno 27 | ansa 28 | assai 29 | attesa 30 | avanti 31 | avendo 32 | avente 33 | aver 34 | avere 35 | avete 36 | aveva 37 | avuta 38 | avute 39 | avuti 40 | avuto 41 | basta 42 | bene 43 | benissimo 44 | berlusconi 45 | brava 46 | bravo 47 | c 48 | casa 49 | caso 50 | cento 51 | certa 52 | certe 53 | certi 54 | certo 55 | che 56 | chi 57 | chicchessia 58 | chiunque 59 | ci 60 | ciascuna 61 | ciascuno 62 | cima 63 | cio 64 | ciò 65 | cioe 66 | cioè 67 | circa 68 | citta 69 | città 70 | codesta 71 | codesti 72 | codesto 73 | cogli 74 | coi 75 | col 76 | colei 77 | coll 78 | coloro 79 | colui 80 | come 81 | con 82 | concernente 83 | consiglio 84 | contro 85 | cortesia 86 | cos 87 | cosa 88 | cosi 89 | così 90 | cui 91 | d 92 | da 93 | dagli 94 | dai 95 | dal 96 | dall 97 | dalla 98 | 
dalle 99 | dallo 100 | davanti 101 | degli 102 | dei 103 | del 104 | dell 105 | della 106 | delle 107 | dello 108 | dentro 109 | detto 110 | deve 111 | di 112 | dice 113 | dietro 114 | dire 115 | dirimpetto 116 | dopo 117 | dove 118 | dovra 119 | dovrà 120 | due 121 | dunque 122 | durante 123 | e 124 | è 125 | ecco 126 | ed 127 | egli 128 | ella 129 | eppure 130 | era 131 | erano 132 | esse 133 | essendo 134 | esser 135 | essere 136 | essi 137 | ex 138 | fa 139 | fare 140 | fatto 141 | favore 142 | fin 143 | finalmente 144 | finche 145 | fine 146 | fino 147 | forse 148 | fra 149 | fuori 150 | gia 151 | già 152 | giacche 153 | giorni 154 | giorno 155 | gli 156 | gliela 157 | gliele 158 | glieli 159 | glielo 160 | gliene 161 | governo 162 | grande 163 | grazie 164 | gruppo 165 | ha 166 | hai 167 | hanno 168 | ho 169 | i 170 | ieri 171 | il 172 | improvviso 173 | in 174 | infatti 175 | insieme 176 | intanto 177 | intorno 178 | invece 179 | io 180 | l 181 | la 182 | là 183 | lavoro 184 | le 185 | lei 186 | li 187 | lo 188 | lontano 189 | loro 190 | lui 191 | lungo 192 | ma 193 | macche 194 | magari 195 | mai 196 | male 197 | malgrado 198 | malissimo 199 | me 200 | medesimo 201 | mediante 202 | meglio 203 | meno 204 | mentre 205 | mesi 206 | mezzo 207 | mi 208 | mia 209 | mie 210 | miei 211 | mila 212 | miliardi 213 | milioni 214 | ministro 215 | mio 216 | moltissimo 217 | molto 218 | mondo 219 | nazionale 220 | ne 221 | negli 222 | nei 223 | nel 224 | nell 225 | nella 226 | nelle 227 | nello 228 | nemmeno 229 | neppure 230 | nessuna 231 | nessuno 232 | niente 233 | no 234 | noi 235 | non 236 | nondimeno 237 | nostra 238 | nostre 239 | nostri 240 | nostro 241 | nulla 242 | nuovo 243 | o 244 | od 245 | oggi 246 | ogni 247 | ognuna 248 | ognuno 249 | oltre 250 | oppure 251 | ora 252 | ore 253 | osi 254 | ossia 255 | paese 256 | parecchi 257 | parecchie 258 | parecchio 259 | parte 260 | partendo 261 | peccato 262 | peggio 263 | per 264 | perche 265 | perchè 266 | percio 
267 | perciò 268 | perfino 269 | pero 270 | però 271 | persone 272 | piedi 273 | pieno 274 | piglia 275 | piu 276 | più 277 | po 278 | pochissimo 279 | poco 280 | poi 281 | poiche 282 | press 283 | prima 284 | primo 285 | proprio 286 | puo 287 | può 288 | pure 289 | purtroppo 290 | qualche 291 | qualcuna 292 | qualcuno 293 | quale 294 | quali 295 | qualunque 296 | quando 297 | quanta 298 | quante 299 | quanti 300 | quanto 301 | quantunque 302 | quasi 303 | quattro 304 | quel 305 | quella 306 | quelli 307 | quello 308 | quest 309 | questa 310 | queste 311 | questi 312 | questo 313 | qui 314 | quindi 315 | riecco 316 | salvo 317 | sara 318 | sarà 319 | sarebbe 320 | scopo 321 | scorso 322 | se 323 | secondo 324 | seguente 325 | sei 326 | sempre 327 | senza 328 | si 329 | sia 330 | siamo 331 | siete 332 | solito 333 | solo 334 | sono 335 | sopra 336 | sotto 337 | sta 338 | staranno 339 | stata 340 | state 341 | stati 342 | stato 343 | stesso 344 | su 345 | sua 346 | successivo 347 | sue 348 | sugli 349 | sui 350 | sul 351 | sull 352 | sulla 353 | sulle 354 | sullo 355 | suo 356 | suoi 357 | tale 358 | talvolta 359 | tanto 360 | te 361 | tempo 362 | ti 363 | torino 364 | tra 365 | tranne 366 | tre 367 | troppo 368 | tu 369 | tua 370 | tue 371 | tuo 372 | tuoi 373 | tutta 374 | tuttavia 375 | tutte 376 | tutti 377 | tutto 378 | uguali 379 | un 380 | una 381 | uno 382 | uomo 383 | va 384 | vale 385 | varia 386 | varie 387 | vario 388 | verso 389 | vi 390 | via 391 | vicino 392 | visto 393 | vita 394 | voi 395 | volta 396 | vostra 397 | vostre 398 | vostri 399 | vostro 400 | -------------------------------------------------------------------------------- /resources/CLEF/nl_sl.txt: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | van 4 | ik 5 | te 6 | dat 7 | die 8 | in 9 | een 10 | hij 11 | het 12 | niet 13 | zijn 14 | is 15 | was 16 | op 17 | aan 18 | met 19 | als 20 | voor 21 | had 22 | er 23 | maar 24 | om 25 | hem 26 | 
dan 27 | zou 28 | of 29 | wat 30 | mijn 31 | men 32 | dit 33 | zo 34 | door 35 | over 36 | ze 37 | zich 38 | bij 39 | ook 40 | tot 41 | je 42 | mij 43 | uit 44 | der 45 | daar 46 | haar 47 | naar 48 | heb 49 | hoe 50 | heeft 51 | hebben 52 | deze 53 | u 54 | want 55 | nog 56 | zal 57 | me 58 | zij 59 | nu 60 | ge 61 | geen 62 | omdat 63 | iets 64 | worden 65 | toch 66 | al 67 | waren 68 | veel 69 | meer 70 | doen 71 | toen 72 | moet 73 | ben 74 | zonder 75 | kan 76 | hun 77 | dus 78 | alles 79 | onder 80 | ja 81 | eens 82 | hier 83 | wie 84 | werd 85 | altijd 86 | doch 87 | wordt 88 | wezen 89 | kunnen 90 | ons 91 | zelf 92 | tegen 93 | na 94 | reeds 95 | wil 96 | kon 97 | niets 98 | uw 99 | iemand 100 | geweest 101 | andere -------------------------------------------------------------------------------- /resources/CLEF/pt_sl.txt: -------------------------------------------------------------------------------- 1 | a 2 | à 3 | adeus 4 | agora 5 | aí 6 | ainda 7 | além 8 | algo 9 | algumas 10 | alguns 11 | ali 12 | ano 13 | anos 14 | antes 15 | ao 16 | aos 17 | apenas 18 | apoio 19 | após 20 | aquela 21 | aquelas 22 | aquele 23 | aqueles 24 | aqui 25 | aquilo 26 | área 27 | as 28 | às 29 | assim 30 | até 31 | atrás 32 | através 33 | baixo 34 | bastante 35 | bem 36 | bom 37 | breve 38 | cá 39 | cada 40 | catorze 41 | cedo 42 | cento 43 | certamente 44 | certeza 45 | cima 46 | cinco 47 | coisa 48 | com 49 | como 50 | conselho 51 | contra 52 | custa 53 | da 54 | dá 55 | dão 56 | daquela 57 | daquele 58 | dar 59 | das 60 | de 61 | debaixo 62 | demais 63 | dentro 64 | depois 65 | desde 66 | dessa 67 | desse 68 | desta 69 | deste 70 | deve 71 | deverá 72 | dez 73 | dezanove 74 | dezasseis 75 | dezassete 76 | dezoito 77 | dia 78 | diante 79 | diz 80 | dizem 81 | dizer 82 | do 83 | dois 84 | dos 85 | doze 86 | duas 87 | dúvida 88 | e 89 | é 90 | ela 91 | elas 92 | ele 93 | eles 94 | em 95 | embora 96 | entre 97 | era 98 | és 99 | essa 100 | essas 101 | esse 102 | esses 103 | 
esta 104 | está 105 | estar 106 | estas 107 | estás 108 | estava 109 | este 110 | estes 111 | esteve 112 | estive 113 | estivemos 114 | estiveram 115 | estiveste 116 | estivestes 117 | estou 118 | eu 119 | exemplo 120 | faço 121 | falta 122 | favor 123 | faz 124 | fazeis 125 | fazem 126 | fazemos 127 | fazer 128 | fazes 129 | fez 130 | fim 131 | final 132 | foi 133 | fomos 134 | for 135 | foram 136 | forma 137 | foste 138 | fostes 139 | fui 140 | geral 141 | grande 142 | grandes 143 | grupo 144 | há 145 | hoje 146 | horas 147 | isso 148 | isto 149 | já 150 | lá 151 | lado 152 | local 153 | logo 154 | longe 155 | lugar 156 | maior 157 | maioria 158 | mais 159 | mal 160 | mas 161 | máximo 162 | me 163 | meio 164 | menor 165 | menos 166 | mês 167 | meses 168 | meu 169 | meus 170 | mil 171 | minha 172 | minhas 173 | momento 174 | muito 175 | muitos 176 | na 177 | nada 178 | não 179 | naquela 180 | naquele 181 | nas 182 | nem 183 | nenhuma 184 | nessa 185 | nesse 186 | nesta 187 | neste 188 | nível 189 | no 190 | noite 191 | nome 192 | nos 193 | nós 194 | nossa 195 | nossas 196 | nosso 197 | nossos 198 | nova 199 | nove 200 | novo 201 | novos 202 | num 203 | numa 204 | número 205 | nunca 206 | o 207 | obra 208 | obrigada 209 | obrigado 210 | oitava 211 | oitavo 212 | oito 213 | onde 214 | ontem 215 | onze 216 | os 217 | ou 218 | outra 219 | outras 220 | outro 221 | outros 222 | para 223 | parece 224 | parte 225 | partir 226 | pela 227 | pelas 228 | pelo 229 | pelos 230 | perto 231 | pode 232 | pôde 233 | podem 234 | poder 235 | põe 236 | põem 237 | ponto 238 | pontos 239 | por 240 | porque 241 | porquê 242 | posição 243 | possível 244 | possivelmente 245 | posso 246 | pouca 247 | pouco 248 | primeira 249 | primeiro 250 | próprio 251 | próximo 252 | puderam 253 | qual 254 | quando 255 | quanto 256 | quarta 257 | quarto 258 | quatro 259 | que 260 | quê 261 | quem 262 | quer 263 | quero 264 | questão 265 | quinta 266 | quinto 267 | quinze 268 | relação 269 | sabe 270 | são 
271 | se 272 | segunda 273 | segundo 274 | sei 275 | seis 276 | sem 277 | sempre 278 | ser 279 | seria 280 | sete 281 | sétima 282 | sétimo 283 | seu 284 | seus 285 | sexta 286 | sexto 287 | sim 288 | sistema 289 | sob 290 | sobre 291 | sois 292 | somos 293 | sou 294 | sua 295 | suas 296 | tal 297 | talvez 298 | também 299 | tanto 300 | tão 301 | tarde 302 | te 303 | tem 304 | têm 305 | temos 306 | tendes 307 | tenho 308 | tens 309 | ter 310 | terceira 311 | terceiro 312 | teu 313 | teus 314 | teve 315 | tive 316 | tivemos 317 | tiveram 318 | tiveste 319 | tivestes 320 | toda 321 | todas 322 | todo 323 | todos 324 | trabalho 325 | três 326 | treze 327 | tu 328 | tua 329 | tuas 330 | tudo 331 | um 332 | uma 333 | umas 334 | uns 335 | vai 336 | vais 337 | vão 338 | vários 339 | vem 340 | vêm 341 | vens 342 | ver 343 | vez 344 | vezes 345 | viagem 346 | vindo 347 | vinte 348 | você 349 | vocês 350 | vos 351 | vós 352 | vossa 353 | vossas 354 | vosso 355 | vossos 356 | zero 357 | -------------------------------------------------------------------------------- /resources/CLEF/ru_sl.txt: -------------------------------------------------------------------------------- 1 | а 2 | е 3 | и 4 | ж 5 | м 6 | о 7 | на 8 | не 9 | ни 10 | об 11 | но 12 | он 13 | мне 14 | мои 15 | мож 16 | она 17 | они 18 | оно 19 | мной 20 | много 21 | многочисленное 22 | многочисленная 23 | многочисленные 24 | многочисленный 25 | мною 26 | мой 27 | мог 28 | могут 29 | можно 30 | может 31 | можхо 32 | мор 33 | моя 34 | моё 35 | мочь 36 | над 37 | нее 38 | оба 39 | нам 40 | нем 41 | нами 42 | ними 43 | мимо 44 | немного 45 | одной 46 | одного 47 | менее 48 | однажды 49 | однако 50 | меня 51 | нему 52 | меньше 53 | ней 54 | наверху 55 | него 56 | ниже 57 | мало 58 | надо 59 | один 60 | одиннадцать 61 | одиннадцатый 62 | назад 63 | наиболее 64 | недавно 65 | миллионов 66 | недалеко 67 | между 68 | низко 69 | меля 70 | нельзя 71 | нибудь 72 | непрерывно 73 | наконец 74 | никогда 75 | никуда 76 | нас 77 
| наш 78 | нет 79 | нею 80 | неё 81 | них 82 | мира 83 | наша 84 | наше 85 | наши 86 | ничего 87 | начала 88 | нередко 89 | несколько 90 | обычно 91 | опять 92 | около 93 | мы 94 | ну 95 | нх 96 | от 97 | отовсюду 98 | особенно 99 | нужно 100 | очень 101 | отсюда 102 | в 103 | во 104 | вон 105 | вниз 106 | внизу 107 | вокруг 108 | вот 109 | восемнадцать 110 | восемнадцатый 111 | восемь 112 | восьмой 113 | вверх 114 | вам 115 | вами 116 | важное 117 | важная 118 | важные 119 | важный 120 | вдали 121 | везде 122 | ведь 123 | вас 124 | ваш 125 | ваша 126 | ваше 127 | ваши 128 | впрочем 129 | весь 130 | вдруг 131 | вы 132 | все 133 | второй 134 | всем 135 | всеми 136 | времени 137 | время 138 | всему 139 | всего 140 | всегда 141 | всех 142 | всею 143 | всю 144 | вся 145 | всё 146 | всюду 147 | г 148 | год 149 | говорил 150 | говорит 151 | года 152 | году 153 | где 154 | да 155 | ее 156 | за 157 | из 158 | ли 159 | же 160 | им 161 | до 162 | по 163 | ими 164 | под 165 | иногда 166 | довольно 167 | именно 168 | долго 169 | позже 170 | более 171 | должно 172 | пожалуйста 173 | значит 174 | иметь 175 | больше 176 | пока 177 | ему 178 | имя 179 | пор 180 | пора 181 | потом 182 | потому 183 | после 184 | почему 185 | почти 186 | посреди 187 | ей 188 | два 189 | две 190 | двенадцать 191 | двенадцатый 192 | двадцать 193 | двадцатый 194 | двух 195 | его 196 | дел 197 | или 198 | без 199 | день 200 | занят 201 | занята 202 | занято 203 | заняты 204 | действительно 205 | давно 206 | девятнадцать 207 | девятнадцатый 208 | девять 209 | девятый 210 | даже 211 | алло 212 | жизнь 213 | далеко 214 | близко 215 | здесь 216 | дальше 217 | для 218 | лет 219 | зато 220 | даром 221 | первый 222 | перед 223 | затем 224 | зачем 225 | лишь 226 | десять 227 | десятый 228 | ею 229 | её 230 | их 231 | бы 232 | еще 233 | при 234 | был 235 | про 236 | процентов 237 | против 238 | просто 239 | бывает 240 | бывь 241 | если 242 | люди 243 | была 244 | были 245 | было 246 | будем 247 | будет 248 | 
будете 249 | будешь 250 | прекрасно 251 | буду 252 | будь 253 | будто 254 | будут 255 | ещё 256 | пятнадцать 257 | пятнадцатый 258 | друго 259 | другое 260 | другой 261 | другие 262 | другая 263 | других 264 | есть 265 | пять 266 | быть 267 | лучше 268 | пятый 269 | к 270 | ком 271 | конечно 272 | кому 273 | кого 274 | когда 275 | которой 276 | которого 277 | которая 278 | которые 279 | который 280 | которых 281 | кем 282 | каждое 283 | каждая 284 | каждые 285 | каждый 286 | кажется 287 | как 288 | какой 289 | какая 290 | кто 291 | кроме 292 | куда 293 | кругом 294 | с 295 | т 296 | у 297 | я 298 | та 299 | те 300 | уж 301 | со 302 | то 303 | том 304 | снова 305 | тому 306 | совсем 307 | того 308 | тогда 309 | тоже 310 | собой 311 | тобой 312 | собою 313 | тобою 314 | сначала 315 | только 316 | уметь 317 | тот 318 | тою 319 | хорошо 320 | хотеть 321 | хочешь 322 | хоть 323 | хотя 324 | свое 325 | свои 326 | твой 327 | своей 328 | своего 329 | своих 330 | свою 331 | твоя 332 | твоё 333 | раз 334 | уже 335 | сам 336 | там 337 | тем 338 | чем 339 | сама 340 | сами 341 | теми 342 | само 343 | рано 344 | самом 345 | самому 346 | самой 347 | самого 348 | семнадцать 349 | семнадцатый 350 | самим 351 | самими 352 | самих 353 | саму 354 | семь 355 | чему 356 | раньше 357 | сейчас 358 | чего 359 | сегодня 360 | себе 361 | тебе 362 | сеаой 363 | человек 364 | разве 365 | теперь 366 | себя 367 | тебя 368 | седьмой 369 | спасибо 370 | слишком 371 | так 372 | такое 373 | такой 374 | такие 375 | также 376 | такая 377 | сих 378 | тех 379 | чаще 380 | четвертый 381 | через 382 | часто 383 | шестой 384 | шестнадцать 385 | шестнадцатый 386 | шесть 387 | четыре 388 | четырнадцать 389 | четырнадцатый 390 | сколько 391 | сказал 392 | сказала 393 | сказать 394 | ту 395 | ты 396 | три 397 | эта 398 | эти 399 | что 400 | это 401 | чтоб 402 | этом 403 | этому 404 | этой 405 | этого 406 | чтобы 407 | этот 408 | стал 409 | туда 410 | этим 411 | этими 412 | рядом 413 | тринадцать 414 | 
тринадцатый 415 | этих 416 | третий 417 | тут 418 | эту 419 | суть 420 | чуть 421 | тысяч 422 | 423 | -------------------------------------------------------------------------------- /resources/CLEF/sv_sl.txt: -------------------------------------------------------------------------------- 1 | aderton 2 | adertonde 3 | adjö 4 | aldrig 5 | alla 6 | allas 7 | allt 8 | alltid 9 | alltså 10 | än 11 | andra 12 | andras 13 | annan 14 | annat 15 | ännu 16 | artonde 17 | artonn 18 | åtminstone 19 | att 20 | åtta 21 | åttio 22 | åttionde 23 | åttonde 24 | av 25 | även 26 | båda 27 | bådas 28 | bakom 29 | bara 30 | bäst 31 | bättre 32 | behöva 33 | behövas 34 | behövde 35 | behövt 36 | beslut 37 | beslutat 38 | beslutit 39 | bland 40 | blev 41 | bli 42 | blir 43 | blivit 44 | bort 45 | borta 46 | bra 47 | då 48 | dag 49 | dagar 50 | dagarna 51 | dagen 52 | där 53 | därför 54 | de 55 | del 56 | delen 57 | dem 58 | den 59 | deras 60 | dess 61 | det 62 | detta 63 | dig 64 | din 65 | dina 66 | dit 67 | ditt 68 | dock 69 | du 70 | efter 71 | eftersom 72 | elfte 73 | eller 74 | elva 75 | en 76 | enkel 77 | enkelt 78 | enkla 79 | enligt 80 | er 81 | era 82 | ert 83 | ett 84 | ettusen 85 | få 86 | fanns 87 | får 88 | fått 89 | fem 90 | femte 91 | femtio 92 | femtionde 93 | femton 94 | femtonde 95 | fick 96 | fin 97 | finnas 98 | finns 99 | fjärde 100 | fjorton 101 | fjortonde 102 | fler 103 | flera 104 | flesta 105 | följande 106 | för 107 | före 108 | förlåt 109 | förra 110 | första 111 | fram 112 | framför 113 | från 114 | fyra 115 | fyrtio 116 | fyrtionde 117 | gå 118 | gälla 119 | gäller 120 | gällt 121 | går 122 | gärna 123 | gått 124 | genast 125 | genom 126 | gick 127 | gjorde 128 | gjort 129 | god 130 | goda 131 | godare 132 | godast 133 | gör 134 | göra 135 | gott 136 | ha 137 | hade 138 | haft 139 | han 140 | hans 141 | har 142 | här 143 | heller 144 | hellre 145 | helst 146 | helt 147 | henne 148 | hennes 149 | hit 150 | hög 151 | höger 152 | högre 153 | högst 154 | hon 
155 | honom 156 | hundra 157 | hundraen 158 | hundraett 159 | hur 160 | i 161 | ibland 162 | idag 163 | igår 164 | igen 165 | imorgon 166 | in 167 | inför 168 | inga 169 | ingen 170 | ingenting 171 | inget 172 | innan 173 | inne 174 | inom 175 | inte 176 | inuti 177 | ja 178 | jag 179 | jämfört 180 | kan 181 | kanske 182 | knappast 183 | kom 184 | komma 185 | kommer 186 | kommit 187 | kr 188 | kunde 189 | kunna 190 | kunnat 191 | kvar 192 | länge 193 | längre 194 | långsam 195 | långsammare 196 | långsammast 197 | långsamt 198 | längst 199 | långt 200 | lätt 201 | lättare 202 | lättast 203 | legat 204 | ligga 205 | ligger 206 | lika 207 | likställd 208 | likställda 209 | lilla 210 | lite 211 | liten 212 | litet 213 | man 214 | många 215 | måste 216 | med 217 | mellan 218 | men 219 | mer 220 | mera 221 | mest 222 | mig 223 | min 224 | mina 225 | mindre 226 | minst 227 | mitt 228 | mittemot 229 | möjlig 230 | möjligen 231 | möjligt 232 | möjligtvis 233 | mot 234 | mycket 235 | någon 236 | någonting 237 | något 238 | några 239 | när 240 | nästa 241 | ned 242 | nederst 243 | nedersta 244 | nedre 245 | nej 246 | ner 247 | ni 248 | nio 249 | nionde 250 | nittio 251 | nittionde 252 | nitton 253 | nittonde 254 | nödvändig 255 | nödvändiga 256 | nödvändigt 257 | nödvändigtvis 258 | nog 259 | noll 260 | nr 261 | nu 262 | nummer 263 | och 264 | också 265 | ofta 266 | oftast 267 | olika 268 | olikt 269 | om 270 | oss 271 | över 272 | övermorgon 273 | överst 274 | övre 275 | på 276 | rakt 277 | rätt 278 | redan 279 | så 280 | sade 281 | säga 282 | säger 283 | sagt 284 | samma 285 | sämre 286 | sämst 287 | sedan 288 | senare 289 | senast 290 | sent 291 | sex 292 | sextio 293 | sextionde 294 | sexton 295 | sextonde 296 | sig 297 | sin 298 | sina 299 | sist 300 | sista 301 | siste 302 | sitt 303 | sjätte 304 | sju 305 | sjunde 306 | sjuttio 307 | sjuttionde 308 | sjutton 309 | sjuttonde 310 | ska 311 | skall 312 | skulle 313 | slutligen 314 | små 315 | smått 316 | snart 317 | som 
318 | stor 319 | stora 320 | större 321 | störst 322 | stort 323 | tack 324 | tidig 325 | tidigare 326 | tidigast 327 | tidigt 328 | till 329 | tills 330 | tillsammans 331 | tio 332 | tionde 333 | tjugo 334 | tjugoen 335 | tjugoett 336 | tjugonde 337 | tjugotre 338 | tjugotvå 339 | tjungo 340 | tolfte 341 | tolv 342 | tre 343 | tredje 344 | trettio 345 | trettionde 346 | tretton 347 | trettonde 348 | två 349 | tvåhundra 350 | under 351 | upp 352 | ur 353 | ursäkt 354 | ut 355 | utan 356 | utanför 357 | ute 358 | vad 359 | vänster 360 | vänstra 361 | var 362 | vår 363 | vara 364 | våra 365 | varför 366 | varifrån 367 | varit 368 | varken 369 | värre 370 | varsågod 371 | vart 372 | vårt 373 | vem 374 | vems 375 | verkligen 376 | vi 377 | vid 378 | vidare 379 | viktig 380 | viktigare 381 | viktigast 382 | viktigt 383 | vilka 384 | vilken 385 | vilket 386 | vill 387 | -------------------------------------------------------------------------------- /results/CLEF/indri/info.txt: -------------------------------------------------------------------------------- 1 | Results are not available on-line, you need to compute them with trec_eval. 
You can find the resulting runs at: https://github.com/gmdn/IR-reproducibiliy-grium/tree/master/results -------------------------------------------------------------------------------- /results/CLEF/lucene/info.txt: -------------------------------------------------------------------------------- 1 | You can find trec_eval results at: https://github.com/dibuccio/IR-Reproducibility/tree/master/results/CLEF/lucene -------------------------------------------------------------------------------- /results/CLEF/terrier/info.txt: -------------------------------------------------------------------------------- 1 | You can find trec_eval results at: https://github.com/mmaistro/IR-Reproducibility/tree/mmaistro/results/CLEF/terrier -------------------------------------------------------------------------------- /runs/CLEF/indri/info.txt: -------------------------------------------------------------------------------- 1 | You can find the resulting runs at: https://github.com/gmdn/IR-reproducibiliy-grium/tree/master/results -------------------------------------------------------------------------------- /runs/CLEF/lucene/info.txt: -------------------------------------------------------------------------------- 1 | You can find the resulting runs at: https://github.com/dibuccio/IR-Reproducibility/tree/master/runs/CLEF/lucene -------------------------------------------------------------------------------- /runs/CLEF/terrier/info.txt: -------------------------------------------------------------------------------- 1 | You can find the resulting runs at: https://github.com/mmaistro/IR-Reproducibility/tree/mmaistro/runs/CLEF/terrier -------------------------------------------------------------------------------- /systems/ATIRE/README.md: -------------------------------------------------------------------------------- 1 | ATIRE 2 | ===== 3 | 4 | The ATIRE makefile has a set of options that have been already preselected to provide good performance for indexing. 
5 | 6 | The script `dotgov2.sh` will clone the required repositories, build the system, index and search all the query sets. 7 | 8 | Indexing 9 | -------- 10 | 11 | The first two arguments to the indexer simply specify a progress output every `n` documents (`-N`, in this case one million), and to print statistics at the conclusion of the indexing. 12 | 13 | The next argument specifies the format of the documents. In this case it has been set to recursively find and parse documents according to the TREC format; treating everything between `` and ``, inclusive, as indexable content. The content between `` and `` contains the document id. The exception to this is terms that appear in `SGML` tags, of which only the tag itself is stored uppercased to allow for focussed retrieval. A term is defined as either a sequence of alpha characters, or a sequence of numeric characters, the definitions of which come from Unicode version 6.0. 14 | 15 | As the .gov2 collection is supposed to contain only ASCII data, the `-iscrub:an` option is specified. This option will replace the `NUL` character, and other non-ASCII characters with a space. This prevents malformed data getting into the index. No other content filtering other than this is applied. 16 | 17 | Finally, the indexer uses an s-stripping stemmer. 18 | 19 | The script generates two indexes using these options, the first is a quantized index, which pre-calculates the retrieval scores and stores them in the index rather than having to be calculated at search time. The `-Q` parameter identifies the ranking function, and the `-q` parameter stores the quantized values. The second index is missing these parameters as it is an unquantized index. 20 | 21 | Finally, there are a number of arguments to the indexer. These determine the locations for the recursive searching to take place. In the case of the gov2 script, each sub-folder of the gov2 collection is recursively searched for files matching the pattern `*.gz`. 
The number of arguments to the indexer determines one of the degrees of parallelism, the other is static. This combination has been shown empirically to perform better than other combinations. 22 | 23 | Searching 24 | --------- 25 | 26 | The search is performed to completion for both indexes, for all query sets. A run file is generated for each query set, with the top-1000 results presented, which can be evaluated using traditional TREC tools. 27 | 28 | The first search command is targeted at high efficiency. To do this it uses the quantized index (specified with `-findex `), which sets the ranking function internally. 29 | 30 | Counterintuitively, the top-k parameter is not used for the speed baseline, as our experiments suggest that at high values of `k` this has an adverse effect on speed. The second argument to the efficiency baseline is the `-M` flag, which tells the search program to load the entire index into memory at startup. 31 | 32 | The second search command is targeted at high effectiveness. It uses the default index filename (`index.aspt`), which is generated by the non-quantized indexer. 33 | 34 | The ranking function for the second search is left to the default BM25 implementation with ATIRE, with the default parameters of `k1=0.9, b=0.4`. 35 | 36 | The second parameter, `-Qr`, to the effectiveness search specifies to use Rocchio blind relevance feedback. There are optional parameters to this argument to specify the number of documents to analyse, and the number of terms to extract. These are left at the defaults of `17` documents, and `5` terms. 37 | 38 | The remainder of the arguments are shared among both searches. These arguments specify to print statistics (`-sa`), and set up the query type (`-QN:t` -- `` fields from a TREC topic file), the file containing the queries (`-q <filepath>`), and options related to generating the run file. 
Each run file is named `atire.<query set>.<speed|precision>.txt`, and the search statistics are redirected to `<query set>.<speed|precision>.search_stats.txt`. 39 | -------------------------------------------------------------------------------- /systems/ATIRE/cw09.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | source setup.sh 6 | 7 | CW09B_FILES=$(find $CW09B_LOCATION -mindepth 1 -maxdepth 1 -type d -printf '%p/*.warc.gz ') 8 | 9 | BASE_INDEX="stdbuf -oL ./bin/index -N1000000 -sa -rrwarcgz -iscrub:un -kt" 10 | ${BASE_INDEX} -findex cw09_index.aspt ${CW09B_FILES[@]} | tee cw09_indexing.txt 11 | ${BASE_INDEX} -QBM25 -q -findex cw09_quantized.aspt ${CW09B_FILES[@]} | tee cw09_quantized.indexing.txt 12 | 13 | for index in "cw09_index.aspt" "cw09_quantized.aspt" 14 | do 15 | for queries in "1-50" 16 | do 17 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 18 | qrel_file=../$TOPICS_QRELS/prels.web.${queries}.txt 19 | stat_file=${index}.${queries}.search_stats.txt 20 | run_file=atire.${index}.${queries}.txt 21 | eval_file=eval.${index}.${queries}.txt 22 | 23 | echo "Searching queries ${queries} on index ${index}" 24 | ./bin/atire -findex ${index} -sa -QN:q -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 25 | ../${SAP_EVAL} ${qrel_file} ${run_file} > ${eval_file} 26 | done 27 | done 28 | 29 | for index in "cw09_index.aspt" "cw09_quantized.aspt" 30 | do 31 | for queries in "51-100" "101-150" "151-200" 32 | do 33 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 34 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 35 | stat_file=${index}.${queries}.search_stats.txt 36 | run_file=atire.${index}.${queries}.txt 37 | eval_file=eval.${index}.${queries}.txt 38 | 39 | echo "Searching queries ${queries} on index ${index}" 40 | ./bin/atire -findex ${index} -sa -QN:q -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 41 | 
../${TREC_EVAL} ${qrel_file} ${run_file} > ${eval_file} 42 | ../${GD_EVAL} -c -traditional ${qrel_file} ${run_file} >> ${eval_file} 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /systems/ATIRE/cw12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ../common.sh 5 | source setup.sh 6 | 7 | set +o noglob 8 | CW12B_FILES=$(find $CW12B_LOCATION/ClueWeb* -mindepth 1 -maxdepth 1 -type d -printf '%p/*.gz ') 9 | set -o noglob 10 | 11 | BASE_INDEX="stdbuf -oL ./bin/index -N1000000 -sa -rrwarcgz -iscrub:un -kt" 12 | ${BASE_INDEX} -findex cw12_index.aspt ${CW12B_FILES[@]} | tee cw12_indexing.txt 13 | ${BASE_INDEX} -QBM25 -q -findex cw12_quantized.aspt ${CW12B_FILES[@]} | tee cw12_quantized.indexing.txt 14 | 15 | for index in "cw12_index.aspt" "cw12_quantized.aspt" 16 | do 17 | for queries in "201-250" "251-300" 18 | do 19 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 20 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 21 | stat_file=${index}.${queries}.search_stats.txt 22 | run_file=atire.${index}.${queries}.txt 23 | eval_file=eval.${index}.${queries}.txt 24 | 25 | echo "Searching queries ${queries} on index ${index}" 26 | ./bin/atire -findex ${index} -sa -QN:q -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 27 | ../${TREC_EVAL} ${qrel_file} ${run_file} > ${eval_file} 28 | ../${GD_EVAL} -c -traditional ${qrel_file} ${run_file} >> ${eval_file} 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /systems/ATIRE/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | source setup.sh 6 | 7 | GOV2_FILES=$(find $GOV2_LOCATION -mindepth 1 -maxdepth 1 -type d -name 'GX*' -printf '%p/*.gz ') 8 | 9 | BASE_INDEX="stdbuf -oL ./bin/index -N1000000 -sa -rrtrec -iscrub:an 
-ts -kt" 10 | ${BASE_INDEX} -findex dg2_index.aspt ${GOV2_FILES[@]} | tee dg2_indexing.txt 11 | ${BASE_INDEX} -QBM25 -q -findex dg2_quantized.aspt ${GOV2_FILES[@]} | tee dg2_quantized.indexing.txt 12 | 13 | for index in "dg2_index.aspt" "dg2_quantized.aspt" 14 | do 15 | for queries in "701-750" "751-800" "801-850" 16 | do 17 | query_file=../$TOPICS_QRELS/topics.${queries}.txt 18 | qrel_file=../$TOPICS_QRELS/qrels.${queries}.txt 19 | stat_file=${index}.${queries}.search_stats.txt 20 | run_file=atire.${index}.${queries}.txt 21 | eval_file=eval.${index}.${queries}.txt 22 | 23 | echo "Searching queries ${queries} on index ${index}" 24 | ./bin/atire -findex ${index} -sa -QN:t -k1000 -q ${query_file} -et -l1000 -o${run_file} -iatire > ${stat_file} 25 | ../$TREC_EVAL ${qrel_file} ${run_file} > ${eval_file} 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /systems/ATIRE/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ ! -d "atire" ]]; then 4 | hg clone http://www.atire.org/hg/atire -r f3102a7a5848 5 | fi 6 | 7 | cd atire 8 | 9 | make clean all 10 | -------------------------------------------------------------------------------- /systems/JASS/cw09.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ../common.sh 4 | source setup.sh 5 | 6 | if [[ ! 
-f ../${ATIRE_DIR}/cw09_quantized.aspt ]]; then 7 | echo "Must have built an ATIRE quantized index" 8 | echo "Looked for: ../${ATIRE_DIR}/cw09_quantized.aspt" 9 | exit 10 | fi 11 | 12 | START=$(date +%s) 13 | ./atire_to_jass_index ../${ATIRE_DIR}/cw09_quantized.aspt -Q 14 | END=$(date +%s) 15 | 16 | echo "'Indexing' took:" $((END - START)) "seconds" 17 | 18 | for queries in "51-100" "101-150" "151-200" 19 | do 20 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 21 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 22 | stat_file=jass.${queries}.search_stats.txt 23 | run_file=${queries}.txt 24 | eval_file=eval.${queries}.txt 25 | 26 | ./trec2query/trec2query ${query_file} q -s s > ${queries}.txt 27 | 28 | echo "Searching queries ${queries} to 1B postings" 29 | ./jass ${queries}.txt 1000 1000000000 -d > comp.${stat_file} 30 | mv ranking.txt comp.jass.${run_file} 31 | ../$TREC_EVAL ${qrel_file} comp.jass.${run_file} > comp.${eval_file} 32 | 33 | echo "Searching queries ${queries} to 5M postings" 34 | ./jass ${queries}.txt 1000 5000000 -d > heur.${stat_file} 35 | mv ranking.txt heur.jass.${run_file} 36 | ../$TREC_EVAL ${qrel_file} heur.jass.${run_file} > heur.${eval_file} 37 | done 38 | -------------------------------------------------------------------------------- /systems/JASS/cw12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ../common.sh 4 | source setup.sh 5 | 6 | if [[ !
-f ../${ATIRE_DIR}/cw12_quantized.aspt ]]; then 7 | echo "Must have built an ATIRE quantized index" 8 | echo "Looked for: ../${ATIRE_DIR}/cw12_quantized.aspt" 9 | exit 10 | fi 11 | 12 | START=$(date +%s) 13 | ./atire_to_jass_index ../${ATIRE_DIR}/cw12_quantized.aspt -Q 14 | END=$(date +%s) 15 | 16 | echo "'Indexing' took:" $((END - START)) "seconds" 17 | 18 | for queries in "201-250" "251-300" 19 | do 20 | query_file=../$TOPICS_QRELS/topics.web.${queries}.txt 21 | qrel_file=../$TOPICS_QRELS/qrels.web.${queries}.txt 22 | stat_file=jass.${queries}.search_stats.txt 23 | run_file=${queries}.txt 24 | eval_file=eval.${queries}.txt 25 | 26 | ./trec2query/trec2query ${query_file} q > ${queries}.txt 27 | 28 | echo "Searching queries ${queries} to 1B postings" 29 | ./jass ${queries}.txt 1000 1000000000 -d > comp.${stat_file} 30 | mv ranking.txt comp.jass.${run_file} 31 | ../$TREC_EVAL ${qrel_file} comp.jass.${run_file} > comp.${eval_file} 32 | 33 | echo "Searching queries ${queries} to 5m postings" 34 | ./jass ${queries}.txt 1000 5000000 -d > heur.${stat_file} 35 | mv ranking.txt heur.jass.${run_file} 36 | ../$TREC_EVAL ${qrel_file} heur.jass.${run_file} > heur.${eval_file} 37 | done 38 | -------------------------------------------------------------------------------- /systems/JASS/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ../common.sh 4 | source setup.sh 5 | 6 | if [[ !
-f ../${ATIRE_DIR}/dg2_quantized.aspt ]]; then 7 | echo "Must have built an ATIRE quantized index" 8 | echo "Looked for: ../${ATIRE_DIR}/dg2_quantized.aspt" 9 | exit 10 | fi 11 | 12 | START=$(date +%s) 13 | ./atire_to_jass_index ../${ATIRE_DIR}/dg2_quantized.aspt -Q 14 | END=$(date +%s) 15 | 16 | echo "'Indexing' took:" $((END - START)) "seconds" 17 | 18 | for queries in "701-750" "751-800" "801-850" 19 | do 20 | query_file=../$TOPICS_QRELS/topics.${queries}.txt 21 | qrel_file=../$TOPICS_QRELS/qrels.${queries}.txt 22 | stat_file=jass.${queries}.search_stats.txt 23 | run_file=${queries}.txt 24 | eval_file=eval.${queries}.txt 25 | 26 | ./trec2query/trec2query ${query_file} t -s s >| ${queries}.txt 27 | 28 | echo "Searching queries ${queries} to 1B postings" 29 | ./jass ${queries}.txt 1000 1000000000 -d >| comp.${stat_file} 30 | mv ranking.txt comp.jass.${run_file} 31 | ../$TREC_EVAL ${qrel_file} comp.jass.${run_file} >| comp.${eval_file} 32 | 33 | echo "Searching queries ${queries} to 2.5M postings" 34 | ./jass ${queries}.txt 1000 2500000 -d >| heur.${stat_file} 35 | mv ranking.txt heur.jass.${run_file} 36 | ../$TREC_EVAL ${qrel_file} heur.jass.${run_file} >| heur.${eval_file} 37 | done 38 | -------------------------------------------------------------------------------- /systems/JASS/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ATIRE_DIR="../ATIRE/atire" 4 | 5 | if [[ ! -d ${ATIRE_DIR} ]]; then 6 | echo "ATIRE is a prerequisite for JASS" 7 | exit 8 | fi 9 | 10 | if [[ !
-d JASS ]]; then 11 | git clone https://github.com/lintool/JASS.git 12 | git -C JASS checkout -q b27b319 13 | fi 14 | 15 | cd JASS 16 | make ATIRE_DIR=../${ATIRE_DIR} 17 | make -C trec2query ATIRE_DIR=../../${ATIRE_DIR} 18 | -------------------------------------------------------------------------------- /systems/MG4J/README.md: -------------------------------------------------------------------------------- 1 | Scripts 2 | ======= 3 | 4 | For each collection, there is an -index.sh and an -index-pos.sh script 5 | that will build a non-positional index and a positional index, respectively. 6 | The scripts will print on standard output the construction time. All 7 | scripts use parallel instances and log in a number of *.err files what 8 | is happening in each parallel instance. 9 | 10 | For each collection, there is an -eval.sh script that uses a non-positional 11 | index and Model B, an -eval-pos.sh script that uses a positional index and Model B+, 12 | and finally a -bm25.sh that performs a baseline BM25 run. Each script saves 13 | in eval.$queries.txt the results of evaluation and in time.$queries.txt the 14 | overall query time in milliseconds. 15 | 16 | Size 17 | ==== 18 | 19 | A non-positional index is formed by the .properties file, the .titles 20 | file, the .pointers[offsets] files, the .counts[offsets] files, the .sizes 21 | file and the .termmap file. A positional index in addition uses the 22 | .positions[offsets] files. 23 | 24 | Metadata 25 | ======== 26 | 27 | All metadata is contained in the .properties file (which is a standard, 28 | self-describing Java property file). 
29 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-bm25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Plain BM25 on disjunctive queries (mainly as a baseline) 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "201-250" "251-300" 13 | do 14 | topics=$TOPICS_QRELS/topics.web.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.web.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 20 | fgrep "<query>" $topics | sed 's/<.\?query>//g;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(a\|the\|in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(sed -e 's/[ ]\+/|/g' <titles.$queries.txt | awk "BEGIN {i = ${queries%-*} } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" ) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/cw12-text -T $WORK_DIR/cw12.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-eval-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n *positional* queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 
11 | 12 | for queries in "201-250" "251-300" 13 | do 14 | topics=$TOPICS_QRELS/topics.web.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.web.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 20 | fgrep "<query>" $topics | sed 's/<.\?query>//g;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U\\.S\\./US/g;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueriespos.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/cw12-text -T $WORK_DIR/cw12.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "201-250" "251-300" 13 | do 14 | topics=$TOPICS_QRELS/topics.web.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.web.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 
20 | fgrep "<query>" $topics | sed 's/<.\?query>//g;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueries.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/cw12-text -T $WORK_DIR/cw12.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-index-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #sudo apt-add-repository -y ppa:webupd8team/java 5 | #sudo apt-get -y update 6 | #sudo apt-get -y install oracle-java8-installer 7 | #sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! 
-f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/cw12.titles $WORK_DIR/cw12-text.* $WORK_DIR/cw12-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $CW12B_LOCATION -iname \*.gz -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.WarcDocumentSequence \ 40 | -z -f it.unimi.di.big.mg4j.document.HtmlDocumentFactory -p encoding=iso-8859-1 $WORK_DIR/cw12-$split.sequence $(cat $split) 41 | 42 | # Do not check version. Use BURL to sanitize non-conformant URLs. 43 | 44 | java -Xmx7512M -server -Dit.unimi.di.law.warc.io.version=false -Dit.unimi.di.law.warc.records.useburl=true \ 45 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/cw12-$split.sequence -t EnglishStemmer -I text $WORK_DIR/cw12-$split >$split.out 2>$split.err 46 | 47 | )& 48 | 49 | done 50 | 51 | wait) 52 | 53 | # Check that all instances have completed 54 | 55 | if (( $(find -iname cw12-split-\*-text.cluster.properties | wc -l) != 16 )); then 56 | echo "ERROR: Some instance did not complete correctly" 1>&2 57 | exit 1 58 | fi 59 | 60 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate $WORK_DIR/cw12-text \ 61 | $(find $WORK_DIR -iname cw12-split-\*-text@\*.sizes | sort | sed s/.sizes//) 62 | cat $(find $WORK_DIR -iname cw12-split-\*.titles | sort) >$WORK_DIR/cw12.titles 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/cw12-text.mwhc $WORK_DIR/cw12-text.terms 65 | 66 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/cw12-text.mwhc 
$WORK_DIR/cw12-text.termmap 67 | 68 | 69 | endtime=$(date +%s) 70 | 71 | echo "Indexing time: $((endtime-starttime))s" 72 | -------------------------------------------------------------------------------- /systems/MG4J/cw12-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #sudo apt-add-repository -y ppa:webupd8team/java 5 | #sudo apt-get -y update 6 | #sudo apt-get -y install oracle-java8-installer 7 | #sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! -f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/cw12.titles $WORK_DIR/cw12-text.* $WORK_DIR/cw12-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $CW12B_LOCATION -iname \*.gz -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.WarcDocumentSequence \ 40 | -z -f it.unimi.di.big.mg4j.document.HtmlDocumentFactory -p encoding=iso-8859-1 $WORK_DIR/cw12-$split.sequence $(cat $split) 41 | 42 | # Do not check version. Use BURL to sanitize non-conformant URLs. 
43 | 44 | java -Xmx7512M -server -Dit.unimi.di.law.warc.io.version=false -Dit.unimi.di.law.warc.records.useburl=true \ 45 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/cw12-$split.sequence -t EnglishStemmer -I text -c COUNTS $WORK_DIR/cw12-$split >$split.out 2>$split.err 46 | 47 | )& 48 | 49 | done 50 | 51 | wait) 52 | 53 | # Check that all instances have completed 54 | 55 | if (( $(find -iname cw12-split-\*-text.cluster.properties | wc -l) != 16 )); then 56 | echo "ERROR: Some instance did not complete correctly" 1>&2 57 | exit 1 58 | fi 59 | 60 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate -c POSITIONS:NONE $WORK_DIR/cw12-text \ 61 | $(find $WORK_DIR -iname cw12-split-\*-text@\*.sizes | sort | sed s/.sizes//) 62 | cat $(find $WORK_DIR -iname cw12-split-\*.titles | sort) >$WORK_DIR/cw12.titles 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/cw12-text.mwhc $WORK_DIR/cw12-text.terms 65 | 66 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/cw12-text.mwhc $WORK_DIR/cw12-text.termmap 67 | 68 | 69 | endtime=$(date +%s) 70 | 71 | echo "Indexing time: $((endtime-starttime))s" 72 | -------------------------------------------------------------------------------- /systems/MG4J/genqueries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./gensubsets.rb | awk "BEGIN {i = $1 } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" 4 | -------------------------------------------------------------------------------- /systems/MG4J/genqueriespos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./gensubsetspos.rb | awk "BEGIN {i = $1 } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" 4 | -------------------------------------------------------------------------------- /systems/MG4J/gensubsets.rb: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | $stdin.each_line do |l| 4 | a = [] 5 | l.scan(/\w+/) do |w| 6 | if w == "us" then w = "\"u s\"" end 7 | a << w 8 | end 9 | 10 | andthen = [] 11 | (a.length+1).times do |n| 12 | disj = [] 13 | if n == 0 then next end 14 | (2**(a.length)).times do |x| 15 | if x == 0 then next end 16 | item = [] 17 | a.length.times do |index| 18 | if ( x & (2**index) != 0 ) then item << a[index] end 19 | end 20 | 21 | if item.length == n then disj << ( "(" + item.join(" & " ) + ")" ) end 22 | end 23 | andthen << disj.join( " | " ) 24 | end 25 | 26 | puts andthen.reverse.join( ", " ); 27 | end 28 | -------------------------------------------------------------------------------- /systems/MG4J/gensubsetspos.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | $stdin.each_line do |l| 4 | a = [] 5 | l.scan(/\w+/) do |w| 6 | if w == "US" then w = "\"u s\"" end 7 | a << w 8 | end 9 | 10 | andthen = [] 11 | (a.length+1).times do |n| 12 | window = [] 13 | conj = [] 14 | if n == 0 then next end 15 | (2**(a.length)).times do |x| 16 | if x == 0 then next end 17 | item = [] 18 | a.length.times do |index| 19 | if ( x & (2**index) != 0 ) then item << a[index] end 20 | end 21 | 22 | if item.length == n then 23 | if n > 1; then 24 | window << ( "(" + item.join(" & " ) + ")~" + (2*n).to_s ) 25 | end 26 | conj<< ( "(" + item.join(" & " ) + ")" ) 27 | end 28 | end 29 | andthen << conj.join( " | " ) 30 | if n > 1; then 31 | andthen << window.join( " | " ) 32 | end 33 | end 34 | 35 | puts andthen.reverse.join( ", " ); 36 | end 37 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-bm25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Plain BM25 on disjunctive queries (mainly as a baseline) 5 | 6 | source ../common.sh 7 
| 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "701-750" "751-800" "801-850" 13 | do 14 | topics=$TOPICS_QRELS/topics.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 20 | fgrep -A1 "<title>" $topics | sed 's/<title>//;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(sed -e 's/[ ]\+/|/g' <titles.$queries.txt | awk "BEGIN {i = ${queries%-*} } { print \"\$mode trec \" i \" mg4jAuto\"; print; i = i + 1; }" ) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/gov2-text -T $WORK_DIR/gov2.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-eval-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n *positional* queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "701-750" "751-800" "801-850" 13 | do 14 | topics=$TOPICS_QRELS/topics.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => US, etc.) 
20 | fgrep -A1 "<title>" $topics | sed 's/<title>//;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U\\.S\\./US/g;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueriespos.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/gov2-text -T $WORK_DIR/gov2.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs the MG4J k-out-of-n queries and performs evaluation 5 | 6 | source ../common.sh 7 | 8 | export CLASSPATH=$(find -iname \*.jar | paste -d: -s) 9 | 10 | WORK_DIR=. 11 | 12 | for queries in "701-750" "751-800" "801-850" 13 | do 14 | topics=$TOPICS_QRELS/topics.$queries.txt 15 | qrels=$TOPICS_QRELS/qrels.$queries.txt 16 | err=err.$queries.txt 17 | run=run.$queries.txt 18 | 19 | # Extract titles, minimal massaging (no stopwords, U.S. => U S, etc.) 
20 | fgrep -A1 "<title>" $topics | sed 's/<title>//;s/--//;/^[[:space:]]*$/d;s/[[:space:]]*$//;s/^[[:space:]]*//' | sed "s/-/ /g;s/U.S./U S/;s/'s//" | sed 's/\<\(in\|to\|of\|on\|for\|and\|at\)\>//g' > titles.$queries.txt 21 | # Generate input files 22 | cat <(echo -e "\$score BM25Scorer(1.2,0.3)\n\$limit 1000\n\$divert $run\n\$mplex off") <(./genqueries.sh $(echo ${queries%-*}) <titles.$queries.txt) >in.$queries.txt 23 | 24 | java -server it.unimi.di.big.mg4j.query.Query $WORK_DIR/gov2-text -T $WORK_DIR/gov2.titles <in.$queries.txt 2>$err 25 | 26 | $TREC_EVAL -q $qrels $run >eval.$queries.txt 27 | 28 | grep ms\; $err | cut -d' ' -f6 | paste -d+ -s | bc -l >time.$queries.txt 29 | done 30 | -------------------------------------------------------------------------------- /systems/MG4J/gov2-index-pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | sudo apt-add-repository -y ppa:webupd8team/java 5 | sudo apt-get -y update 6 | sudo apt-get -y install oracle-java8-installer 7 | sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! 
-f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/gov2.titles $WORK_DIR/gov2-text.* $WORK_DIR/gov2-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $GOV2_LOCATION -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.TRECDocumentCollection \ 40 | -f HtmlDocumentFactory -p encoding=iso-8859-1 -z $WORK_DIR/gov2-$split.collection $(cat $split) 41 | 42 | java -Xmx7512M -server \ 43 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/gov2-$split.collection -t EnglishStemmer -I text $WORK_DIR/gov2-$split >$split.out 2>$split.err 44 | 45 | )& 46 | 47 | done 48 | 49 | wait) 50 | 51 | # Check that all instances have completed 52 | 53 | if (( $(find -iname gov2-split-\*-text.cluster.properties | wc -l) != 16 )); then 54 | echo "ERROR: Some instance did not complete correctly" 1>&2 55 | exit 1 56 | fi 57 | 58 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate $WORK_DIR/gov2-text \ 59 | $(find $WORK_DIR -iname gov2-split-\*-text@\*.sizes | sort | sed s/.sizes//) 60 | cat $(find $WORK_DIR -iname gov2-split-\*.titles | sort) >$WORK_DIR/gov2.titles 61 | 62 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.terms 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.termmap 65 | 66 | endtime=$(date +%s) 67 | 68 | echo "Indexing time: $((endtime-starttime))s" 69 | 
-------------------------------------------------------------------------------- /systems/MG4J/gov2-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #sudo apt-add-repository -y ppa:webupd8team/java 5 | #sudo apt-get -y update 6 | #sudo apt-get -y install oracle-java8-installer 7 | #sudo apt-get -y install ruby 8 | 9 | version=5.4.1 10 | 11 | source ../common.sh 12 | 13 | WORK_DIR=. 14 | 15 | if [[ ! -f mg4j-big-$version-bin.tar.gz || ! -f mg4j-big-deps.tar.gz ]]; then 16 | curl http://mg4j.di.unimi.it/mg4j-big-$version-bin.tar.gz >mg4j-big-$version-bin.tar.gz 17 | curl http://mg4j.di.unimi.it/mg4j-big-deps.tar.gz >mg4j-big-deps.tar.gz 18 | fi 19 | 20 | tar -zxvf mg4j-big-$version-bin.tar.gz 21 | tar -zxvf mg4j-big-deps.tar.gz 22 | 23 | export CLASSPATH=.:$(find -iname \*.jar | paste -d: -s) 24 | 25 | starttime=$(date +%s) 26 | 27 | # Parallel 28 | 29 | rm -f $WORK_DIR/gov2.titles $WORK_DIR/gov2-text.* $WORK_DIR/gov2-split-* split-* 30 | 31 | TMP=$(mktemp) 32 | find $GOV2_LOCATION -type f | sort >$TMP 33 | split -n l/16 $TMP split- 34 | 35 | (for split in split-*; do 36 | ( 37 | 38 | java -Xmx7512M -server \ 39 | it.unimi.di.big.mg4j.document.TRECDocumentCollection \ 40 | -z -f HtmlDocumentFactory -p encoding=iso-8859-1 $WORK_DIR/gov2-$split.collection $(cat $split) 41 | 42 | java -Xmx7512M -server \ 43 | it.unimi.di.big.mg4j.tool.Scan -s 1000000 -S $WORK_DIR/gov2-$split.collection -t EnglishStemmer -I text -c COUNTS $WORK_DIR/gov2-$split >$split.out 2>$split.err 44 | 45 | )& 46 | 47 | done 48 | 49 | wait) 50 | 51 | # Check that all instances have completed 52 | 53 | if (( $(find -iname gov2-split-\*-text.cluster.properties | wc -l) != 16 )); then 54 | echo "ERROR: Some instance did not complete correctly" 1>&2 55 | exit 1 56 | fi 57 | 58 | java -Xmx7512M -server it.unimi.di.big.mg4j.tool.Concatenate -c POSITIONS:NONE $WORK_DIR/gov2-text \ 59 | $(find $WORK_DIR -iname 
gov2-split-\*-text@\*.sizes | sort | sed s/.sizes//) 60 | cat $(find $WORK_DIR -iname gov2-split-\*.titles | sort) >$WORK_DIR/gov2.titles 61 | 62 | java -Xmx7512M -server it.unimi.dsi.sux4j.mph.MWHCFunction -s 32 $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.terms 63 | 64 | java -Xmx7512M -server it.unimi.dsi.sux4j.util.SignedFunctionStringMap $WORK_DIR/gov2-text.mwhc $WORK_DIR/gov2-text.termmap 65 | 66 | endtime=$(date +%s) 67 | 68 | echo "Indexing time: $((endtime-starttime))s" 69 | -------------------------------------------------------------------------------- /systems/MG4J/logback.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | <appender name="stderr" class="ch.qos.logback.core.ConsoleAppender"> 4 | <target>System.err</target> 5 | <encoder> 6 | <pattern>%d %r %p [%t] %logger{1} - %m%n</pattern> 7 | </encoder> 8 | </appender> 9 | <root level="INFO"> 10 | <appender-ref ref="stderr"/> 11 | </root> 12 | </configuration> 13 | -------------------------------------------------------------------------------- /systems/common.sh: -------------------------------------------------------------------------------- 1 | GOV2_LOCATION=/media/Gov2/data 2 | CW09B_LOCATION=/media/ClueWeb09b/ClueWeb09_English_1 3 | CW12B_LOCATION=/media/ClueWeb12-B13/DiskB 4 | 5 | TOPICS_QRELS=../../topics-and-qrels/ 6 | TREC_EVAL=../../eval/trec_eval.9.0/trec_eval 7 | SAP_EVAL=../../eval/statAP_MQ_eval_v3.pl 8 | GD_EVAL=../../eval/gdeval 9 | 10 | # define JAVA_HOME for those tools that need it 11 | export JAVA_HOME='/usr/lib/jvm/java-8-oracle/' 12 | 13 | # Build trec eval if it has not been 14 | if [[ ! -f ${TREC_EVAL} ]]; then 15 | tar xzf ../../eval/trec_eval.9.0.tar.gz -C ../../eval 16 | make -C ../../eval/trec_eval.9.0/ 17 | fi 18 | TREC_EVAL="${TREC_EVAL} -q" 19 | 20 | # Get statMAP eval tool for the 2009 queries 21 | if [[ ! 
-f ${SAP_EVAL} ]]; then 22 | curl http://trec.nist.gov/data/web/09/statAP_MQ_eval_v3.pl > ${SAP_EVAL} 23 | chmod +x ${SAP_EVAL} 24 | fi 25 | 26 | # Get the gdeval tool for ERR, nDCG 27 | # This is the latest version I could find 28 | if [[ ! -f ${GD_EVAL} ]]; then 29 | curl https://raw.githubusercontent.com/trec-web/trec-web-2014/master/src/eval/gdeval.pl > ${GD_EVAL} 30 | chmod +x ${GD_EVAL} 31 | fi 32 | -------------------------------------------------------------------------------- /systems/galago/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -efu 3 | 4 | source ../common.sh 5 | 6 | URL="https://sourceforge.net/projects/lemur/files/lemur/galago-3.8/galago-3.8-bin.tar.gz/download?use_mirror=autoselect" 7 | GALAGO_ARCHIVE="galago.tar.gz" 8 | GALAGO_DIR='galago-3.8-bin' 9 | TMPDIR="tmp/" 10 | GALAGO="${GALAGO_DIR}/bin/galago" 11 | 12 | if [[ ! -f ${GALAGO_ARCHIVE} ]]; then 13 | wget ${URL} -O ${GALAGO_ARCHIVE} 14 | fi 15 | 16 | if [[ ! -f ${GALAGO} ]]; then 17 | rm -rf ${GALAGO_DIR} 18 | tar -xf ${GALAGO_ARCHIVE} 19 | fi 20 | 21 | 22 | mkdir -p ${TMPDIR} 23 | export JAVA_OPTS="-Djava.io.tmpdir=${TMPDIR} -Xmx7g" 24 | chmod +x ${GALAGO} 25 | 26 | # build index if not already: 27 | INDEX_PATH=gov2.galago 28 | LOG_FILE=build_index.log 29 | 30 | if [[ ! 
-f ${INDEX_PATH}/buildManifest.json ]]; then 31 | ${GALAGO} build --server=true --mode=fork --distrib=16 --filetype=trecweb --nonStemmedPostings=false --stemmedPostings=true --stemmedCounts=true --corpus=false --inputPath=${GOV2_LOCATION} --indexPath=${INDEX_PATH} 1> >(tee ${LOG_FILE}.stdout) 2> >(tee ${LOG_FILE}.stderr) 32 | fi 33 | 34 | rm -rf ${TMPDIR} # remove any lingering temporary files 35 | 36 | for method in combine sdm; do 37 | for queries in "701-750" "751-800" "801-850"; do 38 | query_file=$TOPICS_QRELS/topics.${queries}.txt 39 | qrel_file=$TOPICS_QRELS/qrels.${queries}.txt 40 | query_json=q${queries}.${method}.json 41 | python2 make_query_json.py $method $query_file > $query_json # generate title queries 42 | run_file=galago${queries}.${method}.trecrun 43 | 44 | if [[ ! -f ${run_file} ]]; then 45 | ${GALAGO} timed-batch-search ${query_json} --repeats=1 --requested=1000 --index=${INDEX_PATH} --outputFile=${run_file} --timesFile=${run_file}.times 46 | fi 47 | $TREC_EVAL ${qrel_file} ${run_file} > galago${queries}.${method}.treceval 48 | done 49 | done 50 | -------------------------------------------------------------------------------- /systems/galago/make_query_json.py: -------------------------------------------------------------------------------- 1 | import sys, json 2 | 3 | operator = sys.argv[1] 4 | queries = [] 5 | inTopic = False 6 | number=None 7 | query = None 8 | 9 | def makeCombineQuery(query): 10 | terms = ['#dirichlet(%s)' % x for x in query.split()] 11 | return '#combine('+' '.join(terms)+')' 12 | 13 | def makeSDMQuery(query): 14 | return '#sdm('+' '.join(query.split())+')' 15 | 16 | with open(sys.argv[2]) as fp: 17 | for line in fp: 18 | 19 | if not inTopic: 20 | if line.startswith('<top>'): 21 | inTopic = True 22 | continue 23 | # inTopic=True 24 | if line.startswith('</top>'): 25 | if operator == 'combine': 26 | queries += [{ 'number': number, 'text': makeCombineQuery(query) }] 27 | elif operator=='sdm': 28 | queries += [{ 'number': 
number, 'text': makeSDMQuery(query) }] 29 | inTopic = False 30 | continue 31 | if line.startswith('<num>'): 32 | number = line.split('Number: ')[1].strip() 33 | if line.startswith('<title>'): 34 | query = line[len('<title>'):].strip().replace(".", "") 35 | if not query: 36 | query = next(fp).strip().replace(".","") 37 | #print line; 38 | 39 | queries_json = {'queries': queries} 40 | print json.dumps(queries_json) 41 | -------------------------------------------------------------------------------- /systems/indri/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remove all temporary, intermediate and result folders and files 4 | 5 | # 2015 July 22 - James Valenti/Jamie Callan Carnegie Mellon University 6 | 7 | # indri 8 | # DO NOT REMOVE INDEX (unless you really intend to) 9 | # 10 | #rm -f ./build_index.log.stderr 11 | #rm -f ./build_index.log.stdout 12 | #rm -f ./indri-5.3.tar.gz 13 | #rm -rf ./indri-5.3 14 | 15 | # queries 16 | # these are typically removed between query parameter adjustments and benchmarking runs 17 | rm -fr ./queries 18 | rm -fr ./query_results 19 | rm -f ./results 20 | rm -fr ./scores 21 | -------------------------------------------------------------------------------- /systems/indri/dm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # 4 | # Perl subroutine that generates Indri dependence model queries. 5 | # 6 | # Written by: Don Metzler (metzler@cs.umass.edu) 7 | # Last updated: 06/27/2005 8 | # 9 | # Feel free to distribute, edit, modify, or mangle this code as you see fit. If you make any interesting 10 | # changes please email me a copy. 11 | # 12 | # For more technical details, see: 13 | # 14 | # * Metzler, D. and Croft, W.B., "A Markov Random Field Model for Term Dependencies," ACM SIGIR 2005. 
15 | # 16 | # * Metzler, D., Strohman T., Turtle H., and Croft, W.B., "Indri at TREC 2004: Terabyte Track", TREC 2004. 17 | # 18 | # * http://ciir.cs.umass.edu/~metzler/ 19 | # 20 | # NOTES 21 | # 22 | # * this script assumes that the query string has already been parsed and that all characters 23 | # that are not compatible with Indri's query language have been removed. 24 | # 25 | # * it is not advisable to do a 'full dependence' variant on long strings because of the exponential 26 | # number of terms that will result. it is suggested that the 'sequential dependence' variant be 27 | # used for long strings. either that, or split up long strings into smaller cohesive chunks and 28 | # apply the 'full dependence' variant to each of the chunks. 29 | # 30 | # * the unordered features use a window size of 4 * number of terms within the phrase. this has been 31 | # found to work well across a wide range of collections and topics. however, this may need to be 32 | # modified on an individual basis. 33 | # 34 | 35 | # example usage 36 | #print formulate_query( "white house rose garden", "sd", 0.5, 0.25, 0.25 ) . "\n\n"; 37 | #print formulate_query( "white house rose garden", "fd", 0.8, 0.1, 0.1 ) . "\n\n"; 38 | 39 | my $file = $ARGV[0]; 40 | open my $info, $file or die "Could not open $file: $!"; 41 | 42 | while( my $line = <$info>) { 43 | print formulate_query( $line, "sd", 0.7, 0.2, 0.1 ) . "\n"; 44 | #last if $. == 2; 45 | } 46 | 47 | close $info; 48 | 49 | # 50 | # formulates a query based on query text and feature weights 51 | # 52 | # arguments: 53 | # * query - string containing original query terms separated by spaces 54 | # * type - string. "sd" for sequential dependence or "fd" for full dependence variant. defaults to "fd". 
55 | # * wt[0] - weight assigned to term features 56 | # * wt[1] - weight assigned to ordered (#1) features 57 | # * wt[2] - weight assigned to unordered (#uw) features 58 | # 59 | sub formulate_query { 60 | my ( $q, $type, @wt ) = @_; 61 | 62 | # trim whitespace from beginning and end of query string 63 | $q =~ s/^\s+|\s+$//g; 64 | 65 | my $queryT = "#combine( "; 66 | my $queryO = "#combine("; 67 | my $queryU = "#combine("; 68 | 69 | # generate term features (f_T) 70 | my @terms = split(/\s+/ , $q); 71 | my $term; 72 | foreach $term ( @terms ) { 73 | $queryT .= "$term "; 74 | } 75 | 76 | my $num_terms = @terms; 77 | 78 | # skip the rest of the processing if we're just 79 | # interested in term features or if we only have 1 term 80 | if( ( $wt[1] == 0.0 && $wt[2] == 0.0 ) || $num_terms == 1 ) { 81 | return $queryT . ")"; 82 | } 83 | 84 | # generate the rest of the features 85 | my $start = 1; 86 | if( $type eq "sd" ) { $start = 3; } 87 | for( my $i = $start ; $i < 2 ** $num_terms ; $i++ ) { 88 | my $bin = unpack("B*", pack("N", $i)); # create binary representation of i 89 | my $num_extracted = 0; 90 | my $extracted_terms = ""; 91 | 92 | # get query terms corresponding to 'on' bits 93 | for( my $j = 0 ; $j < $num_terms ; $j++ ) { 94 | my $bit = substr($bin, $j - $num_terms, 1); 95 | if( $bit eq "1" ) { 96 | $extracted_terms .= "$terms[$j] "; 97 | $num_extracted++; 98 | } 99 | } 100 | 101 | if( $num_extracted == 1 ) { next; } # skip these, since we already took care of the term features... 102 | if( $bin =~ /^0+11+[^1]*$/ ) { # words in contiguous phrase, ordered features (f_O) 103 | $queryO .= " #1( $extracted_terms) "; 104 | } 105 | $queryU .= " #uw" . 4*$num_extracted . 
"( $extracted_terms) "; # every subset of terms, unordered features (f_U) 106 | if( $type eq "sd" ) { $i *= 2; $i--; } 107 | } 108 | 109 | my $query = "#weight("; 110 | if( $wt[0] != 0.0 && $queryT ne "#combine( " ) { $query .= " $wt[0] $queryT)"; } 111 | if( $wt[1] != 0.0 && $queryO ne "#combine(" ) { $query .= " $wt[1] $queryO)"; } 112 | if( $wt[2] != 0.0 && $queryU ne "#combine(" ) { $query .= " $wt[2] $queryU)"; } 113 | 114 | if( $query eq "#weight(" ) { return ""; } # return "" if we couldn't formulate anything 115 | 116 | return $query . " )"; 117 | } 118 | -------------------------------------------------------------------------------- /systems/indri/index-clef.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #usage: ./index.sh language 4 | #language can be bg,de,es,fa,fi,fr,hu,it,nl,pt,ru,sv 5 | 6 | #set lemur bin directory 7 | lemurdir=/u/xiaojiex/ir/lemur/bin 8 | 9 | #set indexing parameter file directory 10 | indexParaDir=/u/xiaojiex/ir/excute/clefmono 11 | 12 | date 13 | echo "building index started" 14 | $lemurdir/IndriBuildIndex $indexParaDir/indexParaSP_$1 15 | echo "building index finished" 16 | date 17 | -------------------------------------------------------------------------------- /systems/indri/index-clef_ReadMe.txt: -------------------------------------------------------------------------------- 1 | Method Description: 2 | xiao jie liu(James Liu), the student of Professor Jian-Yun Nie, submits the CLEF baseline experiment results. James used Lemur/Indri as 3 | the basic retrieval platform and used a language model with Dirichlet Smoothing (mu=2000) as the retrieval model. James dealt with 12 languages 4 | for the topics and corpora. They are Hungarian(hu),Italian(it),Dutch(nl),Portuguese(pt),Russian(ru),Swedish(sv),German(de),Spanish(es),Finnish(fi), 5 | French(fr),Bulgarian(bg) and Persian(fa). 
6 | James Liu pre-processed the topics files and corpora by removing the stopwords and stemming the contents. Stopwords list files are provided 7 | by Maria Maistro (maistro@dei.unipd.it), you can download them from https://github.com/mmaistro/IR-Reproducibility/tree/mmaistro. The stemming methods 8 | used are from Professor Jacques.Savoy (Jacques.Savoy@unine.ch)(http://members.unine.ch/jacques.savoy/clef/index.html) for language Bulgarian; 9 | from Jonsafari (https://www.ling.ohio-state.edu/~jonsafari/persian_nlp.html) for language Persian; from Snowball (http://snowball.tartarus.org/) 10 | for other languages (Hungarian,Italian,Dutch,Portuguese,Russian,Swedish,German,Spanish,Finnish and French). 11 | In the final result, for each topic, there are 1000 documents returned. The format is the standard TREC format. 12 | 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_bg: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_bg/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SEGA2002_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/STANDART2002_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_de: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_de/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/FRANKFURTER1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | 
<path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SDA1994_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SPIEGEL1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/SPIEGEL1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_es: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_es/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/EFE1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/EFE1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_fa: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_fa/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/HAMSHAHRI_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | </parameters> 9 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_fi: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_fi/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AAMULEHTI1994_UTF8_consdoc/</path> 6 | 
<class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AAMULEHTI1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_fr: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_fr/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/LEMONDE1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/LEMONDE1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ATS1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ATS1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_hu: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_hu/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/MAGYAR2002_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | </parameters> 9 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_it: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_it/</index> 4 | <corpus> 5 | 
<path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AGZ1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/AGZ1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/LASTAMPA1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | </parameters> 17 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_nl: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_nl/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ALGEMEEN1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/ALGEMEEN1995_UTF8_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/NRC1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/NRC1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_pt: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_pt/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/FOLHA1994_UTF8_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/FOLHA1995_UTF8_consdoc/</path> 10 | 
<class>trectext</class> 11 | </corpus> 12 | <corpus> 13 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/PUBLICO1994_UTF8_consdoc/</path> 14 | <class>trectext</class> 15 | </corpus> 16 | <corpus> 17 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/PUBLICO1995_UTF8_consdoc/</path> 18 | <class>trectext</class> 19 | </corpus> 20 | </parameters> 21 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_ru: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_ru/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/IZVESTIA1995_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | </parameters> 9 | -------------------------------------------------------------------------------- /systems/indri/indexParaSP_sv: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <memory>1024m</memory> 3 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_sv/</index> 4 | <corpus> 5 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/TT1994_consdoc/</path> 6 | <class>trectext</class> 7 | </corpus> 8 | <corpus> 9 | <path>/u/xiaojiex/ir/corpus/clef_collections/corpora_stop_stem/TT1995_consdoc/</path> 10 | <class>trectext</class> 11 | </corpus> 12 | </parameters> 13 | -------------------------------------------------------------------------------- /systems/indri/queryParaLMSP_ru: -------------------------------------------------------------------------------- 1 | <parameters> 2 | <index>/u/xiaojiex/ir/excute/clefmono/indexSP_ru</index> 3 | <rule>method:dirichlet,mu:2000</rule> 4 | <count>1000</count> 5 | <query> 6 | <number>143-AH</number> 7 | <text>Конференц положен Женщин Пекин </text> 8 | </query> 9 | <query> 10 | <number>147-AH</number> 11 | <text>Нефтян авар Птиц </text> 12 | </query> 
13 | <query> 14 | <number>148-AH</number> 15 | <text>Разрушен озонов сло </text> 16 | </query> 17 | <query> 18 | <number>149-AH</number> 19 | <text>Виз Пап Римск ШриЛанк </text> 20 | </query> 21 | <query> 22 | <number>151-AH</number> 23 | <text>Чудес Древн Мир </text> 24 | </query> 25 | <query> 26 | <number>153-AH</number> 27 | <text>Олимпийск Игры Мир </text> 28 | </query> 29 | <query> 30 | <number>154-AH</number> 31 | <text>Свобод Слов Интернет </text> 32 | </query> 33 | <query> 34 | <number>155-AH</number> 35 | <text>Опасност Мобильн Телефон </text> 36 | </query> 37 | <query> 38 | <number>157-AH</number> 39 | <text>Победительниц Уимблдон </text> 40 | </query> 41 | <query> 42 | <number>163-AH</number> 43 | <text>Ограничен Куря </text> 44 | </query> 45 | <query> 46 | <number>164-AH</number> 47 | <text>Приговор Наркотик Европ </text> 48 | </query> 49 | <query> 50 | <number>168-AH</number> 51 | <text>Убийств Рабин </text> 52 | </query> 53 | <query> 54 | <number>169-AH</number> 55 | <text>Появлен устройств запис компактдиск </text> 56 | </query> 57 | <query> 58 | <number>172-AH</number> 59 | <text>Миров Рекорд Лёгко Атлетик 1995 </text> 60 | </query> 61 | <query> 62 | <number>176-AH</number> 63 | <text>ШумейкерЛев Юпитер </text> 64 | </query> 65 | <query> 66 | <number>177-AH</number> 67 | <text>Потреблен Молок Европ </text> 68 | </query> 69 | <query> 70 | <number>178-AH</number> 71 | <text>Отказ Несен Воен Служб </text> 72 | </query> 73 | <query> 74 | <number>179-AH</number> 75 | <text>Отставк Генеральн Секретар НАТО </text> 76 | </query> 77 | <query> 78 | <number>180-AH</number> 79 | <text>Банкротств Баринг Бразерс </text> 80 | </query> 81 | <query> 82 | <number>181-AH</number> 83 | <text>Французск Ядерн Испытан </text> 84 | </query> 85 | <query> 86 | <number>183-AH</number> 87 | <text>Ископа Остатк Динозавр Ази </text> 88 | </query> 89 | <query> 90 | <number>187-AH</number> 91 | <text>Ядерн Перевозк Герман </text> 92 | </query> 93 | <query> 94 | 
<number>192-AH</number> 95 | <text>Убийств Директор Российск Телекомпан </text> 96 | </query> 97 | <query> 98 | <number>193-AH</number> 99 | <text>ЕС Балтийск Государств </text> 100 | </query> 101 | <query> 102 | <number>197-AH</number> 103 | <text>Мирн Соглашен Дейтон </text> 104 | </query> 105 | <query> 106 | <number>198-AH</number> 107 | <text>Почётн Оскар Итальянск Режиссёр </text> 108 | </query> 109 | <query> 110 | <number>199-AH</number> 111 | <text>Эпидем Эбол Заир </text> 112 | </query> 113 | <query> 114 | <number>200-AH</number> 115 | <text>Наводнен Голланд Герман </text> 116 | </query> 117 | <query> 118 | <number>201-AH</number> 119 | <text>Домашн пожар </text> 120 | </query> 121 | <query> 122 | <number>202-AH</number> 123 | <text>Арест Ник Леесон </text> 124 | </query> 125 | <query> 126 | <number>203-AH</number> 127 | <text>Партизанск войн Восточн Тимор </text> 128 | </query> 129 | <query> 130 | <number>207-AH</number> 131 | <text>Травм причин фейерверк </text> 132 | </query> 133 | <query> 134 | <number>209-AH</number> 135 | <text>Победител Тур де Франс </text> 136 | </query> 137 | <query> 138 | <number>210-AH</number> 139 | <text>Кандидат Нобелевск прем </text> 140 | </query> 141 | <query> 142 | <number>211-AH</number> 143 | <text>Пограничн конфликт Пер Эквадор </text> 144 | </query> 145 | <query> 146 | <number>212-AH</number> 147 | <text>Спортсменк допинг </text> 148 | </query> 149 | <query> 150 | <number>213-AH</number> 151 | <text>Путешеств Пап </text> 152 | </query> 153 | <query> 154 | <number>214-AH</number> 155 | <text>Мультимиллиардер </text> 156 | </query> 157 | <query> 158 | <number>215-AH</number> 159 | <text>Повторн избран президент Пер </text> 160 | </query> 161 | <query> 162 | <number>216-AH</number> 163 | <text>Вдыхан кле подростк </text> 164 | </query> 165 | <query> 166 | <number>218-AH</number> 167 | <text>Андреотт маф </text> 168 | </query> 169 | <query> 170 | <number>220-AH</number> 171 | <text>Европейск автомоб Росс </text> 172 | 
</query> 173 | <query> 174 | <number>221-AH</number> 175 | <text>Олимпийск зимн игр 2002 </text> 176 | </query> 177 | <query> 178 | <number>224-AH</number> 179 | <text>Женщин соверша одиночн восхожден Эверест </text> 180 | </query> 181 | <query> 182 | <number>225-AH</number> 183 | <text>Атомн станц Соснов бор </text> 184 | </query> 185 | <query> 186 | <number>226-AH</number> 187 | <text>Изменен пол </text> 188 | </query> 189 | <query> 190 | <number>227-AH</number> 191 | <text>Алтайск амазонк </text> 192 | </query> 193 | <query> 194 | <number>228-AH</number> 195 | <text>Доисторическ искусств </text> 196 | </query> 197 | <query> 198 | <number>230-AH</number> 199 | <text>Стыковк Атлантис МИР </text> 200 | </query> 201 | <query> 202 | <number>231-AH</number> 203 | <text>Нов португальск премьерминистр </text> 204 | </query> 205 | <query> 206 | <number>232-AH</number> 207 | <text>Программ пенсион обеспечен Европ </text> 208 | </query> 209 | <query> 210 | <number>233-AH</number> 211 | <text>Парников эффект </text> 212 | </query> 213 | <query> 214 | <number>234-AH</number> 215 | <text>Глухот обществ </text> 216 | </query> 217 | <query> 218 | <number>235-AH</number> 219 | <text>Охот тюлен </text> 220 | </query> 221 | <query> 222 | <number>237-AH</number> 223 | <text>Панченлам </text> 224 | </query> 225 | <query> 226 | <number>238-AH</number> 227 | <text>Лед Дая </text> 228 | </query> 229 | <query> 230 | <number>239-AH</number> 231 | <text>Психическ здоров подростк </text> 232 | </query> 233 | <query> 234 | <number>241-AH</number> 235 | <text>Нов политическ парт </text> 236 | </query> 237 | <query> 238 | <number>242-AH</number> 239 | <text>Рекордн срок пребыван космос </text> 240 | </query> 241 | <query> 242 | <number>244-AH</number> 243 | <text>Футболист 1994 </text> 244 | </query> 245 | <query> 246 | <number>245-AH</number> 247 | <text>Кристофер Рив </text> 248 | </query> 249 | <query> 250 | <number>250-AH</number> 251 | <text>Бешенств люд </text> 252 | </query> 253 | 
<trecFormat>true</trecFormat> 254 | </parameters> 255 | -------------------------------------------------------------------------------- /systems/indri/query_LM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #usage: ./query_LM.sh language 4 | #language can be bg,de,es,fa,fi,fr,hu,it,nl,pt,ru,sv 5 | 6 | #set lemur bin directory 7 | lemurdir=/u/xiaojiex/ir/lemur/bin 8 | 9 | #set retrieval parameter file directory 10 | queryParaDir=/u/xiaojiex/ir/excute/clefmono 11 | 12 | #set retrieval result directory 13 | queryResultDir=$queryParaDir/baseline 14 | 15 | date 16 | echo "retrieval started" 17 | $lemurdir/IndriRunQuery $queryParaDir/queryParaLMSP_$1 > $queryResultDir/result_LMSP_file_$1 18 | echo "retrieval finished" 19 | date -------------------------------------------------------------------------------- /systems/lucene/clef.sh: -------------------------------------------------------------------------------- 1 | source ../common.sh 2 | 3 | if [ -z "$CLEF_LOCATION" ]; then 4 | echo "The location of the CLEF Test Collections should be specified"; 5 | else 6 | 7 | if [[ ! -f clef/target/lucene-clef-1.0-jar-with-dependencies.jar ]]; then 8 | echo "Compiling lucene-clef project..." 9 | cd clef 10 | mvn clean compile assembly:single 11 | cd .. 
12 | fi 13 | 14 | cd ../../ 15 | 16 | ROOT_PATH=$(pwd); 17 | 18 | mkdir -p $ROOT_PATH/runs 19 | 20 | SYSTEM_PATH=$ROOT_PATH/systems/lucene; 21 | 22 | cd $SYSTEM_PATH 23 | 24 | while read line;do 25 | # load indexing and retrieval options from the clef_runs file 26 | lang=$(echo "$line" | cut -d$'\t' -f1); 27 | use_stemmer=$(echo "$line" | cut -d$'\t' -f2); 28 | use_stoplist=$(echo "$line" | cut -d$'\t' -f3); 29 | model=$(echo "$line" | cut -d$'\t' -f4); 30 | 31 | sh $SYSTEM_PATH/clef_experiments.sh -l $lang -cp $CLEF_LOCATION -stm $use_stemmer -sl $use_stoplist -r $model 32 | 33 | done < $SYSTEM_PATH/clef_runs 34 | 35 | fi 36 | 37 | -------------------------------------------------------------------------------- /systems/lucene/clef/README.md: -------------------------------------------------------------------------------- 1 | # Apache Lucene - CLEF 2 | 3 | *lucene-clef* provides indexing and retrieval functionalities for the CLEF Test Collections through the 4 | [Apache Lucene](https://lucene.apache.org/core/) (version [5.2.1](https://lucene.apache.org/core/5_2_1/index.html)) 5 | library. 6 | 7 | ### Experiments on CLEF Test Collections 8 | 9 | The experiments can be replicated by the script [clef_experiments.sh](../clef_experiments.sh). 10 | An example of usage of the script is the following: 11 | 12 | `./clef_experiments.sh -l it -cp /media/CLEF/corpora -stm y -sl y -r BM25` 13 | 14 | where the meaning of the options is: 15 | - *-l* the language of the test collection (e.g. *it*) 16 | - *-cp* the path to the directory where the document corpora are stored (e.g. */media/CLEF/corpora*) 17 | - *-stm* enable (-stm y) or disable (-stm n) the stemmer 18 | - *-sl* enable (-sl y) or disable (-sl n) the use of the stoplist 19 | - *-r* the ranking model (e.g. 
BM25) 20 | 21 | The current version of *lucene-clef* supports the following models: 22 | - BM25 23 | 24 | The script [clef.sh](../clef.sh) iterates over the diverse set of options stored in the [clef_runs](../clef_runs) file 25 | (one combination of options per line) and calls the [clef_experiments.sh](../clef_experiments.sh) using each option set. 26 | The lines in the [clef_runs](../clef_runs) have the following format: 27 | 28 | `it y y BM25` 29 | 30 | where the options are separated by tabs; the first option refers to the language, the second to the stemmer, the third 31 | to the stoplist usage and the last one to the model. The last line of the [clef_runs](../clef_runs) should be empty. -------------------------------------------------------------------------------- /systems/lucene/clef/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" 3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 5 | <modelVersion>4.0.0</modelVersion> 6 | 7 | <groupId>it.unipd.dei.ims.lucene</groupId> 8 | <artifactId>lucene-clef</artifactId> 9 | <version>1.0</version> 10 | 11 | <properties> 12 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 13 | <maven.compiler.source>1.7</maven.compiler.source> 14 | <maven.compiler.target>1.7</maven.compiler.target> 15 | <org.apache.lucene-version>5.2.1</org.apache.lucene-version> 16 | <slf4j.version>1.6.4</slf4j.version> 17 | <logback.version>1.0.9</logback.version> 18 | </properties> 19 | 20 | <dependencies> 21 | 22 | <!-- Logging --> 23 | <dependency> 24 | <groupId>org.slf4j</groupId> 25 | <artifactId>slf4j-api</artifactId> 26 | <version>${slf4j.version}</version> 27 | </dependency> 28 | 29 | <dependency> 30 | <groupId>ch.qos.logback</groupId> 31 | 
<artifactId>logback-classic</artifactId> 32 | <version>${logback.version}</version> 33 | </dependency> 34 | 35 | <dependency> 36 | <groupId>ch.qos.logback</groupId> 37 | <artifactId>logback-core</artifactId> 38 | <version>${logback.version}</version> 39 | </dependency> 40 | 41 | <!-- Test --> 42 | <dependency> 43 | <groupId>junit</groupId> 44 | <artifactId>junit</artifactId> 45 | <version>4.12</version> 46 | </dependency> 47 | 48 | 49 | <!-- LUCENE DEPENDENCIES --> 50 | <dependency> 51 | <groupId>org.apache.lucene</groupId> 52 | <artifactId>lucene-core</artifactId> 53 | <version>${org.apache.lucene-version}</version> 54 | </dependency> 55 | 56 | <dependency> 57 | <groupId>org.apache.lucene</groupId> 58 | <artifactId>lucene-analyzers-common</artifactId> 59 | <version>${org.apache.lucene-version}</version> 60 | </dependency> 61 | 62 | <dependency> 63 | <groupId>org.apache.lucene</groupId> 64 | <artifactId>lucene-benchmark</artifactId> 65 | <version>${org.apache.lucene-version}</version> 66 | </dependency> 67 | 68 | </dependencies> 69 | 70 | <build> 71 | <plugins> 72 | <plugin> 73 | <artifactId>maven-compiler-plugin</artifactId> 74 | <configuration> 75 | <source>${maven.compiler.source}</source> 76 | <target>${maven.compiler.target}</target> 77 | <encoding>${project.build.sourceEncoding}</encoding> 78 | </configuration> 79 | </plugin> 80 | <plugin> 81 | <groupId>org.apache.maven.plugins</groupId> 82 | <artifactId>maven-resources-plugin</artifactId> 83 | <configuration> 84 | <encoding>${project.build.sourceEncoding}</encoding> 85 | </configuration> 86 | </plugin> 87 | <plugin> 88 | <artifactId>maven-assembly-plugin</artifactId> 89 | <configuration> 90 | <archive> 91 | <manifest> 92 | <mainClass>it.unipd.dei.ims.lucene.clef.App</mainClass> 93 | </manifest> 94 | </archive> 95 | <descriptorRefs> 96 | <descriptorRef>jar-with-dependencies</descriptorRef> 97 | </descriptorRefs> 98 | </configuration> 99 | <executions> 100 | <execution> 101 | <id>make-assembly</id> <!-- this 
is used for inheritance merges --> 102 | <phase>package</phase> <!-- bind to the packaging phase --> 103 | <goals> 104 | <goal>single</goal> 105 | </goals> 106 | </execution> 107 | </executions> 108 | </plugin> 109 | </plugins> 110 | </build> 111 | 112 | 113 | </project> -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/AnalyzerFactory.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import org.apache.lucene.analysis.Analyzer; 21 | import org.apache.lucene.analysis.bg.BulgarianAnalyzer; 22 | import org.apache.lucene.analysis.de.GermanAnalyzer; 23 | import org.apache.lucene.analysis.es.SpanishAnalyzer; 24 | import org.apache.lucene.analysis.fa.PersianAnalyzer; 25 | import org.apache.lucene.analysis.fi.FinnishAnalyzer; 26 | import org.apache.lucene.analysis.fr.FrenchAnalyzer; 27 | import org.apache.lucene.analysis.hu.HungarianAnalyzer; 28 | import org.apache.lucene.analysis.it.ItalianAnalyzer; 29 | import org.apache.lucene.analysis.nl.DutchAnalyzer; 30 | import org.apache.lucene.analysis.pt.PortugueseAnalyzer; 31 | import org.apache.lucene.analysis.ru.RussianAnalyzer; 32 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 33 | import org.apache.lucene.analysis.sv.SwedishAnalyzer; 34 | import org.apache.lucene.analysis.util.CharArraySet; 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | import java.io.File; 39 | import java.io.FileNotFoundException; 40 | import java.util.Scanner; 41 | 42 | /** 43 | * Factory for {@link org.apache.lucene.analysis.Analyzer}s and stopsets. 
44 | */ 45 | public class AnalyzerFactory { 46 | 47 | static Logger logger = LoggerFactory.getLogger(AnalyzerFactory.class); 48 | 49 | public static CharArraySet createStopset( 50 | String language, 51 | String stopsetType, 52 | String stopsetPath 53 | ) throws Exception { 54 | 55 | CharArraySet stopset = CharArraySet.EMPTY_SET; 56 | 57 | if (stopsetType.equalsIgnoreCase("CUSTOM")){ 58 | 59 | try { 60 | File f = new File(stopsetPath); 61 | stopset = new CharArraySet(0,true); 62 | Scanner sc = new Scanner(f); 63 | logger.debug("STOPLIST:"); 64 | while (sc.hasNextLine()) { 65 | String stopword = sc.nextLine().trim(); 66 | logger.debug("=> "+stopword); 67 | stopset.add(stopword); 68 | } 69 | logger.debug(""); 70 | sc.close(); 71 | 72 | } catch (FileNotFoundException e) { 73 | e.printStackTrace(); 74 | throw new Exception("FileNotFoundException when loading stopset"); 75 | } 76 | 77 | } else if (stopsetType.equalsIgnoreCase("DEFAULT")){ 78 | 79 | switch (language) { 80 | case "bg": 81 | stopset = BulgarianAnalyzer.getDefaultStopSet(); 82 | break; 83 | case "de": 84 | stopset = GermanAnalyzer.getDefaultStopSet(); 85 | break; 86 | case "es": 87 | stopset = SpanishAnalyzer.getDefaultStopSet(); 88 | break; 89 | case "fa": 90 | stopset = PersianAnalyzer.getDefaultStopSet(); 91 | break; 92 | case "fi": 93 | stopset = FinnishAnalyzer.getDefaultStopSet(); 94 | break; 95 | case "fr": 96 | stopset = FrenchAnalyzer.getDefaultStopSet(); 97 | break; 98 | case "hu": 99 | stopset = HungarianAnalyzer.getDefaultStopSet(); 100 | break; 101 | case "it": 102 | stopset = ItalianAnalyzer.getDefaultStopSet(); 103 | break; 104 | case "nl": 105 | stopset = DutchAnalyzer.getDefaultStopSet(); 106 | break; 107 | case "pt": 108 | stopset = PortugueseAnalyzer.getDefaultStopSet(); 109 | break; 110 | case "ru": 111 | stopset = RussianAnalyzer.getDefaultStopSet(); 112 | break; 113 | case "sv": 114 | stopset = SwedishAnalyzer.getDefaultStopSet(); 115 | break; 116 | default: 117 | throw new 
UnsupportedOperationException("Language not supported yet"); 118 | } 119 | 120 | } 121 | 122 | return stopset; 123 | } 124 | 125 | 126 | 127 | public static Analyzer createAnalyzer( 128 | String language, 129 | String stemmer, 130 | CharArraySet stopset 131 | ) { 132 | 133 | Analyzer analyzer; 134 | 135 | if (stemmer.equalsIgnoreCase("NONE")){ 136 | 137 | analyzer = new StandardAnalyzer(stopset); 138 | 139 | } else { // otherwise use language-specific analyzer 140 | 141 | switch (language) { 142 | case "bg": 143 | analyzer = new BulgarianAnalyzer(stopset); 144 | break; 145 | case "de": 146 | analyzer = new GermanAnalyzer(stopset); 147 | break; 148 | case "es": 149 | analyzer = new SpanishAnalyzer(stopset); 150 | break; 151 | case "fa": 152 | analyzer = new PersianAnalyzer(stopset); 153 | break; 154 | case "fi": 155 | analyzer = new FinnishAnalyzer(stopset); 156 | break; 157 | case "fr": 158 | analyzer = new FrenchAnalyzer(stopset); 159 | break; 160 | case "hu": 161 | analyzer = new HungarianAnalyzer(stopset); 162 | break; 163 | case "it": 164 | analyzer = new ItalianAnalyzer(stopset); 165 | break; 166 | case "nl": 167 | analyzer = new DutchAnalyzer(stopset); 168 | break; 169 | case "pt": 170 | analyzer = new PortugueseAnalyzer(stopset); 171 | break; 172 | case "ru": 173 | analyzer = new RussianAnalyzer(stopset); 174 | break; 175 | case "sv": 176 | analyzer = new SwedishAnalyzer(stopset); 177 | break; 178 | default: 179 | throw new UnsupportedOperationException("Language not supported yet"); 180 | } 181 | 182 | } 183 | 184 | return analyzer; 185 | 186 | } 187 | 188 | 189 | } 190 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/App.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license 
agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval; 21 | import it.unipd.dei.ims.lucene.clef.applications.BuildIndex; 22 | 23 | /** 24 | * Functionalities for CLEF Test collections indexing and batch retrieval. 25 | */ 26 | public class App { 27 | 28 | public static void main(String [] args){ 29 | 30 | if (args.length==1){ 31 | 32 | String option = args[0].toLowerCase(); 33 | switch (option){ 34 | case "-i" : 35 | BuildIndex.main(args); 36 | break; 37 | case "-r" : 38 | BatchRetrieval.main(args); 39 | break; 40 | default: 41 | System.out.println("Supported options:"); 42 | printHelp(); 43 | } 44 | 45 | } else { 46 | 47 | System.out.println("One of the following option should be used:"); 48 | printHelp(); 49 | 50 | } 51 | 52 | } 53 | 54 | private static void printHelp(){ 55 | 56 | System.out.println("-h for this help"); 57 | System.out.println("-i for indexing"); 58 | System.out.println("-r for batch retrieval"); 59 | } 60 | 61 | 62 | 63 | } 64 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/parser/ClefDocParser.java: -------------------------------------------------------------------------------- 1 | package 
it.unipd.dei.ims.lucene.clef.parser; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.benchmark.byTask.feeds.DocData; 21 | import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource; 22 | import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser; 23 | 24 | import java.io.IOException; 25 | 26 | /** 27 | * Parser for the CLEF test collections. 
28 | */ 29 | public class ClefDocParser extends TrecDocParser { 30 | 31 | 32 | @Override 33 | public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 34 | StringBuilder docBuf, ParsePathType pathType) throws IOException { 35 | int mark = 0; // that much is skipped 36 | docData.clear(); 37 | docData.setName(name); 38 | docData.setBody(stripTags(docBuf, mark).toString()); 39 | return docData; 40 | } 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/java/it/unipd/dei/ims/lucene/clef/parser/ClefQQParser.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef.parser; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import it.unipd.dei.ims.lucene.clef.AnalyzerFactory; 21 | import org.apache.lucene.analysis.Analyzer; 22 | import org.apache.lucene.analysis.util.CharArraySet; 23 | import org.apache.lucene.benchmark.quality.QualityQuery; 24 | import org.apache.lucene.benchmark.quality.QualityQueryParser; 25 | import org.apache.lucene.queryparser.classic.ParseException; 26 | import org.apache.lucene.queryparser.classic.QueryParser; 27 | import org.apache.lucene.queryparser.classic.QueryParserBase; 28 | import org.apache.lucene.search.BooleanClause; 29 | import org.apache.lucene.search.BooleanQuery; 30 | import org.apache.lucene.search.Query; 31 | 32 | /** 33 | * Parser for {@link QualityQuery}. 34 | */ 35 | public class ClefQQParser implements QualityQueryParser { 36 | 37 | private String fieldToSearch; 38 | private String qqFields[]; 39 | private String language; 40 | private String stemmer; 41 | private CharArraySet stopset; 42 | 43 | ThreadLocal<QueryParser> queryParser = new ThreadLocal<>(); 44 | 45 | public ClefQQParser( 46 | String qqFields[], 47 | String fieldToSearch, 48 | String language, 49 | String stemmer, 50 | CharArraySet stopset) { 51 | this.qqFields = qqFields; 52 | this.fieldToSearch = fieldToSearch; 53 | this.language=language; 54 | this.stemmer=stemmer; 55 | this.stopset=stopset; 56 | } 57 | 58 | public ClefQQParser( 59 | String qqField, 60 | String fieldToSearch, 61 | String language, 62 | String stemmer, 63 | CharArraySet stopset 64 | ) { 65 | this(new String[] { qqField }, fieldToSearch, language, stemmer, stopset); 66 | } 67 | 68 | @Override 69 | public Query parse(QualityQuery qq) throws ParseException { 70 | QueryParser qp = queryParser.get(); 71 | if (qp==null) { 72 | Analyzer analyzer = AnalyzerFactory.createAnalyzer( 73 | language, 74 | stemmer, 75 | stopset 76 | ); 77 | qp = new QueryParser(fieldToSearch, analyzer); 78 | queryParser.set(qp); 79 | } 80 | BooleanQuery bq = new BooleanQuery(); 81 | for (int i = 0; i < qqFields.length; 
i++) 82 | bq.add(qp.parse(QueryParserBase.escape(qq.getValue(qqFields[i]))), BooleanClause.Occur.SHOULD); 83 | 84 | return bq; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | 4 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> 5 | <layout class="ch.qos.logback.classic.PatternLayout"> 6 | <Pattern> 7 | %d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n 8 | </Pattern> 9 | </layout> 10 | </appender> 11 | 12 | <logger name="it.unipd.dei.ims.lucene.clef" level="info" 13 | additivity="false"> 14 | <appender-ref ref="STDOUT" /> 15 | </logger> 16 | 17 | <root level="error"> 18 | <appender-ref ref="STDOUT" /> 19 | </root> 20 | 21 | </configuration> -------------------------------------------------------------------------------- /systems/lucene/clef/src/main/resources/lucene-clef.properties: -------------------------------------------------------------------------------- 1 | #################### 2 | ### CLEF CORPORA ### 3 | #################### 4 | 5 | ## BULGARIAN ## 6 | bg.corpora=SEGA2002;STANDART2002 7 | SEGA2002.encoding=UTF-8 8 | STANDART2002.encoding=UTF-8 9 | bg.analyzerClass=org.apache.lucene.analysis.bg.BulgarianAnalyzer 10 | bg.corpus.size=69195 11 | 12 | 13 | ## GERMAN ## 14 | de.corpora=FRANKFURTER1994;SDA1994;SPIEGEL1994;SPIEGEL1995 15 | FRANKFURTER1994.encoding=ISO-8859-1 16 | SDA1994.encoding=ISO-8859-1 17 | SPIEGEL1994.encoding=ISO-8859-1 18 | SPIEGEL1995.encoding=ISO-8859-1 19 | de.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer 20 | de.corpus.size=225371 21 | 22 | 23 | ## SPANISH ## 24 | es.corpora=EFE1994;EFE1995 25 | EFE1994.encoding=ISO-8859-1 26 | EFE1995.encoding=ISO-8859-1 27 | es.analyzerClass=org.apache.lucene.analysis.es.SpanishAnalyzer 28 | 
es.corpus.size=454045 29 | 30 | 31 | ## PERSIAN ## 32 | fa.corpora=HAMSHAHRI 33 | HAMSHAHRI.encoding=UTF-8 34 | fa.analyzerClass=org.apache.lucene.analysis.fa.PersianAnalyzer 35 | fa.corpus.size=166774 36 | 37 | 38 | ## FINNISH ## 39 | fi.corpora=AAMULEHTI1994;AAMULEHTI1995 40 | AAMULEHTI1994.encoding=ISO-8859-1 41 | AAMULEHTI1995.encoding=ISO-8859-1 42 | fi.analyzerClass=org.apache.lucene.analysis.fi.FinnishAnalyzer 43 | fi.corpus.size=55344 44 | 45 | 46 | ## FRENCH ## 47 | fr.corpora=LEMONDE1994;LEMONDE1995;ATS1994;ATS1995 48 | LEMONDE1994.encoding=ISO-8859-1 49 | LEMONDE1995.encoding=ISO-8859-1 50 | ATS1994.encoding=ISO-8859-1 51 | ATS1995.encoding=ISO-8859-1 52 | fr.analyzerClass=org.apache.lucene.analysis.fr.FrenchAnalyzer 53 | fr.corpus.size=177452 54 | 55 | 56 | ## HUNGARIAN ## 57 | hu.corpora=MAGYAR2002 58 | MAGYAR2002.encoding=UTF-8 59 | hu.analyzerClass=org.apache.lucene.analysis.hu.HungarianAnalyzer 60 | hu.corpus.size=49530 61 | 62 | 63 | ## ITALIAN ## 64 | it.corpora=AGZ1994;AGZ1995;LASTAMPA1994 65 | AGZ1994.encoding=ISO-8859-1 66 | AGZ1995.encoding=ISO-8859-1 67 | LASTAMPA1994.encoding=US-ASCII 68 | it.analyzerClass=org.apache.lucene.analysis.it.ItalianAnalyzer 69 | it.corpus.size=157558 70 | 71 | 72 | ## DUTCH ## 73 | nl.corpora=ALGEMEEN1994;ALGEMEEN1995;NRC1994;NRC1995 74 | ALGEMEEN1994.encoding=ISO-8859-1 75 | ALGEMEEN1995.encoding=ISO-8859-1 76 | NRC1994.encoding=ISO-8859-1 77 | NRC1995.encoding=ISO-8859-1 78 | nl.analyzerClass=org.apache.lucene.analysis.nl.DutchAnalyzer 79 | nl.corpus.size=190604 80 | 81 | 82 | ## PORTUGUESE ## 83 | pt.corpora=FOLHA1994;FOLHA1995;PUBLICO1994;PUBLICO1995 84 | FOLHA1994.encoding=ISO-8859-1 85 | FOLHA1995.encoding=ISO-8859-1 86 | PUBLICO1994.encoding=ISO-8859-1 87 | PUBLICO1995.encoding=ISO-8859-1 88 | pt.analyzerClass=org.apache.lucene.analysis.pt.PortugueseAnalyzer 89 | pt.corpus.size=210734 90 | 91 | 92 | ## RUSSIAN ## 93 | ru.corpora=IZVESTIA1995 94 | IZVESTIA1995.encoding=UTF-8 95 | 
ru.analyzerClass=org.apache.lucene.analysis.ru.RussianAnalyzer 96 | ru.corpus.size=16716 97 | 98 | 99 | ## SWEDISH ## 100 | sv.corpora=TT1994;TT1995 101 | TT1994.encoding=UTF-8 102 | TT1995.encoding=UTF-8 103 | sv.analyzerClass=org.apache.lucene.analysis.sv.SwedishAnalyzer 104 | sv.corpus.size=142819 -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/java/it/unipd/dei/ims/lucene/clef/parser/ClefQQParserTest.java: -------------------------------------------------------------------------------- 1 | package it.unipd.dei.ims.lucene.clef.parser; 2 | 3 | import it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval; 4 | import org.apache.lucene.benchmark.quality.QualityQuery; 5 | import org.junit.Test; 6 | 7 | import java.io.File; 8 | 9 | /** 10 | * Test of the parser for the CLEF Topics. 11 | * 12 | * @author Emanuele Di Buccio 13 | */ 14 | public class ClefQQParserTest { 15 | 16 | public static String [] langs = { 17 | "bg", 18 | "de", 19 | "es", 20 | "fa", 21 | "fi", 22 | "fr", 23 | "hu", 24 | "it", 25 | "nl", 26 | "pt", 27 | "ru", 28 | "sv" 29 | }; 30 | 31 | public static String [] topicFields = { 32 | "title", 33 | "description" 34 | }; 35 | 36 | @Test 37 | public void testClefTopicParser(){ 38 | 39 | for (String lang : langs){ 40 | 41 | ClassLoader classLoader = getClass().getClassLoader(); 42 | File topicFile = new File(classLoader.getResource("topics/"+lang+"_topics.xml").getFile()); 43 | 44 | try { 45 | QualityQuery[] qqs = BatchRetrieval.getQualityQueries(topicFile.getAbsolutePath(), topicFields); 46 | System.out.println("LANGUAGE: "+lang); 47 | for (QualityQuery qq : qqs){ 48 | System.out.println(qq.getQueryID()); 49 | } 50 | 51 | } catch (Exception e) { 52 | e.printStackTrace(); 53 | } 54 | 55 | } 56 | 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/bg_topics.xml: 
-------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <topics> 3 | <topic> 4 | <identifier>251-AH</identifier> 5 | <title>Алтернативна медицина 6 | Намерете документи, които дискутират някакъв вид алтернативно или природно медицинско лечение, включително специфичните терапии, като акупунктура, хомеопатия, хиропраксис и др. 7 | Подходящите документи ще осигурят обща или специфична информация относно употребата на естествени или алтернативни медицински лечения и практики. 8 | 9 | 10 | 252-AH 11 | Пенсионните схеми в Европа 12 | Намерете документи, които дават информация за съвременните пенсионни системи и привилегии в европейска държава. 13 | Подходящите документи ще съдържат информация за съвременни пенсионни схеми и привилегии в отделните европейски страни. Интересуващите ни документи включват минималната и максималната възраст за пенсиониране и начина, по който пенсионният доход се изчислява. Документи, които дискутират бъдещи реформи в пенсионното дело, не са подходящи. 14 | 15 | 16 | 253-AH 17 | Държави, в които е разрешено смъртното наказание 18 | В кои държави или щати смъртното наказание се практикува или поне е разрешено от конституцията? 19 | Явно е, че документи, които експлицитно декларират, че конституцията на даден щат разрешава смъртно наказание, са подходящи. Документите, които споменават конкретни смъртни присъди също са подходящи, но при условие, че споменават и щата или страната експлицитно. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/de_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pestizide in Babykost 6 | Berichte über Pestizide in Babynahrung sind gesucht. 7 | Die Dokumente informieren über die Entdeckung von Pestiziden in Babynahrung. 
Sie berichten über die verschiedenen Marken, Supermärkte und Firmen, die Babykost mit Pestiziden angeboten haben. Sie berichten auch über Maßnahmen gegen die Verunreinigung von Babynahrung mit Pestiziden. 8 | 9 | 10 | 42-AH 11 | UN/US-Invasion Haitis 12 | UNO/USA entsenden Truppen nach Haiti. 13 | Die Dokumente berichten sowohl über die Diskussion über die Entscheidung zur Entsendung von Blauhelmtruppen der USA als auch über die Invasion selbst. Sie informieren auch über die unmittelbaren Folgen des Einmarschs. 14 | 15 | 16 | 43-AH 17 | El Niño und das Wetter 18 | Suche Berichte, die das Phänomen El Niño und seine Auswirkungen auf das Weltwetter (einschließlich seiner Folgen für Temperatur, Luftdruck, Niederschlag usw.) erklären. 19 | Relevante Dokumente informieren über die Auswirkungen von El Niño. Im Hinblick auf dieses Phänomen ist die Wechselwirkung von Weltmeeren und Erdatmosphäre von Interesse. Besonders wichtig ist die Rolle von El Niño im Südpazifik aufgrund dessen Einfluss auf das Weltklima. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/es_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pesticidas en alimentos para bebes 6 | Encontrar noticias sobre pesticidas en alimentos para bebes. 7 | Los documentos relevantes proporcionan información sobre el descubrimiento de pesticidas en alimentos para bebes. Se informa sobre diferentes marcas, supermercados y compañías que ofrecieron alimentos para bebes que contenian pesticidas. Se discuten también medidas contra la contaminación de alimentos para bebes con pesticidas. 8 | 9 | 10 | 42-AH 11 | Naciones Unidas y Estados Unidos invaden Haití 12 | Encontrar documentos sobre la invasión de Haití por los soldados de la ONU y de los Estados Unidos. 
13 | Los documentos comentan tanto la discusión sobre la decisión de la ONU de enviar las tropas americanas a Haití, como la invasión misma. Se habla también de sus consecuencias directas. 14 | 15 | 16 | 43-AH 17 | "El Niño" y el tiempo 18 | Encontrar noticias que expliquen el fenómeno de "El Niño" y su repercusión en el clima del planeta (incluidos los efectos que tiene sobre la temperatura, presión atmosférica, precipitaciones, etc.). 19 | Los documentos relevantes proporcionarán información sobre los efectos de "El Niño". Las interacciones entre los océanos y la atmósfera de la Tierra son interesantes en relación con este fenómeno. "El Niño" es especialmente importante en el Pacífico Sur debido a su influencia sobre el clima mundial. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/fa_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 551-AH 5 | تنيس جام ويمبلدون 6 | نام برندگان جام 7 | فاتحان مرد و زن جام ويمبلدون چه كساني بوده اند 8 | 9 | 10 | 552-AH 11 | بازار بورس تهران 12 | نمونه هايي از شاخص هاي بازار بورس تهران 13 | نوسانات شاخص هاي بازار بورس، سهام صدرنشين در اين بازار، مسايل و چالشهاي احتمالي كه بازار با آن مواجه بوده است 14 | 15 | 16 | 553-AH 17 | جام جهاني 2002 18 | برنده جام جهاني فوتبال در سال 2002 19 | آيا ايران به اين جام راه يافت؟ چه رتبه اي در آسيا و جام به دست آورد؟ تيمهاي راه يافته به مرحله نهايي مربوط به كدام كشورها بودند؟ 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/fi_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 92-AH 5 | Irakin vastaiset pakotteet 6 | Mihin toimenpiteisiin Irak on ryhtynyt poistaakseen YK:n talouspakotteet? Pakotteet astuivat voimaan seurauksena Irakin hyökkäyksestä Kuwaitiin vuonna 1990. 
7 | Dokumenttien tulee sisältää tietoa toimista, joilla Irak on yrittänyt poistaa pakotteet. Ainoastaan pakotteita koskevat selitykset tai retoriikka sanktioita vastaan eivät ole relevantteja. Kuwaitiin kohdistunutta hyökkäystä pahoittelevat viralliset anteeksipyynnöt Irakin taholta ovat myös relevantteja. 8 | 9 | 10 | 94-AH 11 | Solzhenitsynin paluu. 12 | Etsi dokumentteja jotka kertovat kirjallisuuden Nobel-palkinnon voittajan Solzhenitsynin paluusta Venäjälle. 13 | Relevanteissa dokumenteissa kerrotaan Solzhenitsynin Venäjälle paluun syistä ja ajankohdasta. Dokumenteissa voidaan myös keskustella syistä hänen maahanmuuttoonsa USA:han. 14 | 15 | 16 | 95-AH 17 | Palestiinan konflikti 18 | Etsi artikkeleita, jotka käsittelevät aseellisia konflikteja Palestiinalaisalueilla ja joissa osapuolena on siviiliväestöä. 19 | Relevanteissa dokumenteissa on tietoa äskeisistä tapahtumista israelilais-paletiinalaisessa konfliktissa. Raportit, joissa kerrotaan itsemurhaiskuista otetaan lukuun. 20 | 21 | 22 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/fr_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 251-AH 5 | Médecines douces 6 | Trouver des documents parlant de toutes sortes de traitements médicaux alternatifs ou naturels comprenant des thérapies spécifiques telles que l'acupuncture, l'homéopathie, la chiropractie, entre autres. 7 | Les documents pertinents devront fournir des informations générales ou spécifiques sur les traitements ou les pratiques des médecines dites douces, alternatives ou naturelles. 8 | 9 | 10 | 252-AH 11 | Régimes de retraite en Europe 12 | Trouver des documents donnant des informations sur les régimes et indemnités de retraite aujourd'hui dans n'importe quel pays européen. 13 | Les documents pertinents fourniront des informations sur les régimes et indemnités de retraite dans un pays européen donné. 
Les documents pertinents devront contenir l'âge requis (au minimum et au maximum) pour bénéficier de la retraite ainsi que le mode de calcul du montant de la pension. Les documents évoquant de futurs projets de réformes ne sont pas pertinents. 14 | 15 | 16 | 253-AH 17 | Pays appliquant la peine de mort 18 | Dans quels pays ou Etats la peine de mort est-elle toujours appliquée ou du moins autorisée par la Constitution ? 19 | Les documents pertinents doivent expliquer que la Constitution d'un Etat donné autorise la peine de mort. Les documents rapportant explicitement des condamnations à la peine capitale sont pertinents si l'Etat ou le pays concerné est clairement mentionné. 20 | 21 | 22 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/hu_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 251-AH 5 | Alternatív gyógyászat 6 | Keressünk cikkeket alternatív vagy természetes gyógymódokról ill. speciális terápiákról, mint például az akupunktúra, a homeopátia vagy a csontkovácsolás. 7 | A megfelelő cikkek a természetes vagy alternatív gyógyászati kezelésekről vagy praktikákról írnak általában vagy konkrétan. 8 | 9 | 10 | 252-AH 11 | Európai nyugdíjrendszerek 12 | Keressünk cikkeket, melyek valamely európai ország jelenlegi nyugdíjrendszeréről vagy nyugdíjasokra vonatkozó járulékairól szólnak. 13 | A megfelelő cikkek az egyes európai országok nyugdíjrendszeréről, illetve a nyugdíjaskorúakra vonatkozó járulékokról tartalmaznak információkat. Számunkra érdekes adat a nyugdíjba vonulás lehetséges alsó, illetve kötelező felső korhatára, valamint a nyugdíj mértékének kiszámítási módja is. A jövőbeni nyugdíjreformokra vonatkozó tervek nem érdekesek. 14 | 15 | 16 | 253-AH 17 | A halálbüntetés elterjedtsége 18 | Mely országokban vagy államokban alkalmazzák még a halálbüntetést ill. teszi lehetővé annak alkalmazását az alkotmány? 
19 | Nem csak azok a cikkek relevánsak, amelyekben tételesen szerepel, hogy az adott ország vagy állam alkotmánya lehetővé teszi a halálbüntetést, hanem azok is, amelyek olyan konkrét esetekről szólnak (az ország vagy állam nevének feltüntetésével), amelyekben halálbüntetést róttak ki. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/it_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pesticidi negli alimenti per bambini 6 | Trova documenti che parlano dei pesticidi negli alimenti per bambini. 7 | I documenti rilevanti forniscono informazioni sulla scoperta di pesticidi nei cibi per bambini. Riportano i diversi marchi, i supermercati e le ditte che hanno venduto alimenti per bambini con i pesticidi. Sono anche rilevanti i documenti che discutono le misure contro la contaminazione degli alimenti per bambini con i pesticidi. 8 | 9 | 10 | 42-AH 11 | Invasione ONU/USA di Haiti 12 | Reperisci documenti sull'invasione di Haiti da parte delle truppe ONU/USA. 13 | I documenti rilevanti contengono sia le discussioni relative alla decisione di spedire i caschi blu degli Stati Uniti d'America a Haiti, sia dell'invasione stessa. Sono rilevanti anche le conseguenze dirette dell'invasione. 14 | 15 | 16 | 44-AH 17 | Indurain vince il Tour 18 | Reazioni al quarto Tour de France vinto da Miguel Indurain. 19 | I documenti rilevanti commentano le reazioni alla quarta vittoria consecutiva di Miguel Indurain al Tour de France. Sono anche rilevanti i documenti che discutono dell'importanza di questo ciclista nel ciclismo mondiale dopo questa vittoria. 
20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/nl_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41-AH 5 | Pesticide in babyvoeding 6 | Zoek naar documenten over pesticide in babyvoeding. 7 | Deze documenten geven informatie over ontdekkingen van pesticide in babyvoeding. Het gaat hierbij om producenten, merken en supermarkten die verontreinigde voeding hebben aangeboden. De informatie gaat ook over de maatregelen die tegen de verontreiniging van babyvoeding met pesticide zijn genomen. 8 | 9 | 10 | 42-AH 11 | UN/US invasie op Haïti 12 | UNO/USA sturen troepen naar Haïti. 13 | Deze documenten geven niet alleen informatie over de invasie maar ook over de discussies en beslissingen van de UN die ten grondslag lagen aan het sturen van Amerikaanse blauwhelmen naar Haïti. Ook wordt informatie gegeven over de directe gevolgen van de invasie. 14 | 15 | 16 | 43-AH 17 | El Niño en de voorspelling van het weer 18 | Zoek documenten die een verklaring geven voor het fenomeen El Niño en de weerslag daarvan op het weer op aarde (inclusief het effect op temperatuur, atmosferische druk, neerslag, enz.). 19 | Relevante documenten dienen informatie te bevatten over de gevolgen van het fenomeen El Niño. De wisselwerkingen tussen de oceanen en de aardatmosfeer die verband houden met het fenomeen El Niño komen eveneens in aanmerking. El Niño is vooral van belang in het zuiden van de Stille Oceaan omwille van de invloed die deze uitoefent op het klimaat op aarde. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/pt_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 251-AH 5 | Medicina alternativa 6 | Encontrar documentos sobre tratamentos que empreguem medicina natural ou alternativa. 
Aqui são incluídas terapias como a acupuntura, a hemopatia, a quiroprática, entre outras. 7 | Documentos relevantes devem fornecer informação, específica ou genérica, sobre o uso de tratamentos ou técnicas de medicina natural ou alternativa. 8 | 9 | 10 | 252-AH 11 | Sistemas de reforma e pensões na Europa 12 | Encontrar documentos sobre os esquemas de pensões e benefícios na reforma em qualquer país europeu. 13 | Documentos relevantes devem conter informação sobre os actuais esquemas de pensões em estados europeus individuais. Informação de interesse engloba as idades de reforma mínima e máxima, assim como a forma de calcular o valor das pensões de reforma. Planos de reformulação dos esquemas de reforma não são relevantes. 14 | 15 | 16 | 253-AH 17 | Países com pena de morte 18 | Em quais países ainda é usada, ou pelo menos constitucionalmente permitida, a pena de morte? 19 | Documentos que afirmem de forma explícita que a constituição de um dado estado permite a pena capital são obviamente relevantes. Documentos de relatos de sentenças de morte específicas também são relevantes caso o estado ou país em questão seja explicitamente mencionado. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/ru_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 143-AH 5 | Конференция по положению Женщин в Пекине 6 | Спорные позиции ряда делегаций поставили Всемирную Конференцию по положению Женщин в Пекине на гране провала. 7 | В релевантных документах должны обсуждаться любые из многочисленных проблем и разногласий, возникших в связи с Конференцией по положению Женщин в Пекине. Особый интерес представляют позиции представителей Ватикана, мусульманских общин и Китайской Коммунистической Партии. 
8 | 9 | 10 | 147-AH 11 | Нефтяные аварии и Птицы 12 | Найти документы, в которых описывается вред и урон, наносимый птицам в результате утечек нефти или загрязнения. 13 | Релевантны документы, в которых говорится об уроне, наносимом птицам в результате аварийных утечек нефти. Сообщения о вреде, наносимом выбросами трюмной воды или выбросами нефти не релевантны. 14 | 15 | 16 | 148-AH 17 | Разрушение озонового слоя 18 | Какие озоновые дыры не вызваны загрязнением воздуха? 19 | Не все повреждения озонового слоя вызваны загрязнением воздуха. В релевантных документах должно говориться об иных причинах появления дыр в озоновом слое. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef/src/test/resources/topics/sv_topics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 91-AH 5 | Amnesty International i Latinamerika 6 | Rapporter från Amnesty International om mänskliga rättigheter i Latinamerika. 7 | Relevanta dokument bör ge läsarna information om Amnesty Internationals rapporter beträffande mänskliga rättigheter i Latinamerika eller om reaktioner på dessa rapporter. 8 | 9 | 10 | 92-AH 11 | FN-sanktioner mot Irak 12 | Vilka åtgärder har Irak vidtagit för att FN ska upphäva det ekonomiska embargo och de politiska sanktioner som landet belagts med efter invasionen av Kuwait 1990? 13 | Dokumenten måste innehålla olika åtgärder från Iraks sida för att försöka få sanktionerna upphävda. Enbart beskrivningar av sanktionerna eller retorik emot sanktionerna är inte relevanta. Beklaganden från officiellt irakiskt håll av invasionen av Kuwait är relevanta. 14 | 15 | 16 | 93-AH 17 | Eurofighter 18 | Sök efter dokument som rapporterar om projektet EFA eller projektet "Eurofighter". 19 | Relevanta dokument redogör för projektet EFA. Samarbetsländerna Tyskland, Storbritannien, Italien och Spanien har inlett ett "Eurofighter"-konsortium. 
Information om arbetsfördelning mellan de ingående försvarskoncernerna samt även kostnadsuppskattningar är av intresse. 20 | 21 | -------------------------------------------------------------------------------- /systems/lucene/clef_experiments.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | source ../common.sh 4 | 5 | # This script allows experiments based on Apache Lucene and CLEF collections to be replicated. 6 | 7 | ## declare the langs array 8 | declare -a langs=("bg" "de" "es" "fa" "fi" "fr" "hu" "it" "nl" "pt" "ru" "sv"); 9 | 10 | ## declare the rank models array 11 | declare -a models=("BM25"); 12 | 13 | topicFields="title;description"; 14 | 15 | if [ "$1" == "-h" ]; then 16 | printf "Usage: `basename $0` \n\n" >&2 17 | printf "Input parameters: \n" >&2 18 | printf "'-l': the language expressed as ISO 639-1 (two-letter codes); e.g. nl for Dutch or it for Italian.\n" >&2 19 | echo "Valid languages for CLEF experiments are: ${langs[@]}"; 20 | printf "'-cp': the path where the document collections are stored. \n" >&2 21 | printf "'-stm': the stemmer specification; y for 'yes' or n for 'no'. \n" >&2 22 | printf "'-sl': the stopword list specification; y for 'yes' or n for 'no'. \n" >&2 23 | printf "'-r': the ranking model specification; e.g. BM25 for bm25. \n" >&2 24 | echo "Allowed models for CLEF experiments and Apache Lucene 5.2.1 are: ${models[@]}"; 25 | printf "'-v': verbose mode. 
\n" >&2 26 | exit 0 27 | else 28 | # check the input parameters 29 | if [ $# -lt 10 ]; then 30 | echo "You must specify all the input parameters: -l, -cp, -stm, -sl, -r" 1>&2 31 | exit 1 32 | else 33 | echo "Starting the execution."; 34 | if [ "$1" == "-l" ]; then 35 | lang=$2; 36 | tmp=false; 37 | for i in "${langs[@]}" 38 | do 39 | if [ "$i" == "$lang" ] ; then 40 | tmp=true; 41 | break; 42 | fi 43 | done 44 | 45 | if [ "$tmp" = false ]; then 46 | printf "first parameter must be the language -l expressed as ISO 639-1 (two-letter codes)\n"; 47 | echo "Valid languages for CLEF experiments are: ${langs[@]}"; 48 | exit 1 49 | fi 50 | 51 | else 52 | echo "The first parameter must be the language -l expressed as ISO 639-1 (two-letter codes)" 53 | exit 1 54 | fi 55 | 56 | if [ "$3" == "-cp" ]; then 57 | collection_path=$4; 58 | else 59 | echo "The second parameter must be the collection path -cp" 60 | exit 1 61 | fi 62 | 63 | if [ "$5" == "-stm" ]; then 64 | if [ "$6" == "y" ]; then 65 | stm=true; 66 | elif [ "$6" == "n" ]; then 67 | stm=false; 68 | else 69 | printf "The value %s is not valid for -stm \n" "$6" >&2 70 | printf "For the -stm parameter you must specify y for yes or n for no \n" >&2 71 | exit 1 72 | fi 73 | else 74 | echo "The third parameter must be stemmer specification -stm (y = yes; n = no)" 75 | exit 1 76 | fi 77 | 78 | if [ "$7" == "-sl" ]; then 79 | if [ "$8" == "y" ]; then 80 | sl=true; 81 | elif [ "$8" == "n" ]; then 82 | sl=false; 83 | else 84 | printf "The value %s is not valid for -sl \n" "$8" >&2 85 | printf "For the -sl parameter you must specify y for yes or n for no \n" >&2 86 | exit 1 87 | fi 88 | else 89 | echo "The fourth parameter must be stopword list specification -sl (y = yes; n = no)" 90 | exit 1 91 | fi 92 | 93 | if [ "$9" == "-r" ]; then 94 | rank_model=${10}; 95 | 96 | tmp=false; 97 | for i in "${models[@]}" 98 | do 99 | if [ "$i" == "$rank_model" ] ; then 100 | tmp=true; 101 | break; 102 | fi 103 | done 104 | 105 | if [ "$tmp" = 
false ]; then 106 | printf "The last parameter must be the ranking model.\n"; 107 | echo "Allowed models for CLEF experiments and Apache Lucene 5.2.1 are: ${models[@]}"; 108 | exit 1 109 | fi 110 | else 111 | echo "The fifth parameter must be the ranking model specification -r (e.g. BM25)" 112 | exit 1 113 | fi 114 | 115 | if [ "${11}" == "-v" ]; then 116 | verbose=true; 117 | else 118 | verbose=false; 119 | fi 120 | 121 | fi 122 | fi 123 | 124 | LUCENE_PATH=$(pwd); 125 | cd ../../ 126 | ROOT_PATH=$(pwd); 127 | cd $LUCENE_PATH 128 | 129 | ## path to the topics file 130 | topics=$ROOT_PATH/topics-and-qrels/CLEF/topics/"$lang"_topics.xml; 131 | 132 | ## path to the qrels file 133 | qrels=$ROOT_PATH/topics-and-qrels/CLEF/qrels/"$lang"_qrels.txt; 134 | 135 | ## create required folders 136 | 137 | mkdir -p $ROOT_PATH/results/CLEF/lucene/"$lang"; 138 | 139 | mkdir -p $ROOT_PATH/runs/CLEF/lucene/"$lang"; 140 | 141 | indexDir=$lang; 142 | 143 | if [ "$sl" = true ]; then 144 | stopsetType="CUSTOM"; 145 | stoplist=$ROOT_PATH/resources/CLEF/"$lang"_sl.txt; 146 | stoplistOpt="-Dstopset.type=$stopsetType -Dstopset.path=$stoplist"; 147 | indexDir="$indexDir"_Stopword; 148 | else 149 | stoplistOpt="-Dstopset.type=\"EMPTY\""; 150 | fi 151 | 152 | if [ "$stm" = true ]; then 153 | indexDir="$indexDir"_Stemmer; 154 | stemmer="DEFAULT"; 155 | else 156 | stemmer="NONE"; 157 | fi 158 | 159 | runTag="$indexDir"_"$rank_model"; 160 | 161 | runFile=$ROOT_PATH/runs/CLEF/lucene/"$lang"/"$runTag".txt; 162 | 163 | indexDir="${LUCENE_PATH}/indexes/$indexDir"; 164 | 165 | OPTIONS="-Dindex.path=$indexDir -Dcorpora.path=$collection_path -Dstemmer=$stemmer -Dlanguage=$lang $stoplistOpt"; 166 | 167 | 168 | ## do the index, if it does not exist 169 | if [ -d "$indexDir" ]; then 170 | if [ "$verbose" = true ]; then 171 | printf "The index already exists in %s \n" "$indexDir" >&2 172 | fi 173 | else 174 | java -jar $OPTIONS clef/target/lucene-clef-1.0-jar-with-dependencies.jar -i 175 | fi 176 | 177 
| 178 | if [ "$verbose" = true ]; then 179 | printf "Performing the retrieval with the ranking model %s \n" "$rank_model" >&2 180 | fi 181 | 182 | 183 | ## do the retrieval 184 | 185 | OPTIONS="$OPTIONS -Drun.model=$rank_model -Drun.path=$runFile -Drun.tag=$runTag -Dtopics.path=$topics -Dtopics.fields=$topicFields"; 186 | 187 | java -jar $OPTIONS clef/target/lucene-clef-1.0-jar-with-dependencies.jar -r 188 | 189 | ${TREC_EVAL} -q -c -M1000 $qrels $runFile>${ROOT_PATH}/results/CLEF/lucene/"$lang"/"$runTag".txt 190 | 191 | -------------------------------------------------------------------------------- /systems/lucene/clef_runs: -------------------------------------------------------------------------------- 1 | de y y BM25 2 | es y y BM25 3 | fa y y BM25 4 | fr y y BM25 5 | hu y y BM25 6 | it y y BM25 7 | nl y y BM25 8 | ru y y BM25 9 | sv y y BM25 10 | -------------------------------------------------------------------------------- /systems/lucene/dotgov2.sh: -------------------------------------------------------------------------------- 1 | source ../common.sh 2 | echo "Compiling ingester project..." 3 | cd ingester 4 | mvn clean compile assembly:single 5 | cd .. 6 | 7 | maxmemory="-Xmx15G" 8 | 9 | echo "Starting indexing..." 10 | #rm -rf gov2.lucene 11 | 12 | # Counts index 13 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.TrecIngester -dataDir $GOV2_LOCATION -indexPath gov2.lucene.cnt -threadCount 32 -docCountLimit -1 14 | 15 | #Force merge 16 | echo "Force merging..." 
17 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.ForceMerge gov2.lucene.cnt/index 18 | 19 | # Positional index 20 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.TrecIngester -dataDir $GOV2_LOCATION -indexPath gov2.lucene.pos -positions -threadCount 32 -docCountLimit -1 21 | 22 | echo "Force merging..." 23 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.ForceMerge gov2.lucene.pos/index 24 | 25 | 26 | for index in "cnt" "pos" 27 | do 28 | echo "Evaluation index ${index}" 29 | for queries in "701-750" "751-800" "801-850" 30 | do 31 | query_file=$TOPICS_QRELS/topics.${queries}.txt 32 | qrel_file=$TOPICS_QRELS/qrels.${queries}.txt 33 | run_file=submission_${queries}_${index}.txt 34 | stat_file=submission_${queries}_${index}.log 35 | eval_file=submission_${queries}_${index}.eval 36 | 37 | java $maxmemory -cp lib/lucene-core-5.2.1.jar:lib/lucene-backward-codecs-5.2.1.jar:lib/lucene-analyzers-common-5.2.1.jar:lib/lucene-benchmark-5.2.1.jar:lib/lucene-queryparser-5.2.1.jar:.:ingester/target/ingester-0.0.1-SNAPSHOT-jar-with-dependencies.jar luceneingester.TrecDriver ${query_file} ${qrel_file} ${run_file} gov2.lucene.${index}/index T > ${stat_file} 38 | 39 | ${TREC_EVAL} ${qrel_file} ${run_file} > ${eval_file} 40 | done 41 | done 42 | -------------------------------------------------------------------------------- 
/systems/lucene/ingester/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | ingester 4 | ingester 5 | 0.0.1-SNAPSHOT 6 | Lucene/Solr Benchmarks 7 | 8 | 9 | 10 | 11 | org.apache.maven.plugins 12 | maven-compiler-plugin 13 | 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | maven-assembly-plugin 20 | 21 | 22 | 23 | fully.qualified.MainClass 24 | 25 | 26 | 27 | jar-with-dependencies 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.apache.lucene 37 | lucene-benchmark 38 | 5.2.1 39 | 40 | 41 | org.apache.lucene 42 | lucene-backward-codecs 43 | 5.2.1 44 | 45 | 46 | org.apache.lucene 47 | lucene-queryparser 48 | 5.2.1 49 | 50 | 51 | org.apache.lucene 52 | lucene-analyzers-common 53 | 5.2.1 54 | 55 | 56 | org.apache.lucene 57 | lucene-core 58 | 5.2.1 59 | 60 | 61 | 62 | org.apache.solr 63 | solr-solrj 64 | 5.2.1 65 | 66 | 67 | commons-logging 68 | commons-logging 69 | 1.2 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/Args.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.util.ArrayList; 21 | import java.util.HashMap; 22 | import java.util.List; 23 | import java.util.Map; 24 | 25 | public class Args { 26 | private final String[] args; 27 | private final Map used = new HashMap(); 28 | 29 | public Args(String[] args) { 30 | this.args = args; 31 | } 32 | 33 | public String getString(String argName) { 34 | for(int upto=0;upto getStrings(String argName) { 48 | List values = new ArrayList(); 49 | for(int upto=0;upto numTotalDocs) { 172 | break; 173 | } 174 | if ((docCount % 100000) == 0) { 175 | System.out.println("Indexer: " + docCount + " docs... (" + (System.currentTimeMillis() - tStart)/1000.0 + " sec)"); 176 | } 177 | w.addDocument(doc); 178 | } 179 | 180 | } catch (Exception e) { 181 | failed.set(true); 182 | throw new RuntimeException(e); 183 | } finally { 184 | stopLatch.countDown(); 185 | } 186 | } 187 | } 188 | 189 | private static class IngestRatePrinter extends Thread { 190 | 191 | private final AtomicInteger count; 192 | private final AtomicBoolean stop; 193 | public IngestRatePrinter(AtomicInteger count, AtomicBoolean stop){ 194 | this.count = count; 195 | this.stop = stop; 196 | } 197 | 198 | @Override 199 | public void run() { 200 | long time = System.currentTimeMillis(); 201 | System.out.println("startIngest: " + time); 202 | final long start = time; 203 | int lastCount = count.get(); 204 | while(!stop.get()) { 205 | try { 206 | Thread.sleep(200); 207 | } catch(Exception ex) { 208 | } 209 | int numDocs = count.get(); 210 | 211 | double current = numDocs - lastCount; 212 | long now = System.currentTimeMillis(); 213 | double seconds = (now-time) / 1000.0d; 214 | System.out.println("ingest: " + (current / seconds) + " " + (now - start)); 215 | time = now; 216 | lastCount = numDocs; 217 | } 218 | } 219 | } 220 | } 221 | 
-------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/NoPositionsTextField.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | import org.apache.lucene.document.Field; 4 | import org.apache.lucene.document.FieldType; 5 | import org.apache.lucene.index.IndexOptions; 6 | 7 | /** A tokenized field with stored=false and without positions (only frequencies) */ 8 | 9 | public final class NoPositionsTextField extends Field { 10 | 11 | public static final FieldType TYPE_NOT_STORED = new FieldType(); 12 | 13 | static { 14 | TYPE_NOT_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS); 15 | TYPE_NOT_STORED.setTokenized(true); 16 | TYPE_NOT_STORED.freeze(); 17 | } 18 | 19 | public NoPositionsTextField(String name, String value) { 20 | super(name, value, TYPE_NOT_STORED); 21 | } 22 | } -------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/TrecDriver.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | import java.io.OutputStreamWriter; 4 | import java.io.PrintWriter; 5 | import java.nio.charset.Charset; 6 | import java.nio.charset.StandardCharsets; 7 | import java.nio.file.Files; 8 | import java.nio.file.Path; 9 | import java.nio.file.Paths; 10 | import java.util.HashSet; 11 | import java.util.Set; 12 | 13 | import org.apache.lucene.analysis.en.EnglishAnalyzer; 14 | import org.apache.lucene.benchmark.quality.Judge; 15 | import org.apache.lucene.benchmark.quality.QualityBenchmark; 16 | import org.apache.lucene.benchmark.quality.QualityQuery; 17 | import org.apache.lucene.benchmark.quality.QualityQueryParser; 18 | import org.apache.lucene.benchmark.quality.QualityStats; 19 | import org.apache.lucene.benchmark.quality.trec.QueryDriver; 20 | import 
org.apache.lucene.benchmark.quality.trec.TrecJudge; 21 | import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader; 22 | import org.apache.lucene.benchmark.quality.utils.SubmissionReport; 23 | import org.apache.lucene.index.DirectoryReader; 24 | import org.apache.lucene.index.IndexReader; 25 | import org.apache.lucene.queryparser.classic.ParseException; 26 | import org.apache.lucene.queryparser.classic.QueryParser; 27 | import org.apache.lucene.queryparser.classic.QueryParserBase; 28 | import org.apache.lucene.search.BooleanClause; 29 | import org.apache.lucene.search.BooleanQuery; 30 | import org.apache.lucene.search.IndexSearcher; 31 | import org.apache.lucene.search.Query; 32 | import org.apache.lucene.search.similarities.BM25Similarity; 33 | import org.apache.lucene.store.FSDirectory; 34 | 35 | public class TrecDriver extends QueryDriver { 36 | public static void main(String[] args) throws Exception { 37 | 38 | if (args.length < 4 || args.length > 5) { 39 | System.err.println("Usage: QueryDriver [querySpec]"); 40 | System.err.println("topicsFile: input file containing queries"); 41 | System.err.println("qrelsFile: input file containing relevance judgements"); 42 | System.err.println("submissionFile: output submission file for trec_eval"); 43 | System.err.println("indexDir: index directory"); 44 | System.err.println("querySpec: string composed of fields to use in query consisting of T=title,D=description,N=narrative:"); 45 | System.err.println("\texample: TD (query on Title + Description). The default is T (title only)"); 46 | System.exit(1); 47 | } 48 | 49 | Path topicsFile = Paths.get(args[0]); 50 | Path qrelsFile = Paths.get(args[1]); 51 | Path submissionFile = Paths.get(args[2]); 52 | SubmissionReport submitLog = new SubmissionReport(new PrintWriter(Files.newBufferedWriter(submissionFile, StandardCharsets.UTF_8)), "lucene"); 53 | FSDirectory dir = FSDirectory.open(Paths.get(args[3])); 54 | String fieldSpec = args.length == 5 ? 
args[4] : "T"; // default to Title-only if not specified. 55 | IndexReader reader = DirectoryReader.open(dir); 56 | IndexSearcher searcher = new IndexSearcher(reader); 57 | searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f)); 58 | 59 | int maxResults = 1000; 60 | String docNameField = "docname"; 61 | 62 | PrintWriter logger = new PrintWriter(new OutputStreamWriter(System.out, Charset.defaultCharset()), true); 63 | 64 | // use trec utilities to read trec topics into quality queries 65 | TrecTopicsReader qReader = new TrecTopicsReader(); 66 | QualityQuery qqs[] = qReader.readQueries(Files.newBufferedReader(topicsFile, StandardCharsets.UTF_8)); 67 | 68 | // prepare judge, with trec utilities that read from a QRels file 69 | Judge judge = new TrecJudge(Files.newBufferedReader(qrelsFile, StandardCharsets.UTF_8)); 70 | 71 | // validate topics & judgments match each other 72 | judge.validateData(qqs, logger); 73 | 74 | Set fieldSet = new HashSet<>(); 75 | if (fieldSpec.indexOf('T') >= 0) fieldSet.add("title"); 76 | if (fieldSpec.indexOf('D') >= 0) fieldSet.add("description"); 77 | if (fieldSpec.indexOf('N') >= 0) fieldSet.add("narrative"); 78 | 79 | // set the parsing of quality queries into Lucene queries. 80 | QualityQueryParser qqParser = new EnglishQQParser(fieldSet.toArray(new String[0]), "body"); 81 | 82 | // run the benchmark 83 | QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField); 84 | qrun.setMaxResults(maxResults); 85 | QualityStats stats[] = qrun.execute(judge, submitLog, logger); 86 | 87 | // print an avarage sum of the results 88 | QualityStats avg = QualityStats.average(stats); 89 | avg.log("SUMMARY", 2, logger, " "); 90 | reader.close(); 91 | dir.close(); 92 | } 93 | } 94 | 95 | class EnglishQQParser implements QualityQueryParser { 96 | 97 | private String qqNames[]; 98 | private String indexField; 99 | ThreadLocal queryParser = new ThreadLocal<>(); 100 | 101 | /** 102 | * Constructor of a simple qq parser. 
103 | * @param qqNames name-value pairs of quality query to use for creating the query 104 | * @param indexField corresponding index field 105 | */ 106 | public EnglishQQParser(String qqNames[], String indexField) { 107 | this.qqNames = qqNames; 108 | this.indexField = indexField; 109 | } 110 | 111 | /** 112 | * Constructor of a simple qq parser. 113 | * @param qqName name-value pair of quality query to use for creating the query 114 | * @param indexField corresponding index field 115 | */ 116 | public EnglishQQParser(String qqName, String indexField) { 117 | this(new String[] { qqName }, indexField); 118 | } 119 | 120 | /* (non-Javadoc) 121 | * @see org.apache.lucene.benchmark.quality.QualityQueryParser#parse(org.apache.lucene.benchmark.quality.QualityQuery) 122 | */ 123 | @Override 124 | public Query parse(QualityQuery qq) throws ParseException { 125 | QueryParser qp = queryParser.get(); 126 | if (qp==null) { 127 | qp = new QueryParser(indexField, new EnglishAnalyzer()); 128 | queryParser.set(qp); 129 | } 130 | BooleanQuery bq = new BooleanQuery(); 131 | for (int i = 0; i < qqNames.length; i++) 132 | bq.add(qp.parse(QueryParserBase.escape(qq.getValue(qqNames[i]))), BooleanClause.Occur.SHOULD); 133 | 134 | return bq; 135 | } 136 | 137 | } 138 | 139 | -------------------------------------------------------------------------------- /systems/lucene/ingester/src/main/java/luceneingester/TrecIngester.java: -------------------------------------------------------------------------------- 1 | package luceneingester; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Paths; 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | import java.util.Properties; 25 | 26 | import org.apache.lucene.analysis.Analyzer; 27 | import org.apache.lucene.analysis.en.EnglishAnalyzer; 28 | import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource; 29 | import org.apache.lucene.benchmark.byTask.utils.Config; 30 | import org.apache.lucene.document.TextField; 31 | import org.apache.lucene.index.IndexWriter; 32 | import org.apache.lucene.index.IndexWriterConfig; 33 | import org.apache.lucene.index.MergePolicy; 34 | import org.apache.lucene.index.NoMergePolicy; 35 | import org.apache.lucene.search.similarities.BM25Similarity; 36 | import org.apache.lucene.store.*; 37 | import org.apache.lucene.util.*; 38 | 39 | public final class TrecIngester { 40 | private static TrecContentSource createTrecSource(String dataDir) { 41 | TrecContentSource tcs = new TrecContentSource(); 42 | Properties props = new Properties(); 43 | props.setProperty("print.props", "false"); 44 | props.setProperty("content.source.verbose", "false"); 45 | props.setProperty("content.source.excludeIteration", "true"); 46 | props.setProperty("docs.dir", dataDir); 47 | props.setProperty("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser"); 48 | props.setProperty("content.source.forever", "false"); 49 | tcs.setConfig(new Config(props)); 50 | try { 51 | tcs.resetInputs(); 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | } 55 | 
return tcs; 56 | } 57 | 58 | public static void main(String[] clArgs) throws Exception { 59 | Args args = new Args(clArgs); 60 | final String dirPath = args.getString("-indexPath") + "/index"; 61 | final String dataDir = args.getString("-dataDir"); 62 | final int docCountLimit = args.getInt("-docCountLimit"); // -1 means all docs from the source: 63 | final int numThreads = args.getInt("-threadCount"); 64 | final boolean verbose = args.getFlag("-verbose"); 65 | final boolean printDPS = args.getFlag("-printDPS"); 66 | final boolean doUpdate = args.getFlag("-update"); 67 | final boolean positions = args.getFlag("-positions"); 68 | final boolean forceMerge = args.getFlag("-forceMerge"); 69 | 70 | args.check(); 71 | 72 | final Analyzer a = new EnglishAnalyzer(); 73 | final TrecContentSource trecSource = createTrecSource(dataDir); 74 | final Directory dir = FSDirectory.open(Paths.get(dirPath)); 75 | 76 | System.out.println("Index path: " + dirPath); 77 | System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : ""+docCountLimit)); 78 | System.out.println("Threads: " + numThreads); 79 | System.out.println("Verbose: " + (verbose ? "yes" : "no")); 80 | System.out.println("Positions: " + (positions ? "yes" : "no")); 81 | System.out.println("Force merge: " + (forceMerge ? "yes" : "no")); 82 | 83 | if (verbose) { 84 | InfoStream.setDefault(new PrintStreamInfoStream(System.out)); 85 | } 86 | 87 | final IndexWriterConfig iwc = new IndexWriterConfig(a); 88 | 89 | if (doUpdate) { 90 | iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND); 91 | } else { 92 | iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); 93 | } 94 | if (forceMerge) { 95 | // TODO: Explore a merge policy that results in just one segment. NoMergePolicy seems 96 | // to result in large number of files, but possibly one 1 segment. 
97 | //iwc.setMergePolicy(NoMergePolicy.INSTANCE); 98 | } 99 | System.out.println("IW config=" + iwc); 100 | 101 | final IndexWriter w = new IndexWriter(dir, iwc); 102 | IndexThreads threads = new IndexThreads(w, positions, trecSource, numThreads, docCountLimit, printDPS); 103 | System.out.println("\nIndexer: start"); 104 | 105 | final long t0 = System.currentTimeMillis(); 106 | 107 | threads.start(); 108 | 109 | while (!threads.done()) { 110 | Thread.sleep(100); 111 | } 112 | threads.stop(); 113 | 114 | final long t1 = System.currentTimeMillis(); 115 | System.out.println("\nIndexer: indexing done (" + (t1-t0)/1000.0 + " sec); total " + w.maxDoc() + " docs"); 116 | if (!doUpdate && docCountLimit != -1 && w.maxDoc() != docCountLimit) { 117 | throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit); 118 | } 119 | if (threads.failed.get()) { 120 | throw new RuntimeException("exceptions during indexing"); 121 | } 122 | 123 | 124 | final long t2; 125 | t2 = System.currentTimeMillis(); 126 | 127 | final Map<String,String> commitData = new HashMap<>(); 128 | commitData.put("userData", "multi"); 129 | w.setCommitData(commitData); 130 | w.commit(); 131 | final long t3 = System.currentTimeMillis(); 132 | System.out.println("\nIndexer: commit multi (took " + (t3-t2)/1000.0 + " sec)"); 133 | 134 | 135 | if (forceMerge) { 136 | System.out.println("\nStarting the merge..."); 137 | long mergeStart = System.currentTimeMillis(); 138 | w.forceMerge(1); 139 | w.commit(); 140 | System.out.println("\nIndexer: merging took " + (System.currentTimeMillis() - mergeStart)/1000.0 + " sec"); 141 | } 142 | System.out.println("\nIndexer: at close: " + w.segString()); 143 | final long tCloseStart = System.currentTimeMillis(); 144 | w.close(); 145 | System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart)/1000.0 + " sec"); 146 | dir.close(); 147 | final long tFinal = System.currentTimeMillis(); 148 | System.out.println("\nIndexer: finished (" + 
(tFinal-t0)/1000.0 + " sec)"); 149 | System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed()); 150 | System.out.println("\nIndexer: " + (threads.getBytesIndexed()/1024./1024./1024./((tFinal-t0)/3600000.)) + " GB/hour plain text"); 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-analyzers-common-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-analyzers-common-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-backward-codecs-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-backward-codecs-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-benchmark-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-benchmark-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-core-5.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-core-5.2.1.jar -------------------------------------------------------------------------------- /systems/lucene/lib/lucene-queryparser-5.2.1.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lintool/IR-Reproducibility/8223ad29e72b20d3610376e22ad84a0e986022f8/systems/lucene/lib/lucene-queryparser-5.2.1.jar -------------------------------------------------------------------------------- /systems/terrier/dotgov2-prox.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | 6 | INDEX="blocks" 7 | 8 | if [[ ! -f terrier-4.0.tar.gz ]]; then 9 | curl http://www.dcs.gla.ac.uk/~craigm/terrier-4.0.tar.gz> terrier-4.0.tar.gz 10 | fi 11 | tar -zxf terrier-4.0.tar.gz 12 | cd terrier-4.0 13 | 14 | bin/trec_setup.sh $GOV2_LOCATION 2>&1 | tee trec_setup.log 15 | #mv etc/collection.spec collection.spec && head collection.spec > etc/collection.spec 16 | 17 | OPTS="-i -j" 18 | if [ "$INDEX" == "classical" ]; 19 | then 20 | OPTS="-i" 21 | fi 22 | 23 | cat <<EOF >> etc/terrier.properties 24 | trec.collection.class=TRECWebCollection 25 | #indexer.meta.forward.keys=docno,url 26 | #indexer.meta.forward.keylens=26,256 27 | indexer.meta.forward.keys=docno 28 | indexer.meta.forward.keylens=26 29 | indexer.meta.reverse.keys= 30 | ignore.low.idf.terms=false 31 | 32 | #faster indexing with more memory 33 | memory.reserved=104857600 34 | EOF 35 | 36 | if [ "$INDEX" == "blocks" ]; 37 | then 38 | OPTS="$OPTS -Dblock.indexing=true" 39 | elif [[ "$INDEX" == "blocks_fields" ]]; then 40 | OPTS="$OPTS -Dblock.indexing=true -DFieldTags.process=TITLE,ELSE" 41 | fi 42 | 43 | JAVA_OPTIONS=-XX:-UseGCOverheadLimit TERRIER_HEAP_MEM=100g bin/trec_terrier.sh $OPTS 2>&1 | tee indexing.${INDEX}.log 44 | 45 | if [[ "$INDEX" == "blocks_fields" ]]; then 46 | perl -pi -e 's/FSADocumentIndex$/FSAFieldDocumentIndex/g' var/index/data.properties 47 | fi 48 | 49 | for RANKER in DPH_Prox; 50 | do 51 | ../dotgov2-ranker.sh $INDEX $RANKER 52 | done 53 | 54 | mv var ${INDEX}-var 55 | -------------------------------------------------------------------------------- /systems/terrier/dotgov2-qe.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | 6 | INDEX="classical" 7 | 8 | if [[ ! -f terrier-4.0.tar.gz ]]; then 9 | curl http://www.dcs.gla.ac.uk/~craigm/terrier-4.0.tar.gz> terrier-4.0.tar.gz 10 | fi 11 | tar -zxf terrier-4.0.tar.gz 12 | cd terrier-4.0 13 | 14 | bin/trec_setup.sh $GOV2_LOCATION 2>&1 | tee trec_setup.log 15 | #mv etc/collection.spec collection.spec && head collection.spec > etc/collection.spec 16 | 17 | OPTS="-i -j" 18 | if [ "$INDEX" == "classical" ]; 19 | then 20 | OPTS="-i" 21 | fi 22 | 23 | cat <<EOF >> etc/terrier.properties 24 | trec.collection.class=TRECWebCollection 25 | #indexer.meta.forward.keys=docno,url 26 | #indexer.meta.forward.keylens=26,256 27 | indexer.meta.forward.keys=docno 28 | indexer.meta.forward.keylens=26 29 | indexer.meta.reverse.keys= 30 | ignore.low.idf.terms=false 31 | 32 | #faster indexing with more memory 33 | memory.reserved=104857600 34 | EOF 35 | 36 | if [ "$INDEX" == "blocks" ]; 37 | then 38 | OPTS="$OPTS -Dblock.indexing=true" 39 | elif [[ "$INDEX" == "blocks_fields" ]]; then 40 | OPTS="$OPTS -Dblock.indexing=true -DFieldTags.process=TITLE,ELSE" 41 | fi 42 | 43 | JAVA_OPTIONS=-XX:-UseGCOverheadLimit TERRIER_HEAP_MEM=100g bin/trec_terrier.sh $OPTS 2>&1 | tee indexing.${INDEX}.log 44 | 45 | if [[ "$INDEX" == "blocks_fields" ]]; then 46 | perl -pi -e 's/FSADocumentIndex$/FSAFieldDocumentIndex/g' var/index/data.properties 47 | fi 48 | 49 | for RANKER in DPH_QE; 50 | do 51 | ../dotgov2-ranker.sh $INDEX $RANKER 52 | done 53 | 54 | mv var ${INDEX}-var 55 | -------------------------------------------------------------------------------- /systems/terrier/dotgov2-ranker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | pushd .. 
5 | source ../common.sh 6 | popd 7 | 8 | INDEX=$1 9 | RANKER=$2 10 | 11 | OPTIONS="" 12 | 13 | if [ "$RANKER" == "DPH" ]; 14 | then 15 | OPTIONS="$OPTIONS -Dtrec.model=DPH" 16 | elif [ "$RANKER" == "DPH_QE" ]; 17 | then 18 | OPTIONS="$OPTIONS -Dtrec.model=DPH" 19 | OPTIONS="$OPTIONS -Dquerying.default.controls=qe:on" 20 | if [ ! -e "var/index/data.direct.bf" ]; 21 | then 22 | TERRIER_HEAP_MEM=100g bin/trec_terrier.sh -id 23 | du -csh var/index/ 24 | fi 25 | elif [[ "$RANKER" == "BM25" ]]; then 26 | OPTIONS="$OPTIONS -Dtrec.model=BM25" 27 | elif [[ "$RANKER" == "DPH_Prox" ]]; then 28 | OPTIONS="$OPTIONS -Dtrec.model=DPH" 29 | OPTIONS="$OPTIONS -Dmatching.dsms=DFRDependenceScoreModifier" 30 | OPTIONS="$OPTIONS -Dproximity.dependency.type=SD" 31 | OPTIONS="$OPTIONS -Dproximity.ngram.length=5" 32 | #elif [[ "$RANKER" == "LTR" ]]; then 33 | # pwd 34 | # exec ../dotgov2-ltr-ranker.sh $INDEX $RANKER 35 | fi 36 | 37 | for queries in "701-750" "751-800" "801-850" 38 | do 39 | query_file=../$TOPICS_QRELS/topics.${queries}.txt 40 | qrel_file=../$TOPICS_QRELS/qrels.${queries}.txt 41 | stat_file=${INDEX}.${RANKER}.${queries}.search_stats.txt 42 | run_file=$PWD/${INDEX}.${RANKER}.terrier.${queries}.txt 43 | 44 | TERRIER_HEAP_MEM=26g bin/trec_terrier.sh -r -Dtrec.topics=$query_file -Dtrec.results.file=$run_file $OPTIONS > $stat_file 2>&1 45 | ../$TREC_EVAL ${qrel_file} ${run_file}| tee -a $stat_file 46 | 47 | #grep 'Total Time to Search' ${stat_file} | sed \$d 48 | done 49 | -------------------------------------------------------------------------------- /systems/terrier/dotgov2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ef 3 | 4 | source ../common.sh 5 | 6 | INDEX="singlepass" 7 | 8 | if [[ ! 
-f terrier-4.0.tar.gz ]]; then 9 | curl http://www.dcs.gla.ac.uk/~craigm/terrier-4.0.tar.gz> terrier-4.0.tar.gz 10 | fi 11 | tar -zxf terrier-4.0.tar.gz 12 | cd terrier-4.0 13 | 14 | bin/trec_setup.sh $GOV2_LOCATION 2>&1 | tee trec_setup.log 15 | #mv etc/collection.spec collection.spec && head collection.spec > etc/collection.spec 16 | 17 | OPTS="-i -j" 18 | if [ "$INDEX" == "classical" ]; 19 | then 20 | OPTS="-i" 21 | fi 22 | 23 | cat <<EOF >> etc/terrier.properties 24 | trec.collection.class=TRECWebCollection 25 | #indexer.meta.forward.keys=docno,url 26 | #indexer.meta.forward.keylens=26,256 27 | indexer.meta.forward.keys=docno 28 | indexer.meta.forward.keylens=26 29 | indexer.meta.reverse.keys= 30 | ignore.low.idf.terms=false 31 | 32 | #faster indexing with more memory 33 | memory.reserved=104857600 34 | EOF 35 | 36 | if [ "$INDEX" == "blocks" ]; 37 | then 38 | OPTS="$OPTS -Dblock.indexing=true" 39 | elif [[ "$INDEX" == "blocks_fields" ]]; then 40 | OPTS="$OPTS -Dblock.indexing=true -DFieldTags.process=TITLE,ELSE" 41 | fi 42 | 43 | JAVA_OPTIONS=-XX:-UseGCOverheadLimit TERRIER_HEAP_MEM=100g bin/trec_terrier.sh $OPTS 2>&1 | tee indexing.${INDEX}.log 44 | 45 | if [[ "$INDEX" == "blocks_fields" ]]; then 46 | perl -pi -e 's/FSADocumentIndex$/FSAFieldDocumentIndex/g' var/index/data.properties 47 | fi 48 | 49 | for RANKER in DPH BM25; 50 | do 51 | ../dotgov2-ranker.sh $INDEX $RANKER 52 | done 53 | 54 | mv var ${INDEX}-var 55 | -------------------------------------------------------------------------------- /topics-and-qrels/README.md: -------------------------------------------------------------------------------- 1 | Gov2 2 | ==== 3 | 4 | + topics.701-750.txt: [Topics 701-750 (TREC 2004 Terabyte Track)](http://trec.nist.gov/data/terabyte/04/04topics.701-750.txt) 5 | + topics.751-800.txt: [Topics 751-800 (TREC 2005 Terabyte Track)](http://trec.nist.gov/data/terabyte/05/05.topics.751-800.txt) 6 | + topics.801-850.txt: [Topics 801-850 (TREC 2006 Terabyte 
Track)](http://trec.nist.gov/data/terabyte/06/06.topics.801-850.txt) 7 | + qrels.701-750.txt: [qrels for Topics 701-750 (TREC 2004 Terabyte Track)](http://trec.nist.gov/data/terabyte/04/04.qrels.12-Nov-04) 8 | + qrels.751-800.txt: [qrels for Topics 751-800 (TREC 2005 Terabyte Track)](http://trec.nist.gov/data/terabyte/05/05.adhoc_qrels) 9 | + qrels.801-850.txt: [qrels for Topics 801-850 (TREC 2006 Terabyte Track)](http://trec.nist.gov/data/terabyte/06/qrels.tb06.top50) 10 | 11 | ClueWeb09 12 | ========= 13 | 14 | + topics.web.1-50.txt: [Topics 1-50 (TREC 2009 Web Track)](http://trec.nist.gov/data/web/09/wt09.topics.full.xml) 15 | + topics.web.51-100.txt: [Topics 51-100 (TREC 2010 Web Track)](http://trec.nist.gov/data/web/10/wt2010-topics.xml) 16 | + topics.web.101-150.txt: [Topics 101-150 (TREC 2011 Web Track)](http://trec.nist.gov/data/web/11/full-topics.xml) 17 | + topics.web.151-200.txt: [Topics 151-200 (TREC 2012 Web Track)](http://trec.nist.gov/data/web/12/full-topics.xml) 18 | + qrels.web.1-50.txt: [adhoc prels for category B runs for Topics 1-50 (TREC 2009 Web Track)](http://trec.nist.gov/data/web/09/prels.catB.1-50.gz) 19 | + qrels.web.51-100.txt: [adhoc qrels for Topics 51-100 (TREC 2010 Web Track)](http://trec.nist.gov/data/web/10/10.adhoc-qrels.final) 20 | + qrels.web.101-150.txt: [adhoc qrels for Topics 101-150 (TREC 2011 Web Track)](http://trec.nist.gov/data/web/11/qrels.adhoc) 21 | + qrels.web.151-200.txt: [adhoc qrels for Topics 151-200 (TREC 2012 Web Track)](http://trec.nist.gov/data/web/12/qrels.adhoc) 22 | 23 | ClueWeb12 24 | ========= 25 | + topics.web.201-250.txt: [Topics 201-250 (TREC 2013 Web Track)](http://trec.nist.gov/data/web/2013/trec2013-topics.xml) 26 | + topics.web.251-300.txt: [Topics 251-300 (TREC 2014 Web Track)](http://trec.nist.gov/data/web/2014/trec2014-topics.xml) 27 | + qrels.web.201-250.txt: [one aspect per topic qrels for Topics 201-250 (TREC 2013 Web Track)](http://trec.nist.gov/data/web/2013/qrels.adhoc.txt) 28 | + 
qrels.web.251-300.txt: [one aspect per topic qrels for Topics 251-300 (TREC 2014 Web Track)](http://trec.nist.gov/data/web/2014/qrels.adhoc.txt) 29 | --------------------------------------------------------------------------------