├── extract_doc.py
├── extract.sh
└── README.md

/extract_doc.py:
--------------------------------------------------------------------------------
import json
import gzip
import sys
from tqdm import tqdm

# Path to a *-cirrussearch-content.json.gz dump.
f = sys.argv[1]

doc_id = None
for i, line in tqdm(enumerate(gzip.open(f, "rt"))):
    record = json.loads(line)

    if i % 2 == 0:
        # Even lines carry indexing metadata; remember the document id.
        doc_id = record["index"]["_id"] if "index" in record else None
    else:
        # Odd lines carry the document body.
        doc_text = record["text"]
        if doc_id is not None:
            # Strip tabs so the output stays valid TSV.
            print("%s\t%s" % (doc_id, doc_text.replace("\t", "")))

--------------------------------------------------------------------------------
/extract.sh:
--------------------------------------------------------------------------------
lcode=$1
dump_date=20210719
baseurl=https://dumps.wikimedia.org/other/cirrussearch
dumpurl=$baseurl/$dump_date/${lcode}wiki-${dump_date}-cirrussearch-content.json.gz
dumpfile=$(basename "$dumpurl")

if [ $# -eq 0 ]
then
    echo "No language code supplied"
else
    echo "Downloading dump file..."
    wget "$dumpurl"
    echo "Extracting text from dump file..."
    python extract_doc.py "$dumpfile" > "${lcode}.tsv"
    echo "Extracted wiki documents to: ${lcode}.tsv"
    rm "$dumpfile"
fi
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CLIRMatrix
http://www.cs.jhu.edu/~shuosun/clirmatrix/

CLIRMatrix is also available on the following Google Drive:

https://drive.google.com/drive/folders/1V-DcBwvAnlVAYJw_gsx0zXV5VXJcRGGc?usp=sharing

Script to extract untruncated documents from Wikipedia dumps:

```
Usage:
    ./extract.sh [wikipedia language code]
E.g.
    ./extract.sh en
```

## Reference

[1] Shuo Sun and Kevin Duh,
[*CLIRMatrix: A massively large collection of bilingual and multilingual datasets for Cross-Lingual Information Retrieval*](https://www.aclweb.org/anthology/2020.emnlp-main.340/),
Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP).

--------------------------------------------------------------------------------
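For reference, a minimal sketch (not part of the repository) of the alternating two-line record layout in a CirrusSearch content dump that `extract_doc.py` relies on: an indexing-metadata line followed by a document-source line. The sample id and text values below are invented for illustration.

```python
import json

# Hypothetical two-line record from a *-cirrussearch-content.json.gz dump:
# a bulk-index metadata line, then the document source line.
lines = [
    '{"index": {"_type": "page", "_id": "12"}}',
    '{"text": "Anarchism is a political philosophy..."}',
]

doc_id = json.loads(lines[0])["index"]["_id"]    # even line -> "12"
doc_text = json.loads(lines[1])["text"]          # odd line -> article text
print("%s\t%s" % (doc_id, doc_text.replace("\t", "")))  # one TSV row
```

This pairing is why the script toggles on `i % 2`: each document id arrives one line before the text it belongs to.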