├── __init__.py ├── baseline ├── __init__.py ├── util │ └── __init__.py ├── filter_emty_text_from_lett.py ├── add_warc_locations.sh ├── ngrams.py ├── dedupe.sh ├── text2langstats.py ├── lett_viewer.py ├── bitextor_util │ ├── bitextorutil.py │ ├── show_bitextor_docs.py │ ├── wordcounts.py │ └── lett2ridx_combine.py ├── dumptar.py ├── filter_tmx.sh ├── strip_headers.py ├── download_domain.py ├── corpus_by_domain.py ├── download_candidates.py ├── download_and_align.sh ├── lett2corpus_lowmem.sh ├── find_pairs.py ├── corpus2corpus.py ├── lett2ridx.py ├── filter_sent.py ├── candidates2bitextor.py ├── eval_sent.py ├── check_lett_lang.py ├── add_warc_locations.py ├── collect_domains.py ├── url_matching.py ├── locate_candidates.py ├── tar2ett.py ├── dictionary.md ├── html2text.py ├── candidates2corpus.py ├── strip_language_from_uri.py └── score_ngrams.py ├── metadata ├── __init__.py ├── lang_stats │ ├── __init__.py │ ├── old2new_stats.py │ ├── cld2helper.py │ ├── accumulate_langstats.py │ ├── percent_to_bytes.py │ ├── accumulate_stats.py │ └── join_stats.py ├── leveldb │ ├── Readme │ ├── Makefile │ ├── insertkv.cc │ └── updatekv.cc ├── meta_data_kv.sh ├── drop_links_from_json.py ├── extract_monolingual.sh ├── insert_kv.py ├── extract_links.sh ├── rocksdb │ ├── Makefile │ ├── rdb_options.h │ ├── insertkv.cc │ └── updatekv.cc ├── url_classifier │ ├── filter_languages.py │ └── filter_features.py ├── dump_keys.py ├── extract_pdflinks.sh ├── extract_location.sh ├── query_md.py ├── langstats2kv.py ├── count_uniq_urls.py ├── read_wet.py ├── add_lang_stats.py └── links_from_wat.py ├── merge └── metadata │ ├── __init__.py │ ├── lang_stats │ ├── __init__.py │ └── percent_to_bytes.py │ ├── drop_links_from_json.py │ ├── read_wet.py │ └── add_lang_stats.py ├── .gitignore ├── Results ├── .DS_Store ├── results_ev6.txt ├── results-ev12.txt ├── results-ev7.txt ├── results_ev4.xml └── results_ev9.txt ├── common_crawl_process.png ├── dicts ├── fix_encoding.py ├── dict_convert.py └── filter_giza.py ├── docaligner ├── numpy_text2npz.sh ├── numpy_text2npz.py ├── minmaxstd.py ├── htmlprocessor.py ├── page.py ├── counts2idf.py ├── nn.py ├── hash_lines.py ├── split_long_short.py ├── matching.py ├── table4paper.py ├── map_translations.py ├── eval_bitextor.py ├── tokenizer.py ├── ratio.py ├── extract.sh └── extract_dev_feats.sh ├── crawlertest ├── bitextor │ ├── extract_urls.py │ └── map_urls.py ├── httrack.sh ├── filename2url.py ├── httrack_pdf.sh └── bitextor_notes.txt ├── monolingual ├── README.md └── collect_lang.py ├── html_convert ├── example │ ├── example.html │ └── example.html~ ├── header.h ├── Makefile ├── anything_to_utf8.py ├── string_util.h └── html2text.cpp ├── requirements.txt ├── docalign_task └── eval_langid.py ├── README.md ├── INSTALL.md └── parseXML.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/merge/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metadata/lang_stats/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /merge/metadata/lang_stats/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .history.buck 2 | *.pyc 3 | *.xz 4 | *.gz 5 | dc.sublime-workspace -------------------------------------------------------------------------------- /Results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/.DS_Store -------------------------------------------------------------------------------- /common_crawl_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/common_crawl_process.png -------------------------------------------------------------------------------- /dicts/fix_encoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | 6 | for line in sys.stdin: 7 | line = line.decode("utf-8").encode("iso-8859-1") # .decode("utf-8") 8 | line = line.strip() 9 | 10 | print line.strip() 11 | -------------------------------------------------------------------------------- /metadata/leveldb/Readme: -------------------------------------------------------------------------------- 1 | Building instructions: 2 | 3 | 1. Install libsnappy-dev and libgoogle-perftools-dev 4 | 2. Get leveldb 5 | 6 | cd build 7 | git clone git@github.com:google/leveldb.git 8 | cd leveldb 9 | make 10 | 11 | 3. Modify Makefile to point to leveldb includes 12 | 13 | -------------------------------------------------------------------------------- /metadata/meta_data_kv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$1 /fs/gna0/buck/cc/links 4 | #$2 2013_20 5 | #$3 1368701562534 6 | 7 | # find $1/$2/$3/ | grep internal.links.gz | xargs zcat |\ 8 | zcat $1/$2/$3/*internal.links.gz | \ 9 | /home/buck/net/build/DataCollection/metadata/metadatabase.py $2 $3 | \ 10 | gzip -9 > $1/$2/$3/db_kv.gz -------------------------------------------------------------------------------- /docaligner/numpy_text2npz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit as soon as any command fails 4 | set -e 5 | set -o pipefail 6 | 7 | OUTFILE=`basename $1`.npz 8 | DONEFILE=npy/${OUTFILE}.done 9 | 10 | if [ ! 
-f ${DONEFILE} ]; then 11 | nice python /home/buck/net/build/DataCollection/docaligner/numpy_text2npz.py $1 -out npy/${OUTFILE} 12 | touch ${DONEFILE} 13 | fi 14 | -------------------------------------------------------------------------------- /baseline/filter_emty_text_from_lett.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Removes lines from .lett file where the last column containing base64 5 | encoded text is empty. This otherwise leads to problems downstream. """ 6 | 7 | import sys 8 | 9 | for line in sys.stdin: 10 | if line.split("\t")[-1].strip(): 11 | sys.stdout.write(line) 12 | -------------------------------------------------------------------------------- /metadata/drop_links_from_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Reads downloaded website from tar file and writes lett format to be 6 | processed by bitextor pipeline 7 | """ 8 | 9 | import sys 10 | import json 11 | 12 | for line in sys.stdin: 13 | domain, data = line.split(" ", 1) 14 | data = json.loads(data) 15 | data.pop("links") 16 | print domain, json.dumps(data) 17 | -------------------------------------------------------------------------------- /merge/metadata/drop_links_from_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Reads downloaded website from tar file and writes lett format to be 6 | processed by bitextor pipeline 7 | """ 8 | 9 | import sys 10 | import json 11 | 12 | for line in sys.stdin: 13 | domain, data = line.split(" ", 1) 14 | data = json.loads(data) 15 | data.pop("links") 16 | print domain, json.dumps(data) 17 | -------------------------------------------------------------------------------- /Results/results_ev6.txt: -------------------------------------------------------------------------------- 1 | Files Downloaded: 2 | ----------------------------------------------------- 3 | 1.xml AB Coring http://ab-carottage.fr/en/ 4 | 6.xml Contact "AB Coring http://ab-carottage.fr/en/contact/ 5 | 24.xml Contact « AB Carottage http://ab-carottage.fr/contact/ 6 | 2.xml AB Carottage http://ab-carottage.fr/ 7 | 8 | Files Mapped: 9 | ----------------------------------------------------- 10 | 6.xml 24.xml 11 | 2.xml 1.xml 12 | -------------------------------------------------------------------------------- /crawlertest/bitextor/extract_urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import re 5 | 6 | if len(sys.argv) != 2: 7 | print "Usage: python "+sys.argv[0]+" language_id" 8 | exit() 9 | 10 | langid = sys.argv[1] 11 | 12 | r = re.compile("(\S*"+langid+"\S*?)\t(.*?)\t") 13 | for line in sys.stdin: 14 | m = r.match(line) 15 | if m: 16 | (urlsource, urltarget) = m.group(1,2) 17 | print urlsource+"\t"+urltarget 18 | 19 | -------------------------------------------------------------------------------- /metadata/extract_monolingual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | FILENAME=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}') 7 | 8 | if [ ! 
-f ${FILENAME}.done ]; then 9 | curl -s $1 | gzip -cd | \ 10 | /fs/nas/heithrun0/commoncrawl/langsplit/bin/read_wet.py | \ 11 | /fs/nas/heithrun0/commoncrawl/langsplit/bin/langsplit --printchunks 2> /dev/null | \ 12 | xz -9 -e > ${FILENAME}.langsplit.xz 13 | touch ${FILENAME}.done 14 | fi 15 | -------------------------------------------------------------------------------- /monolingual/README.md: -------------------------------------------------------------------------------- 1 | For monolingual [Common Crawl](http://commoncrawl.org) data and code to process it please refer to these resources: 2 | * [University of Edinburgh N-gram site](http://statmt.org/ngrams) 3 | * Code to process corpora: https://github.com/kpu/preprocess 4 | * Code to produce raw monolingual files from CommonCrawl: https://github.com/treigerm/CommonCrawlProcessing 5 | * Alternative monolingual data extraction under development in ParaCrawl project: https://github.com/paracrawl/extractor 6 | -------------------------------------------------------------------------------- /metadata/insert_kv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import leveldb 5 | 6 | if __name__ == "__main__": 7 | errors = 0 8 | import argparse 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('db', help='leveldb root directory') 12 | args = parser.parse_args(sys.argv[1:]) 13 | 14 | db = leveldb.LevelDB(args.db) 15 | 16 | for line in sys.stdin: 17 | k, v = line.rstrip().split("\t", 1) 18 | db.Put(k, v) 19 | 20 | sys.stderr.write("%s" % db.GetStats()) 21 | -------------------------------------------------------------------------------- /html_convert/example/example.html: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 http://example.com/site.html 2 | 3 | This is some English text and it should hopefully be classified at such. 4 | 9 | 10 | Have some links: 11 | Google 12 | LREC paper. 13 | 14 | Am Ende finden wir dann noch etwas deutschen Text, wobei nicht klar ist ob die Menge ausreicht um ihn vom englishen zu unterscheiden. 15 | 16 | -------------------------------------------------------------------------------- /html_convert/example/example.html~: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 http://example.com/site.html 2 | 3 | This is some English text and it should hopefully be classified at such. 4 | 9 | 10 | Have some links: 11 | Google 12 | LREC paper. 13 | 14 | Am Ende finden wir dann noch etwas deutschen Text, wobei nicht klar ist ob die Menge ausreicht um ihn vom englishen zu unterscheiden. 
15 | 16 | -------------------------------------------------------------------------------- /baseline/add_warc_locations.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | set -o pipefail 9 | 10 | # Directory in which this script is stored 11 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 12 | 13 | PREFIX=$(dirname ${1}/x | awk -F '/' '{print $NF}') 14 | 15 | zcat ${1}/*.links.gz | \ 16 | python ${DIR}/add_warc_locations.py --prefix=${PREFIX}/ /home/achim/stats/found_urls.txt \ 17 | > ${1}/found_locations.txt 18 | touch ${1}/found_locations.done 19 | -------------------------------------------------------------------------------- /baseline/ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | # ngrams = defaultdict(lambda: defaultdict:) 6 | 7 | if __name__ == "__main__": 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-n', type=int, default=4) 11 | args = parser.parse_args(sys.argv[1:]) 12 | 13 | for line in sys.stdin: 14 | filename, text = line.split("\t", 1) 15 | text = text.strip().split() 16 | for start in range(len(text) - args.n + 1): 17 | sys.stdout.write( 18 | "%s\t%s\n" % (" ".join(text[start:start + args.n]), filename)) 19 | -------------------------------------------------------------------------------- /metadata/leveldb/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS = -O2 -Wall -fmessage-length=0 2 | 3 | default: all 4 | 5 | insertkv: insertkv.o 6 | $(CXX) -o $@ $^ -pg -L/home/buck/net/build/leveldb -ltcmalloc -lleveldb -lpthread -lsnappy -Wl,-rpath=/home/buck/net/build/leveldb 7 | 8 | updatekv: updatekv.o 9 | $(CXX) -o $@ $^ -pg -L/home/buck/net/build/leveldb -ltcmalloc -lleveldb -lpthread -lsnappy -ljsoncpp -Wl,-rpath=/home/buck/net/build/leveldb 10 | 11 | 12 | %.o : %.cc *.h Makefile 13 | @echo "***" $< "***" 14 | $(CXX) $(CXXFLAGS) -I/home/buck/net/build/leveldb/include -c $< -o $@ 15 | 16 | .PHONY : all clean 17 | all: insertkv updatekv 18 | 19 | clean: 20 | rm -f insertkv updatekv 21 | -------------------------------------------------------------------------------- /baseline/dedupe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: dedupe.sh <input_dir> <output_dir> <src_extension> <tgt_extension> 4 | 5 | # Location of dedupe binary built from https://github.com/kpu/preprocess 6 | dedupebin=~/preprocess/bin/dedupe 7 | 8 | if [ ! -d $1 ] 9 | then 10 | echo "$1 is not a folder." 11 | exit 12 | fi 13 | 14 | if [ -e $2 ] 15 | then 16 | if [ ! -d $2 ] 17 | then 18 | echo "$2 is not a directory."
19 | exit 20 | fi 21 | else 22 | mkdir -p $2 23 | fi 24 | 25 | for insrc in $1/*.$3 26 | do 27 | intgt=${insrc%.$3}.$4 28 | outsrc=$2/${insrc##*/} 29 | outtgt=$2/${intgt##*/} 30 | $dedupebin $insrc $intgt $outsrc $outtgt 31 | done 32 | 33 | -------------------------------------------------------------------------------- /dicts/dict_convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Converts a bilingual dictionary from Eduard Barbu's format 5 | # to a word-based dictionary usable as a dictionary for Bitextor's 6 | # sentence aligner 7 | # 8 | # Usage: python dict_convert.py < input_dict > output_dict 9 | 10 | import sys 11 | 12 | # TBD: output language identifiers here; read from command line? 13 | for line in sys.stdin: 14 | line = line.rstrip('\r\n') 15 | entry = line.split('@#@') 16 | source = entry[0].split() 17 | if len(source) != 1: 18 | continue 19 | target = entry[1].split() 20 | if len(target) != 1: 21 | continue 22 | print source[0]+'\t'+target[0] 23 | 24 | -------------------------------------------------------------------------------- /crawlertest/httrack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# != 2 ]; then 4 | echo "Usage: $0 start-url output-directory" 5 | echo "* start-url: initial seed url" 6 | echo "* output-directory: write logs and downloads to this dir" 7 | exit 8 | fi 9 | 10 | httrack \ 11 | --connection-per-second=20 \ 12 | --sockets=10 \ 13 | --keep-alive \ 14 | --disable-security-limits \ 15 | --max-rate=500000 \ 16 | --display \ 17 | --verbose \ 18 | --advanced-progressinfo \ 19 | --continue \ 20 | --robots=0 \ 21 | --urlhack \ 22 | --index=0 \ 23 | -m \ 24 | -F 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36' \ 25 | -#L500000000 \ 26 | --skeleton \ 27 | --path=$2 \ 28 | $1 29 | -------------------------------------------------------------------------------- /docaligner/numpy_text2npz.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | import sys 5 | 6 | if __name__ == "__main__": 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('infile', help='input text format matrix') 10 | parser.add_argument('-outfile', help='output npz file') 11 | args = parser.parse_args(sys.argv[1:]) 12 | 13 | if args.infile.endswith('npz'): 14 | m = np.load(args.infile)['m'] 15 | else: 16 | m = np.loadtxt(args.infile) 17 | print "Loaded ", args.infile, " of shape ", m.shape 18 | if args.outfile: 19 | np.save(args.outfile, m) 20 | print "Wrote ", args.outfile, " of shape ", m.shape 21 | -------------------------------------------------------------------------------- /baseline/text2langstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | from collections import defaultdict 6 | 7 | magic_number = "df6fa1abb58549287111ba8d776733e9" 8 | 9 | 10 | langstats = defaultdict(int) 11 | 12 | lang = None 13 | for line in sys.stdin: 14 | if line.startswith(magic_number): 15 | # df6fa1abb58549287111ba8d776733e9 16 | # http://www.achpr.org/about/documentation-centre/ language:en 17 | # offset:200 bytes: 3424 18 | lang = line.split()[2].split(":")[-1] 19 | continue 20 | langstats[lang] +=
len(line.decode("utf-8").strip()) 21 | 22 | for lang, num_bytes in langstats.items(): 23 | sys.stdout.write("%s\t%d\n" % (lang, num_bytes)) 24 | -------------------------------------------------------------------------------- /metadata/extract_links.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | 9 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'` 10 | OUTFILE=${FILENAME/warc.wat.gz/links.xz} 11 | TMPDIR=./tmp/`hostname` 12 | mkdir -p ${TMPDIR} 13 | 14 | # Directory in which this script is stored 15 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 16 | 17 | curl -s --retry 5 $1 | gzip -cd | ${DIR}/links_from_wat.py | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} --parallel=2 | uniq | /home/buck/net/build/pxz/pxz -T 2 -9 -e > ${OUTFILE} && touch ${OUTFILE/links.xz/done} 18 | -------------------------------------------------------------------------------- /metadata/rocksdb/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS = -O3 -Wall -Wextra -Wsign-compare -Wshadow -Wno-unused-parameter -std=c++11 -fmessage-length=0 -Wfatal-errors -I/home/buck/net/build/rocksdb/include/ 2 | 3 | default: all 4 | 5 | insertkv: insertkv.o 6 | $(CXX) -o $@ $^ -static -L/home/buck/net/build/rocksdb -lrocksdb -lrt -lz -lbz2 -lpthread -lsnappy -Wl,-rpath=/home/buck/net/build/rocksdb 7 | 8 | updatekv: updatekv.o 9 | $(CXX) -o $@ $^ -static -L/home/buck/net/build/rocksdb -lrocksdb -lrt -lz -lbz2 -lpthread -lsnappy -ljsoncpp -Wl,-rpath=/home/buck/net/build/rocksdb 10 | 11 | 12 | %.o : %.cc *.h Makefile 13 | @echo "***" $< "***" 14 | $(CXX) $(CXXFLAGS) -c $< -o $@ 15 | 16 | .PHONY : all clean 17 | all: insertkv updatekv 18 | 19 | clean: 20 | rm -f insertkv updatekv 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.4.1 2 | cffi==0.9.2 3 | chardet==2.3.0 4 | CherryPy==3.7.0 5 | cld2-cffi==0.1.1 6 | cryptography==0.9 7 | Cython==0.23.1 8 | enum34==1.0.4 9 | html5lib==0.99999 10 | idna==2.0 11 | ipaddress==1.0.7 12 | ipython==3.1.0 13 | jsonrpc==1.2 14 | jsonrpclib==0.1.3 15 | langid==1.1.4.dev0 16 | leveldb==0.193 17 | line-profiler==1.0 18 | lxml==3.4.4 19 | munkres==1.0.7 20 | ndg-httpsclient==0.4.0 21 | nltk==3.0.2 22 | numpy==1.9.2 23 | pexpect==3.3 24 | pyasn1==0.1.7 25 | pycparser==2.14 26 | pyOpenSSL==0.15.1 27 | pyrocksdb==0.4 28 | python-Levenshtein==0.12.0 29 | regex==2015.9.15 30 | requests==2.7.0 31 | scipy==0.15.1 32 | simhash==1.6.2 33 | simplejson==3.8.0 34 | six==1.9.0 35 | tldextract==1.6 36 | Unidecode==0.4.18 37 | urltools==0.3.2 38 | xmltodict==0.9.2 39 | -------------------------------------------------------------------------------- /Results/results-ev12.txt: -------------------------------------------------------------------------------- 1 | Files Downloaded: 2 | ----------------------------------------------------- 3 | 65.xml Philosophy: Simplicity pays off http://tekstwerk.com/philosophy 4 | 60.xml Wir über uns: tekstwerk•com kennenlernen http://tekstwerk.com/de/wir-ueber-uns 5 | 52.xml Nieuws http://tekstwerk.com/de/news 6 | 63.xml Nieuws http://tekstwerk.com/news 7 | 19.xml Introduction: 
Learn more about tekstwerk•com http://tekstwerk.com/en/introduction 8 | 10.xml Portfolio http://www.tekstwerk.com/en/portfolio 9 | 33.xml Philosophie: Einfachheit lohnt http://www.tekstwerk.com/de/philosophie 10 | 55.xml Portfolio http://tekstwerk.com/de/portfolio 11 | 12 | Files Mapped: 13 | ----------------------------------------------------- 14 | 10.xml 55.xml 15 | 52.xml 63.xml 16 | 65.xml 33.xml 17 | 19.xml 60.xml 18 | -------------------------------------------------------------------------------- /docaligner/minmaxstd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | import sys 5 | import gzip 6 | 7 | if __name__ == "__main__": 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('infile', help='input text format matrix') 11 | parser.add_argument('-outfile', help='output npz file') 12 | args = parser.parse_args(sys.argv[1:]) 13 | 14 | fh = open(args.infile, 'r') 15 | if args.infile.endswith('.gz'): 16 | fh = gzip.open(args.infile) 17 | m = np.load(fh) 18 | 19 | print "Loaded ", args.infile, " of shape ", m.shape 20 | print "Std\t", np.std(m) 21 | print "Min\t", np.min(m) 22 | print "Max\t", np.max(m) 23 | print "Mean\t", np.average(m) 24 | print "Median\t", np.median(m) 25 | -------------------------------------------------------------------------------- /monolingual/collect_lang.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import argparse 5 | 6 | magic_number = 'df6fa1abb58549287111ba8d776733e9' 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-lang', help='language code') 10 | args = parser.parse_args() 11 | 12 | buf = [] 13 | keep = False 14 | for line in sys.stdin: 15 | if line.startswith(magic_number): 16 | if buf: 17 | assert keep is True 18 | sys.stdout.write("".join(buf)) 19 | 20 | keep = False 21 | buf = [] 22 | 23 | if "language:%s" % args.lang in line.strip().split(): 24 | keep = True 25 | 26 | if keep: 27 | buf.append(line) 28 | 29 | if buf: 30 | assert keep is True 31 | sys.stdout.write("".join(buf)) 32 | -------------------------------------------------------------------------------- /metadata/url_classifier/filter_languages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | 6 | if __name__ == "__main__": 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--valid', type=argparse.FileType(), 10 | help='file containing valid label, one per line') 11 | parser.add_argument('--default', 12 | help='replacement for invalid labels') 13 | args = parser.parse_args(sys.argv[1:]) 14 | 15 | valid = set([l.strip() for l in args.valid]) 16 | 17 | for line in sys.stdin: 18 | label, feats = line.split("\t", 1) 19 | if label not in valid: 20 | label = args.default 21 | if label: 22 | sys.stdout.write("%s\t%s" % (label, feats)) 23 | -------------------------------------------------------------------------------- /metadata/dump_keys.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import rocksdb 5 | 6 | if __name__ == "__main__": 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('db', 10 | help='path to rocksdb') 11 | 
parser.add_argument( 12 | '-outfile', help='output file', type=argparse.FileType('w'), 13 | default=sys.stdout) 14 | args = parser.parse_args() 15 | 16 | opts = rocksdb.Options() 17 | opts.create_if_missing = False 18 | opts.max_open_files = 100 19 | opts.num_levels = 6 20 | db = rocksdb.DB(args.db, opts, read_only=True) 21 | it = db.iterkeys() 22 | it.seek_to_first() 23 | for key in it: 24 | tld, url, crawl = key.split(" ", 2) 25 | args.outfile.write(url + "\n") 26 | -------------------------------------------------------------------------------- /metadata/extract_pdflinks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | set -o pipefail 9 | 10 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'` 11 | OUTFILE=${FILENAME/warc.wat.gz/pdflinks.xz} 12 | 13 | # don't let temporary sort files fill up local /tmp 14 | TMPDIR=./tmp/`hostname` 15 | mkdir -p ${TMPDIR} 16 | 17 | # Directory in which this script is stored 18 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | 20 | if [ ! -f ${OUTFILE/.xz/.done} ]; then 21 | curl -s --retry 5 $1 | \ 22 | gzip -cd | \ 23 | ${DIR}/links_from_wat.py -pdf | \ 24 | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} -u --parallel=2 | \ 25 | xz -9 -e \ 26 | > ${OUTFILE} 27 | touch ${OUTFILE/.xz/.done} 28 | fi 29 | 30 | 31 | -------------------------------------------------------------------------------- /html_convert/header.h: -------------------------------------------------------------------------------- 1 | #ifndef HEADER_H_ 2 | #define HEADER_H_ 3 | 4 | #include <string> 5 | 6 | #include "string_util.h" 7 | 8 | namespace { 9 | 10 | using std::string; 11 | 12 | class Header { 13 | public: 14 | explicit Header(const string& header) { 15 | for (const auto& value : StringUtil::Split(header, ' ')) { 16 | if (value.find("tld:") == 0) { 17 | tld_ = value.substr(4); 18 | } else if (value.find("uri:") == 0) { 19 | uri_ = value.substr(4); 20 | } else if (value.find("encoding:") == 0) { 21 | encoding_ = value.substr(9); 22 | } 23 | } 24 | } 25 | 26 | const string get_tld() const { return tld_; } 27 | const string get_uri() const { return uri_; } 28 | const string get_encoding() const { return encoding_; } 29 | 30 | private: 31 | string uri_; 32 | string tld_; 33 | string encoding_; 34 | }; 35 | 36 | } // namespace 37 | 38 | #endif /* HEADER_H_ */ 39 | -------------------------------------------------------------------------------- /docaligner/htmlprocessor.py: -------------------------------------------------------------------------------- 1 | from HTMLParser import HTMLParser 2 | 3 | 4 | class HTMLSequencer(HTMLParser): 5 | 6 | def __init__(self, length_function, growth_function): 7 | HTMLParser.__init__(self) 8 | self.sequence = [] 9 | self.length_function = length_function 10 | self.growth_function = growth_function 11 | 12 | def handle_starttag(self, tag, attrs): 13 | self.sequence.append("<%s>" % tag) 14 | 15 | def handle_endtag(self, tag): 16 | self.sequence.append("</%s>" % tag) 17 | 18 | def handle_data(self, data): 19 | if not data.strip(): 20 | return 21 | n = self.length_function(data) 22 | 23 | for n in range(int(self.growth_function(n))): 24 | self.sequence.append("%d" % n) 25 | 26 | def get_result(self): 27 | return self.sequence 28 | 29 | def reset(self): 30 |
HTMLParser.reset(self) 31 | self.sequence = [] 32 | -------------------------------------------------------------------------------- /metadata/extract_location.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | set -o pipefail 9 | 10 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'` 11 | OUTFILE=${FILENAME/warc.wat.gz/meta.xz} 12 | 13 | # don't let temporary sort files fill up local /tmp 14 | TMPDIR=./tmp/`hostname` 15 | mkdir -p ${TMPDIR} 16 | 17 | # Directory in which this script is stored 18 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | 20 | if [ ! -f ${OUTFILE/.xz/.done} ]; then 21 | curl -s --retry 5 $1 | \ 22 | gzip -cd | \ 23 | ${DIR}/links_from_wat.py -nolinks | \ 24 | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} --parallel=2 | \ 25 | uniq | \ 26 | /home/buck/net/build/pxz/pxz -T 2 -9 -e \ 27 | > ${OUTFILE} 28 | touch ${OUTFILE/.xz/.done} 29 | fi 30 | 31 | 32 | -------------------------------------------------------------------------------- /metadata/lang_stats/old2new_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import tldextract 6 | 7 | from cld2helper import read_cld2_languages 8 | 9 | 10 | def get_domain(netloc): 11 | extract = tldextract.extract(netloc) 12 | return ".".join((extract.domain, extract.suffix)).encode('idna') 13 | 14 | 15 | if __name__ == "__main__": 16 | import argparse 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('infile', nargs='?', type=argparse.FileType('r')) 20 | args = parser.parse_args() 21 | 22 | name2code, code2name = read_cld2_languages(args.infile) 23 | 24 | for line in sys.stdin: 25 | domain, language, num_bytes = line.split() 26 | assert language in code2name 27 | language = code2name[language] 28 | # domain = get_domain(domain) 29 | 30 | sys.stdout.write("%s %s %d\n" % (domain, language, int(num_bytes))) 31 | 32 | 33 | # en.wikipedia.org xx-Kali 274 34 | -------------------------------------------------------------------------------- /docaligner/page.py: -------------------------------------------------------------------------------- 1 | class Page(object): 2 | 3 | def __init__(self, url, html, text, mime_type, 4 | encoding, french, english, english_mt): 5 | self.url = url 6 | self.html = html 7 | self.text = text 8 | self.mime_type = mime_type 9 | self.encoding = encoding 10 | self.french = french 11 | self.english = english 12 | self.english_mt = english_mt 13 | 14 | def __str__(self): 15 | res = [] 16 | res.append("--Page--") 17 | res.append("url : %s" % self.url) 18 | res.append("html : %s" % self.html) 19 | res.append("text : %s" % self.text.encode('utf-8')) 20 | res.append("mime_type : %s" % self.mime_type) 21 | res.append("encoding : %s" % self.encoding) 22 | res.append("french : %s" % self.french.encode('utf-8')) 23 | res.append("english : %s" % self.english.encode('utf-8')) 24 | res.append("english_mt : %s" % self.english_mt.encode('utf-8')) 25 | return "\n".join(res) 26 | -------------------------------------------------------------------------------- /crawlertest/bitextor/map_urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 
| 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('mapping', help='mapping from filename to url', 9 | type=argparse.FileType('r')) 10 | args = parser.parse_args(sys.argv[1:]) 11 | 12 | mapping = {} 13 | for line in args.mapping: 14 | filename, url = line.strip().split() 15 | assert filename not in mapping, "Repeated value: %s\n" % line 16 | mapping[filename] = url 17 | 18 | for line in sys.stdin: 19 | filesource, filetarget = line.strip().split() 20 | if filesource in mapping: 21 | if filetarget in mapping: 22 | print mapping[filesource] + "\t" + mapping[filetarget] 23 | else: 24 | sys.stderr.write( 25 | "Target file mapping not found:" + filetarget + "\n") 26 | else: 27 | sys.stderr.write( 28 | "Source file mapping not found:" + filesource + "\n") 29 | -------------------------------------------------------------------------------- /crawlertest/filename2url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Extract original URL from httrack webdir 6 | """ 7 | 8 | import sys 9 | import re 10 | 11 | # Example line we're looking for: 12 | #