├── __init__.py ├── baseline ├── __init__.py ├── util │ └── __init__.py ├── filter_emty_text_from_lett.py ├── add_warc_locations.sh ├── ngrams.py ├── dedupe.sh ├── text2langstats.py ├── lett_viewer.py ├── bitextor_util │ ├── bitextorutil.py │ ├── show_bitextor_docs.py │ ├── wordcounts.py │ └── lett2ridx_combine.py ├── dumptar.py ├── filter_tmx.sh ├── strip_headers.py ├── download_domain.py ├── corpus_by_domain.py ├── download_candidates.py ├── download_and_align.sh ├── lett2corpus_lowmem.sh ├── find_pairs.py ├── corpus2corpus.py ├── lett2ridx.py ├── filter_sent.py ├── candidates2bitextor.py ├── eval_sent.py ├── check_lett_lang.py ├── add_warc_locations.py ├── collect_domains.py ├── url_matching.py ├── locate_candidates.py ├── tar2ett.py ├── dictionary.md ├── html2text.py ├── candidates2corpus.py ├── strip_language_from_uri.py └── score_ngrams.py ├── metadata ├── __init__.py ├── lang_stats │ ├── __init__.py │ ├── old2new_stats.py │ ├── cld2helper.py │ ├── accumulate_langstats.py │ ├── percent_to_bytes.py │ ├── accumulate_stats.py │ └── join_stats.py ├── leveldb │ ├── Readme │ ├── Makefile │ ├── insertkv.cc │ └── updatekv.cc ├── meta_data_kv.sh ├── drop_links_from_json.py ├── extract_monolingual.sh ├── insert_kv.py ├── extract_links.sh ├── rocksdb │ ├── Makefile │ ├── rdb_options.h │ ├── insertkv.cc │ └── updatekv.cc ├── url_classifier │ ├── filter_languages.py │ └── filter_features.py ├── dump_keys.py ├── extract_pdflinks.sh ├── extract_location.sh ├── query_md.py ├── langstats2kv.py ├── count_uniq_urls.py ├── read_wet.py ├── add_lang_stats.py └── links_from_wat.py ├── merge └── metadata │ ├── __init__.py │ ├── lang_stats │ ├── __init__.py │ └── percent_to_bytes.py │ ├── drop_links_from_json.py │ ├── read_wet.py │ └── add_lang_stats.py ├── .gitignore ├── Results ├── .DS_Store ├── results_ev6.txt ├── results-ev12.txt ├── results-ev7.txt ├── results_ev4.xml └── results_ev9.txt ├── common_crawl_process.png ├── dicts ├── fix_encoding.py ├── dict_convert.py └── filter_giza.py ├── docaligner ├── numpy_text2npz.sh ├── numpy_text2npz.py ├── minmaxstd.py ├── htmlprocessor.py ├── page.py ├── counts2idf.py ├── nn.py ├── hash_lines.py ├── split_long_short.py ├── matching.py ├── table4paper.py ├── map_translations.py ├── eval_bitextor.py ├── tokenizer.py ├── ratio.py ├── extract.sh └── extract_dev_feats.sh ├── crawlertest ├── bitextor │ ├── extract_urls.py │ └── map_urls.py ├── httrack.sh ├── filename2url.py ├── httrack_pdf.sh └── bitextor_notes.txt ├── monolingual ├── README.md └── collect_lang.py ├── html_convert ├── example │ ├── example.html │ └── example.html~ ├── header.h ├── Makefile ├── anything_to_utf8.py ├── string_util.h └── html2text.cpp ├── requirements.txt ├── docalign_task └── eval_langid.py ├── README.md ├── INSTALL.md └── parseXML.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/merge/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metadata/lang_stats/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /merge/metadata/lang_stats/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .history.buck 2 | *.pyc 3 | *.xz 4 | *.gz 5 | dc.sublime-workspace -------------------------------------------------------------------------------- /Results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/.DS_Store -------------------------------------------------------------------------------- /common_crawl_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/common_crawl_process.png -------------------------------------------------------------------------------- /dicts/fix_encoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | 6 | for line in sys.stdin: 7 | line = line.decode("utf-8").encode("iso-8859-1") # .decode("utf-8") 8 | line = line.strip() 9 | 10 | print line.strip() 11 | -------------------------------------------------------------------------------- /metadata/leveldb/Readme: -------------------------------------------------------------------------------- 1 | Building instructions: 2 | 3 | 1. Install libsnappy-dev and libgoogle-perftools-dev 4 | 2. Get leveldb 5 | 6 | cd build 7 | git clone git@github.com:google/leveldb.git 8 | cd leveldb 9 | make 10 | 11 | 3. Modify Makefile to point to leveldb includes 12 | 13 | -------------------------------------------------------------------------------- /metadata/meta_data_kv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$1 /fs/gna0/buck/cc/links 4 | #$2 2013_20 5 | #$3 1368701562534 6 | 7 | # find $1/$2/$3/ | grep internal.links.gz | xargs zcat |\ 8 | zcat $1/$2/$3/*internal.links.gz | \ 9 | /home/buck/net/build/DataCollection/metadata/metadatabase.py $2 $3 | \ 10 | gzip -9 > $1/$2/$3/db_kv.gz -------------------------------------------------------------------------------- /docaligner/numpy_text2npz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit as soon as any command fails 4 | set -e 5 | set -o pipefail 6 | 7 | OUTFILE=`basename $1`.npz 8 | DONEFILE=npy/${OUTFILE}.done 9 | 10 | if [ ! 
-f ${DONEFILE} ]; then 11 | nice python /home/buck/net/build/DataCollection/docaligner/numpy_text2npz.py $1 -out npy/${OUTFILE} 12 | touch ${DONEFILE} 13 | fi 14 | -------------------------------------------------------------------------------- /baseline/filter_emty_text_from_lett.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Removes lines from .lett file where the last column containing base64 5 | encoded text is empty. This otherwise leads to problems downstream. """ 6 | 7 | import sys 8 | 9 | for line in sys.stdin: 10 | if line.split("\t")[-1].strip(): 11 | sys.stdout.write(line) 12 | -------------------------------------------------------------------------------- /metadata/drop_links_from_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Reads downloaded website from tar file and writes lett format to be 6 | processed by bitextor pipeline 7 | """ 8 | 9 | import sys 10 | import json 11 | 12 | for line in sys.stdin: 13 | domain, data = line.split(" ", 1) 14 | data = json.loads(data) 15 | data.pop("links") 16 | print domain, json.dumps(data) 17 | -------------------------------------------------------------------------------- /merge/metadata/drop_links_from_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Reads downloaded website from tar file and writes lett format to be 6 | processed by bitextor pipeline 7 | """ 8 | 9 | import sys 10 | import json 11 | 12 | for line in sys.stdin: 13 | domain, data = line.split(" ", 1) 14 | data = json.loads(data) 15 | data.pop("links") 16 | print domain, json.dumps(data) 17 | -------------------------------------------------------------------------------- /Results/results_ev6.txt: -------------------------------------------------------------------------------- 1 | Files Downloaded: 2 | ----------------------------------------------------- 3 | 1.xml AB Coring http://ab-carottage.fr/en/ 4 | 6.xml Contact "AB Coring http://ab-carottage.fr/en/contact/ 5 | 24.xml Contact « AB Carottage http://ab-carottage.fr/contact/ 6 | 2.xml AB Carottage http://ab-carottage.fr/ 7 | 8 | Files Mapped: 9 | ----------------------------------------------------- 10 | 6.xml 24.xml 11 | 2.xml 1.xml 12 | -------------------------------------------------------------------------------- /crawlertest/bitextor/extract_urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import re 5 | 6 | if len(sys.argv) != 2: 7 | print "Usage: python "+sys.argv[0]+" language_id" 8 | exit() 9 | 10 | langid = sys.argv[1] 11 | 12 | r = re.compile("(\S*"+langid+"\S*?)\t(.*?)\t") 13 | for line in sys.stdin: 14 | m = r.match(line) 15 | if m: 16 | (urlsource, urltarget) = m.group(1,2) 17 | print urlsource+"\t"+urltarget 18 | 19 | -------------------------------------------------------------------------------- /metadata/extract_monolingual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | FILENAME=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}') 7 | 8 | if [ ! 
-f ${FILENAME}.done ]; then 9 | curl -s $1 | gzip -cd | \ 10 | /fs/nas/heithrun0/commoncrawl/langsplit/bin/read_wet.py | \ 11 | /fs/nas/heithrun0/commoncrawl/langsplit/bin/langsplit --printchunks 2> /dev/null | \ 12 | xz -9 -e > ${FILENAME}.langsplit.xz 13 | touch ${FILENAME}.done 14 | fi 15 | -------------------------------------------------------------------------------- /monolingual/README.md: -------------------------------------------------------------------------------- 1 | For monolingual [Common Crawl](http://commoncrawl.org) data and code to process it please refer to these resources: 2 | * [University of Edinburgh N-gram site](http://statmt.org/ngrams) 3 | * Code to process corpora: https://github.com/kpu/preprocess 4 | * Code to produce raw monolingual files from CommonCrawl: https://github.com/treigerm/CommonCrawlProcessing 5 | * Alternative monolingual data extraction under development in ParaCrawl project: https://github.com/paracrawl/extractor 6 | -------------------------------------------------------------------------------- /metadata/insert_kv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import leveldb 5 | 6 | if __name__ == "__main__": 7 | errors = 0 8 | import argparse 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('db', help='leveldb root directory') 12 | args = parser.parse_args(sys.argv[1:]) 13 | 14 | db = leveldb.LevelDB(args.db) 15 | 16 | for line in sys.stdin: 17 | k, v = line.rstrip().split("\t", 1) 18 | db.Put(k, v) 19 | 20 | sys.stderr.write("%s" % db.GetStats()) 21 | -------------------------------------------------------------------------------- /html_convert/example/example.html: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 http://example.com/site.html 2 | 3 | This is some English text and it should hopefully be classified at such. 4 | 9 | 10 | Have some links: 11 | Google 12 | LREC paper. 13 | 14 | Am Ende finden wir dann noch etwas deutschen Text, wobei nicht klar ist ob die Menge ausreicht um ihn vom englishen zu unterscheiden. 15 | 16 | -------------------------------------------------------------------------------- /html_convert/example/example.html~: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 http://example.com/site.html 2 | 3 | This is some English text and it should hopefully be classified at such. 4 | 9 | 10 | Have some links: 11 | Google 12 | LREC paper. 13 | 14 | Am Ende finden wir dann noch etwas deutschen Text, wobei nicht klar ist ob die Menge ausreicht um ihn vom englishen zu unterscheiden. 
15 | 16 | -------------------------------------------------------------------------------- /baseline/add_warc_locations.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | set -o pipefail 9 | 10 | # Directory in which this script is stored 11 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 12 | 13 | PREFIX=$(dirname ${1}/x | awk -F '/' '{print $NF}') 14 | 15 | zcat ${1}/*.links.gz | \ 16 | python ${DIR}/add_warc_locations.py --prefix=${PREFIX}/ /home/achim/stats/found_urls.txt \ 17 | > ${1}/found_locations.txt 18 | touch ${1}/found_locations.done 19 | -------------------------------------------------------------------------------- /baseline/ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | # ngrams = defaultdict(lambda: defaultdict:) 6 | 7 | if __name__ == "__main__": 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-n', type=int, default=4) 11 | args = parser.parse_args(sys.argv[1:]) 12 | 13 | for line in sys.stdin: 14 | filename, text = line.split("\t", 1) 15 | text = text.strip().split() 16 | for start in range(len(text) - args.n + 1): 17 | sys.stdout.write( 18 | "%s\t%s\n" % (" ".join(text[start:start + args.n]), filename)) 19 | -------------------------------------------------------------------------------- /metadata/leveldb/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS = -O2 -Wall -fmessage-length=0 2 | 3 | default: all 4 | 5 | insertkv: insertkv.o 6 | $(CXX) -o $@ $^ -pg -L/home/buck/net/build/leveldb -ltcmalloc -lleveldb -lpthread -lsnappy -Wl,-rpath=/home/buck/net/build/leveldb 7 | 8 | updatekv: updatekv.o 9 | $(CXX) -o $@ $^ -pg -L/home/buck/net/build/leveldb -ltcmalloc -lleveldb -lpthread -lsnappy -ljsoncpp -Wl,-rpath=/home/buck/net/build/leveldb 10 | 11 | 12 | %.o : %.cc *.h Makefile 13 | @echo "***" $< "***" 14 | $(CXX) $(CXXFLAGS) -I/home/buck/net/build/leveldb/include -c $< -o $@ 15 | 16 | .PHONY : all clean 17 | all: insertkv updatekv 18 | 19 | clean: 20 | rm -f insertkv updatekv 21 | -------------------------------------------------------------------------------- /baseline/dedupe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: dedupe.sh <input_dir> <output_dir> <src_extension> <tgt_extension> 4 | 5 | # Location of dedupe binary built from https://github.com/kpu/preprocess 6 | dedupebin=~/preprocess/bin/dedupe 7 | 8 | if [ ! -d $1 ] 9 | then 10 | echo "$1 is not a folder." 11 | exit 12 | fi 13 | 14 | if [ -e $2 ] 15 | then 16 | if [ ! -d $2 ] 17 | then 18 | echo "$2 is not a directory."
19 | exit 20 | fi 21 | else 22 | mkdir -p $2 23 | fi 24 | 25 | for insrc in $1/*.$3 26 | do 27 | intgt=${insrc%.$3}.$4 28 | outsrc=$2/${insrc##*/} 29 | outtgt=$2/${intgt##*/} 30 | $dedupebin $insrc $intgt $outsrc $outtgt 31 | done 32 | 33 | -------------------------------------------------------------------------------- /dicts/dict_convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Converts a bilingual dictionary from Eduard Barbu's format 5 | # to a word-based dictionary usable as a dictionary for Bitextor's 6 | # sentence aligner 7 | # 8 | # Usage: python dict_convert.py < input_dict > output_dict 9 | 10 | import sys 11 | 12 | # TBD: output language identifiers here; read from command line? 13 | for line in sys.stdin: 14 | line = line.rstrip('\r\n') 15 | entry = line.split('@#@') 16 | source = entry[0].split() 17 | if len(source) != 1: 18 | continue 19 | target = entry[1].split() 20 | if len(target) != 1: 21 | continue 22 | print source[0]+'\t'+target[0] 23 | 24 | -------------------------------------------------------------------------------- /crawlertest/httrack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# != 2 ]; then 4 | echo "Usage: $0 start-url output-directory" 5 | echo "* start-url: initial seed url" 6 | echo "* output-directory: write logs and downloads to this dir" 7 | exit 8 | fi 9 | 10 | httrack \ 11 | --connection-per-second=20 \ 12 | --sockets=10 \ 13 | --keep-alive \ 14 | --disable-security-limits \ 15 | --max-rate=500000 \ 16 | --display \ 17 | --verbose \ 18 | --advanced-progressinfo \ 19 | --continue \ 20 | --robots=0 \ 21 | --urlhack \ 22 | --index=0 \ 23 | -m \ 24 | -F 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36' \ 25 | -#L500000000 \ 26 | --skeleton \ 27 | --path=$2 \ 28 | $1 29 | -------------------------------------------------------------------------------- /docaligner/numpy_text2npz.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | import sys 5 | 6 | if __name__ == "__main__": 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('infile', help='input text format matrix') 10 | parser.add_argument('-outfile', help='output npz file') 11 | args = parser.parse_args(sys.argv[1:]) 12 | 13 | if args.infile.endswith('npz'): 14 | m = np.load(args.infile)['m'] 15 | else: 16 | m = np.loadtxt(args.infile) 17 | print "Loaded ", args.infile, " of shape ", m.shape 18 | if args.outfile: 19 | np.save(args.outfile, m) 20 | print "Wrote ", args.outfile, " of shape ", m.shape 21 | -------------------------------------------------------------------------------- /baseline/text2langstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | from collections import defaultdict 6 | 7 | magic_number = "df6fa1abb58549287111ba8d776733e9" 8 | 9 | 10 | langstats = defaultdict(int) 11 | 12 | lang = None 13 | for line in sys.stdin: 14 | if line.startswith(magic_number): 15 | # df6fa1abb58549287111ba8d776733e9 16 | # http://www.achpr.org/about/documentation-centre/ language:en 17 | # offset:200 bytes: 3424 18 | lang = line.split()[2].split(":")[-1] 19 | continue 20 | langstats[lang] +=
len(line.decode("utf-8").strip()) 21 | 22 | for lang, num_bytes in langstats.items(): 23 | sys.stdout.write("%s\t%d\n" % (lang, num_bytes)) 24 | -------------------------------------------------------------------------------- /metadata/extract_links.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | 9 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'` 10 | OUTFILE=${FILENAME/warc.wat.gz/links.xz} 11 | TMPDIR=./tmp/`hostname` 12 | mkdir -p ${TMPDIR} 13 | 14 | # Directory in which this script is stored 15 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 16 | 17 | curl -s --retry 5 $1 | gzip -cd | ${DIR}/links_from_wat.py | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} --parallel=2 | uniq | /home/buck/net/build/pxz/pxz -T 2 -9 -e > ${OUTFILE} && touch ${OUTFILE/links.xz/done} 18 | -------------------------------------------------------------------------------- /metadata/rocksdb/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS = -O3 -Wall -Wextra -Wsign-compare -Wshadow -Wno-unused-parameter -std=c++11 -fmessage-length=0 -Wfatal-errors -I/home/buck/net/build/rocksdb/include/ 2 | 3 | default: all 4 | 5 | insertkv: insertkv.o 6 | $(CXX) -o $@ $^ -static -L/home/buck/net/build/rocksdb -lrocksdb -lrt -lz -lbz2 -lpthread -lsnappy -Wl,-rpath=/home/buck/net/build/rocksdb 7 | 8 | updatekv: updatekv.o 9 | $(CXX) -o $@ $^ -static -L/home/buck/net/build/rocksdb -lrocksdb -lrt -lz -lbz2 -lpthread -lsnappy -ljsoncpp -Wl,-rpath=/home/buck/net/build/rocksdb 10 | 11 | 12 | %.o : %.cc *.h Makefile 13 | @echo "***" $< "***" 14 | $(CXX) $(CXXFLAGS) -c $< -o $@ 15 | 16 | .PHONY : all clean 17 | all: insertkv updatekv 18 | 19 | clean: 20 | rm -f insertkv updatekv 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.4.1 2 | cffi==0.9.2 3 | chardet==2.3.0 4 | CherryPy==3.7.0 5 | cld2-cffi==0.1.1 6 | cryptography==0.9 7 | Cython==0.23.1 8 | enum34==1.0.4 9 | html5lib==0.99999 10 | idna==2.0 11 | ipaddress==1.0.7 12 | ipython==3.1.0 13 | jsonrpc==1.2 14 | jsonrpclib==0.1.3 15 | langid==1.1.4.dev0 16 | leveldb==0.193 17 | line-profiler==1.0 18 | lxml==3.4.4 19 | munkres==1.0.7 20 | ndg-httpsclient==0.4.0 21 | nltk==3.0.2 22 | numpy==1.9.2 23 | pexpect==3.3 24 | pyasn1==0.1.7 25 | pycparser==2.14 26 | pyOpenSSL==0.15.1 27 | pyrocksdb==0.4 28 | python-Levenshtein==0.12.0 29 | regex==2015.9.15 30 | requests==2.7.0 31 | scipy==0.15.1 32 | simhash==1.6.2 33 | simplejson==3.8.0 34 | six==1.9.0 35 | tldextract==1.6 36 | Unidecode==0.4.18 37 | urltools==0.3.2 38 | xmltodict==0.9.2 39 | -------------------------------------------------------------------------------- /Results/results-ev12.txt: -------------------------------------------------------------------------------- 1 | Files Downloaded: 2 | ----------------------------------------------------- 3 | 65.xml Philosophy: Simplicity pays off http://tekstwerk.com/philosophy 4 | 60.xml Wir über uns: tekstwerk•com kennenlernen http://tekstwerk.com/de/wir-ueber-uns 5 | 52.xml Nieuws http://tekstwerk.com/de/news 6 | 63.xml Nieuws http://tekstwerk.com/news 7 | 19.xml Introduction: 
Learn more about tekstwerk•com http://tekstwerk.com/en/introduction 8 | 10.xml Portfolio http://www.tekstwerk.com/en/portfolio 9 | 33.xml Philosophie: Einfachheit lohnt http://www.tekstwerk.com/de/philosophie 10 | 55.xml Portfolio http://tekstwerk.com/de/portfolio 11 | 12 | Files Mapped: 13 | ----------------------------------------------------- 14 | 10.xml 55.xml 15 | 52.xml 63.xml 16 | 65.xml 33.xml 17 | 19.xml 60.xml 18 | -------------------------------------------------------------------------------- /docaligner/minmaxstd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | import sys 5 | import gzip 6 | 7 | if __name__ == "__main__": 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('infile', help='input text format matrix') 11 | parser.add_argument('-outfile', help='output npz file') 12 | args = parser.parse_args(sys.argv[1:]) 13 | 14 | fh = open(args.infile, 'r') 15 | if args.infile.endswith('.gz'): 16 | fh = gzip.open(args.infile) 17 | m = np.load(fh) 18 | 19 | print "Loaded ", args.infile, " of shape ", m.shape 20 | print "Std\t", np.std(m) 21 | print "Min\t", np.min(m) 22 | print "Max\t", np.max(m) 23 | print "Mean\t", np.average(m) 24 | print "Median\t", np.median(m) 25 | -------------------------------------------------------------------------------- /monolingual/collect_lang.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import argparse 5 | 6 | magic_number = 'df6fa1abb58549287111ba8d776733e9' 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-lang', help='language code') 10 | args = parser.parse_args() 11 | 12 | buf = [] 13 | keep = False 14 | for line in sys.stdin: 15 | if line.startswith(magic_number): 16 | if buf: 17 | assert keep is True 18 | sys.stdout.write("".join(buf)) 19 | 20 | keep = False 21 | buf = [] 22 | 23 | if "language:%s" % args.lang in line.strip().split(): 24 | keep = True 25 | 26 | if keep: 27 | buf.append(line) 28 | 29 | if buf: 30 | assert keep is True 31 | sys.stdout.write("".join(buf)) 32 | -------------------------------------------------------------------------------- /metadata/url_classifier/filter_languages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | 6 | if __name__ == "__main__": 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--valid', type=argparse.FileType(), 10 | help='file containing valid label, one per line') 11 | parser.add_argument('--default', 12 | help='replacement for invalid labels') 13 | args = parser.parse_args(sys.argv[1:]) 14 | 15 | valid = set([l.strip() for l in args.valid]) 16 | 17 | for line in sys.stdin: 18 | label, feats = line.split("\t", 1) 19 | if label not in valid: 20 | label = args.default 21 | if label: 22 | sys.stdout.write("%s\t%s" % (label, feats)) 23 | -------------------------------------------------------------------------------- /metadata/dump_keys.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import rocksdb 5 | 6 | if __name__ == "__main__": 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('db', 10 | help='path to rocksdb') 11 | 
parser.add_argument( 12 | '-outfile', help='output file', type=argparse.FileType('w'), 13 | default=sys.stdout) 14 | args = parser.parse_args() 15 | 16 | opts = rocksdb.Options() 17 | opts.create_if_missing = False 18 | opts.max_open_files = 100 19 | opts.num_levels = 6 20 | db = rocksdb.DB(args.db, opts, read_only=True) 21 | it = db.iterkeys() 22 | it.seek_to_first() 23 | for key in it: 24 | tld, url, crawl = key.split(" ", 2) 25 | args.outfile.write(url + "\n") 26 | -------------------------------------------------------------------------------- /metadata/extract_pdflinks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | set -o pipefail 9 | 10 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'` 11 | OUTFILE=${FILENAME/warc.wat.gz/pdflinks.xz} 12 | 13 | # don't let temporary sort files fill up local /tmp 14 | TMPDIR=./tmp/`hostname` 15 | mkdir -p ${TMPDIR} 16 | 17 | # Directory in which this script is stored 18 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | 20 | if [ ! -f ${OUTFILE/.xz/.done} ]; then 21 | curl -s --retry 5 $1 | \ 22 | gzip -cd | \ 23 | ${DIR}/links_from_wat.py -pdf | \ 24 | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} -u --parallel=2 | \ 25 | xz -9 -e \ 26 | > ${OUTFILE} 27 | touch ${OUTFILE/.xz/.done} 28 | fi 29 | 30 | 31 | -------------------------------------------------------------------------------- /html_convert/header.h: -------------------------------------------------------------------------------- 1 | #ifndef HEADER_H_ 2 | #define HEADER_H_ 3 | 4 | #include <string> 5 | 6 | #include "string_util.h" 7 | 8 | namespace { 9 | 10 | using std::string; 11 | 12 | class Header { 13 | public: 14 | explicit Header(const string& header) { 15 | for (const auto& value : StringUtil::Split(header, ' ')) { 16 | if (value.find("tld:") == 0) { 17 | tld_ = value.substr(4); 18 | } else if (value.find("uri:") == 0) { 19 | uri_ = value.substr(4); 20 | } else if (value.find("encoding:") == 0) { 21 | encoding_ = value.substr(9); 22 | } 23 | } 24 | } 25 | 26 | const string get_tld() const { return tld_; } 27 | const string get_uri() const { return uri_; } 28 | const string get_encoding() const { return encoding_; } 29 | 30 | private: 31 | string uri_; 32 | string tld_; 33 | string encoding_; 34 | }; 35 | 36 | } // namespace 37 | 38 | #endif /* HEADER_H_ */ 39 | -------------------------------------------------------------------------------- /docaligner/htmlprocessor.py: -------------------------------------------------------------------------------- 1 | from HTMLParser import HTMLParser 2 | 3 | 4 | class HTMLSequencer(HTMLParser): 5 | 6 | def __init__(self, length_function, growth_function): 7 | HTMLParser.__init__(self) 8 | self.sequence = [] 9 | self.length_function = length_function 10 | self.growth_function = growth_function 11 | 12 | def handle_starttag(self, tag, attrs): 13 | self.sequence.append("<%s>" % tag) 14 | 15 | def handle_endtag(self, tag): 16 | self.sequence.append("</%s>" % tag) 17 | 18 | def handle_data(self, data): 19 | if not data.strip(): 20 | return 21 | n = self.length_function(data) 22 | 23 | for n in range(int(self.growth_function(n))): 24 | self.sequence.append("%d" % n) 25 | 26 | def get_result(self): 27 | return self.sequence 28 | 29 | def reset(self): 30 |
HTMLParser.reset(self) 31 | self.sequence = [] 32 | -------------------------------------------------------------------------------- /metadata/extract_location.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads $1 which should be a CommonCrawl wat file, 4 | # extracts links, sorts by domainname and xzips the result 5 | 6 | # Exit as soon as any command fails 7 | set -e 8 | set -o pipefail 9 | 10 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'` 11 | OUTFILE=${FILENAME/warc.wat.gz/meta.xz} 12 | 13 | # don't let temporary sort files fill up local /tmp 14 | TMPDIR=./tmp/`hostname` 15 | mkdir -p ${TMPDIR} 16 | 17 | # Directory in which this script is stored 18 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | 20 | if [ ! -f ${OUTFILE/.xz/.done} ]; then 21 | curl -s --retry 5 $1 | \ 22 | gzip -cd | \ 23 | ${DIR}/links_from_wat.py -nolinks | \ 24 | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} --parallel=2 | \ 25 | uniq | \ 26 | /home/buck/net/build/pxz/pxz -T 2 -9 -e \ 27 | > ${OUTFILE} 28 | touch ${OUTFILE/.xz/.done} 29 | fi 30 | 31 | 32 | -------------------------------------------------------------------------------- /metadata/lang_stats/old2new_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import tldextract 6 | 7 | from cld2helper import read_cld2_languages 8 | 9 | 10 | def get_domain(netloc): 11 | extract = tldextract.extract(netloc) 12 | return ".".join((extract.domain, extract.suffix)).encode('idna') 13 | 14 | 15 | if __name__ == "__main__": 16 | import argparse 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('infile', nargs='?', type=argparse.FileType('r')) 20 | args = parser.parse_args() 21 | 22 | name2code, code2name = read_cld2_languages(args.infile) 23 | 24 | for line in sys.stdin: 25 | domain, language, num_bytes = line.split() 26 | assert language in code2name 27 | language = code2name[language] 28 | # domain = get_domain(domain) 29 | 30 | sys.stdout.write("%s %s %d\n" % (domain, language, int(num_bytes))) 31 | 32 | 33 | # en.wikipedia.org xx-Kali 274 34 | -------------------------------------------------------------------------------- /docaligner/page.py: -------------------------------------------------------------------------------- 1 | class Page(object): 2 | 3 | def __init__(self, url, html, text, mime_type, 4 | encoding, french, english, english_mt): 5 | self.url = url 6 | self.html = html 7 | self.text = text 8 | self.mime_type = mime_type 9 | self.encoding = encoding 10 | self.french = french 11 | self.english = english 12 | self.english_mt = english_mt 13 | 14 | def __str__(self): 15 | res = [] 16 | res.append("--Page--") 17 | res.append("url : %s" % self.url) 18 | res.append("html : %s" % self.html) 19 | res.append("text : %s" % self.text.encode('utf-8')) 20 | res.append("mime_type : %s" % self.mime_type) 21 | res.append("encoding : %s" % self.encoding) 22 | res.append("french : %s" % self.french.encode('utf-8')) 23 | res.append("english : %s" % self.english.encode('utf-8')) 24 | res.append("english_mt : %s" % self.english_mt.encode('utf-8')) 25 | return "\n".join(res) 26 | -------------------------------------------------------------------------------- /crawlertest/bitextor/map_urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 
| 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('mapping', help='mapping from filename to url', 9 | type=argparse.FileType('r')) 10 | args = parser.parse_args(sys.argv[1:]) 11 | 12 | mapping = {} 13 | for line in args.mapping: 14 | filename, url = line.strip().split() 15 | assert filename not in mapping, "Repeated value: %s\n" % line 16 | mapping[filename] = url 17 | 18 | for line in sys.stdin: 19 | filesource, filetarget = line.strip().split() 20 | if filesource in mapping: 21 | if filetarget in mapping: 22 | print mapping[filesource] + "\t" + mapping[filetarget] 23 | else: 24 | sys.stderr.write( 25 | "Target file mapping not found:" + filetarget + "\n") 26 | else: 27 | sys.stderr.write( 28 | "Source file mapping not found:" + filesource + "\n") 29 | -------------------------------------------------------------------------------- /crawlertest/filename2url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Extract original URL from httrack webdir 6 | """ 7 | 8 | import sys 9 | import re 10 | 11 | # Example line we're looking for: 12 | #