├── __init__.py
├── baseline
│   ├── __init__.py
│   ├── util
│   │   └── __init__.py
│   ├── filter_emty_text_from_lett.py
│   ├── add_warc_locations.sh
│   ├── ngrams.py
│   ├── dedupe.sh
│   ├── text2langstats.py
│   ├── lett_viewer.py
│   ├── bitextor_util
│   │   ├── bitextorutil.py
│   │   ├── show_bitextor_docs.py
│   │   ├── wordcounts.py
│   │   └── lett2ridx_combine.py
│   ├── dumptar.py
│   ├── filter_tmx.sh
│   ├── strip_headers.py
│   ├── download_domain.py
│   ├── corpus_by_domain.py
│   ├── download_candidates.py
│   ├── download_and_align.sh
│   ├── lett2corpus_lowmem.sh
│   ├── find_pairs.py
│   ├── corpus2corpus.py
│   ├── lett2ridx.py
│   ├── filter_sent.py
│   ├── candidates2bitextor.py
│   ├── eval_sent.py
│   ├── check_lett_lang.py
│   ├── add_warc_locations.py
│   ├── collect_domains.py
│   ├── url_matching.py
│   ├── locate_candidates.py
│   ├── tar2ett.py
│   ├── dictionary.md
│   ├── html2text.py
│   ├── candidates2corpus.py
│   ├── strip_language_from_uri.py
│   └── score_ngrams.py
├── metadata
│   ├── __init__.py
│   ├── lang_stats
│   │   ├── __init__.py
│   │   ├── old2new_stats.py
│   │   ├── cld2helper.py
│   │   ├── accumulate_langstats.py
│   │   ├── percent_to_bytes.py
│   │   ├── accumulate_stats.py
│   │   └── join_stats.py
│   ├── leveldb
│   │   ├── Readme
│   │   ├── Makefile
│   │   ├── insertkv.cc
│   │   └── updatekv.cc
│   ├── meta_data_kv.sh
│   ├── drop_links_from_json.py
│   ├── extract_monolingual.sh
│   ├── insert_kv.py
│   ├── extract_links.sh
│   ├── rocksdb
│   │   ├── Makefile
│   │   ├── rdb_options.h
│   │   ├── insertkv.cc
│   │   └── updatekv.cc
│   ├── url_classifier
│   │   ├── filter_languages.py
│   │   └── filter_features.py
│   ├── dump_keys.py
│   ├── extract_pdflinks.sh
│   ├── extract_location.sh
│   ├── query_md.py
│   ├── langstats2kv.py
│   ├── count_uniq_urls.py
│   ├── read_wet.py
│   ├── add_lang_stats.py
│   └── links_from_wat.py
├── merge
│   └── metadata
│       ├── __init__.py
│       ├── lang_stats
│       │   ├── __init__.py
│       │   └── percent_to_bytes.py
│       ├── drop_links_from_json.py
│       ├── read_wet.py
│       └── add_lang_stats.py
├── .gitignore
├── Results
│   ├── .DS_Store
│   ├── results_ev6.txt
│   ├── results-ev12.txt
│   ├── results-ev7.txt
│   ├── results_ev4.xml
│   └── results_ev9.txt
├── common_crawl_process.png
├── dicts
│   ├── fix_encoding.py
│   ├── dict_convert.py
│   └── filter_giza.py
├── docaligner
│   ├── numpy_text2npz.sh
│   ├── numpy_text2npz.py
│   ├── minmaxstd.py
│   ├── htmlprocessor.py
│   ├── page.py
│   ├── counts2idf.py
│   ├── nn.py
│   ├── hash_lines.py
│   ├── split_long_short.py
│   ├── matching.py
│   ├── table4paper.py
│   ├── map_translations.py
│   ├── eval_bitextor.py
│   ├── tokenizer.py
│   ├── ratio.py
│   ├── extract.sh
│   └── extract_dev_feats.sh
├── crawlertest
│   ├── bitextor
│   │   ├── extract_urls.py
│   │   └── map_urls.py
│   ├── httrack.sh
│   ├── filename2url.py
│   ├── httrack_pdf.sh
│   └── bitextor_notes.txt
├── monolingual
│   ├── README.md
│   └── collect_lang.py
├── html_convert
│   ├── example
│   │   ├── example.html
│   │   └── example.html~
│   ├── header.h
│   ├── Makefile
│   ├── anything_to_utf8.py
│   ├── string_util.h
│   └── html2text.cpp
├── requirements.txt
├── docalign_task
│   └── eval_langid.py
├── README.md
├── INSTALL.md
└── parseXML.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baseline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metadata/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baseline/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/merge/metadata/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metadata/lang_stats/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/merge/metadata/lang_stats/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .history.buck
2 | *.pyc
3 | *.xz
4 | *.gz
5 | dc.sublime-workspace
--------------------------------------------------------------------------------
/Results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/.DS_Store
--------------------------------------------------------------------------------
/common_crawl_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/common_crawl_process.png
--------------------------------------------------------------------------------
/dicts/fix_encoding.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 |
6 | for line in sys.stdin:
7 | line = line.decode("utf-8").encode("iso-8859-1") # .decode("utf-8")
8 | line = line.strip()
9 |
10 | print line.strip()
11 |
--------------------------------------------------------------------------------
/metadata/leveldb/Readme:
--------------------------------------------------------------------------------
1 | Building instructions:
2 |
3 | 1. Install libsnappy-dev and libgoogle-perftools-dev
4 | 2. Get leveldb
5 |
6 | cd build
7 | git clone git@github.com:google/leveldb.git
8 | cd leveldb
9 | make
10 |
11 | 3. Modify Makefile to point to leveldb includes
12 |
13 |
--------------------------------------------------------------------------------
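
The Readme above covers building the C++ insertkv/updatekv tools. The same LevelDB store is also driven from Python elsewhere in the repository (metadata/insert_kv.py, with leveldb==0.193 pinned in requirements.txt). A minimal sketch of that insertion path using the py-leveldb bindings; the database path and the key/value strings are placeholders for illustration only:

    import leveldb

    # Open (creating if missing) a LevelDB directory; the path is illustrative.
    db = leveldb.LevelDB("./db_kv")

    # Store and read back one key/value pair, mirroring what insert_kv.py
    # does for every tab-separated line on stdin.
    db.Put("org.example example.org/index.html 2013_20", '{"language": "en"}')
    print(db.Get("org.example example.org/index.html 2013_20"))
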
/metadata/meta_data_kv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #$1 /fs/gna0/buck/cc/links
4 | #$2 2013_20
5 | #$3 1368701562534
6 |
7 | # find $1/$2/$3/ | grep internal.links.gz | xargs zcat |\
8 | zcat $1/$2/$3/*internal.links.gz | \
9 | /home/buck/net/build/DataCollection/metadata/metadatabase.py $2 $3 | \
10 | gzip -9 > $1/$2/$3/db_kv.gz
--------------------------------------------------------------------------------
/docaligner/numpy_text2npz.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit as soon as any command fails
4 | set -e
5 | set -o pipefail
6 |
7 | OUTFILE=`basename $1`.npz
8 | DONEFILE=npy/${OUTFILE}.done
9 |
10 | if [ ! -f ${DONEFILE} ]; then
11 | nice python /home/buck/net/build/DataCollection/docaligner/numpy_text2npz.py $1 -out npy/${OUTFILE}
12 | touch ${DONEFILE}
13 | fi
14 |
--------------------------------------------------------------------------------
/baseline/filter_emty_text_from_lett.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """ Removes lines from .lett file where the last column containing base64
5 | encoded text is empty. This otherwise leads to problems downstream. """
6 |
7 | import sys
8 |
9 | for line in sys.stdin:
10 | if line.split("\t")[-1].strip():
11 | sys.stdout.write(line)
12 |
--------------------------------------------------------------------------------
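
The docstring above refers to the .lett format: tab-separated records whose last column holds base64-encoded document text. A minimal sketch of unpacking that column in the same spirit as the filter above; only the "last column is base64 text" part is stated in the docstring, the rest of the usual Bitextor column layout (language, MIME type, encoding, URL, base64 HTML, base64 text) is an assumption:

    import base64
    import sys

    for line in sys.stdin:
        cols = line.rstrip("\n").split("\t")
        # The docstring only guarantees that the final column is base64 text;
        # the earlier columns are assumed to follow the Bitextor .lett layout.
        text = base64.b64decode(cols[-1]) if cols[-1].strip() else ""
        if text.strip():
            sys.stdout.write(line)
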
/metadata/drop_links_from_json.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
 5 | Reads "domain json" lines from stdin, drops the "links" entry from each
 6 | JSON record and writes the reduced record back to stdout
7 | """
8 |
9 | import sys
10 | import json
11 |
12 | for line in sys.stdin:
13 | domain, data = line.split(" ", 1)
14 | data = json.loads(data)
15 | data.pop("links")
16 | print domain, json.dumps(data)
17 |
--------------------------------------------------------------------------------
/merge/metadata/drop_links_from_json.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
 5 | Reads "domain json" lines from stdin, drops the "links" entry from each
 6 | JSON record and writes the reduced record back to stdout
7 | """
8 |
9 | import sys
10 | import json
11 |
12 | for line in sys.stdin:
13 | domain, data = line.split(" ", 1)
14 | data = json.loads(data)
15 | data.pop("links")
16 | print domain, json.dumps(data)
17 |
--------------------------------------------------------------------------------
/Results/results_ev6.txt:
--------------------------------------------------------------------------------
1 | Files Downloaded:
2 | -----------------------------------------------------
3 | 1.xml AB Coring http://ab-carottage.fr/en/
4 | 6.xml Contact "AB Coring http://ab-carottage.fr/en/contact/
5 | 24.xml Contact « AB Carottage http://ab-carottage.fr/contact/
6 | 2.xml AB Carottage http://ab-carottage.fr/
7 |
8 | Files Mapped:
9 | -----------------------------------------------------
10 | 6.xml 24.xml
11 | 2.xml 1.xml
12 |
--------------------------------------------------------------------------------
/crawlertest/bitextor/extract_urls.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import sys
4 | import re
5 |
6 | if len(sys.argv) != 2:
7 | print "Usage: python "+sys.argv[0]+" language_id"
8 | exit()
9 |
10 | langid = sys.argv[1]
11 |
12 | r = re.compile("(\S*"+langid+"\S*?)\t(.*?)\t")
13 | for line in sys.stdin:
14 | m = r.match(line)
15 | if m:
16 | (urlsource, urltarget) = m.group(1,2)
17 | print urlsource+"\t"+urltarget
18 |
19 |
--------------------------------------------------------------------------------
/metadata/extract_monolingual.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 | set -o pipefail
5 |
6 | FILENAME=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}')
7 |
8 | if [ ! -f ${FILENAME}.done ]; then
9 | curl -s $1 | gzip -cd | \
10 | /fs/nas/heithrun0/commoncrawl/langsplit/bin/read_wet.py | \
11 | /fs/nas/heithrun0/commoncrawl/langsplit/bin/langsplit --printchunks 2> /dev/null | \
12 | xz -9 -e > ${FILENAME}.langsplit.xz
13 | touch ${FILENAME}.done
14 | fi
15 |
--------------------------------------------------------------------------------
/monolingual/README.md:
--------------------------------------------------------------------------------
 1 | For monolingual [Common Crawl](http://commoncrawl.org) data and code to process it, please refer to these resources:
2 | * [University of Edinburgh N-gram site](http://statmt.org/ngrams)
3 | * Code to process corpora: https://github.com/kpu/preprocess
4 | * Code to produce raw monolingual files from CommonCrawl: https://github.com/treigerm/CommonCrawlProcessing
5 | * Alternative monolingual data extraction under development in ParaCrawl project: https://github.com/paracrawl/extractor
6 |
--------------------------------------------------------------------------------
/metadata/insert_kv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import leveldb
5 |
6 | if __name__ == "__main__":
7 | errors = 0
8 | import argparse
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('db', help='leveldb root directory')
12 | args = parser.parse_args(sys.argv[1:])
13 |
14 | db = leveldb.LevelDB(args.db)
15 |
16 | for line in sys.stdin:
17 | k, v = line.rstrip().split("\t", 1)
18 | db.Put(k, v)
19 |
20 | sys.stderr.write("%s" % db.GetStats())
21 |
--------------------------------------------------------------------------------
/html_convert/example/example.html:
--------------------------------------------------------------------------------
1 | df6fa1abb58549287111ba8d776733e9 http://example.com/site.html
2 |
3 | This is some English text and it should hopefully be classified at such.
4 |
5 | Single.
6 | Word.
7 | Lines.
8 |
9 |
10 | Have some links:
11 | Google
12 | LREC paper .
13 |
14 | Am Ende finden wir dann noch etwas deutschen Text, wobei nicht klar ist ob die Menge ausreicht um ihn vom englishen zu unterscheiden.
15 |
16 |
--------------------------------------------------------------------------------
/html_convert/example/example.html~:
--------------------------------------------------------------------------------
1 | df6fa1abb58549287111ba8d776733e9 http://example.com/site.html
2 |
3 | This is some English text and it should hopefully be classified at such.
4 |
5 | Single.
6 | Word.
7 | Lines.
8 |
9 |
10 | Have some links:
11 | Google
12 | LREC paper .
13 |
14 | Am Ende finden wir dann noch etwas deutschen Text, wobei nicht klar ist ob die Menge ausreicht um ihn vom englishen zu unterscheiden.
15 |
16 |
--------------------------------------------------------------------------------
/baseline/add_warc_locations.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
 3 | # Reads the extracted links in $1/*.links.gz, annotates them with WARC
 4 | # file locations via add_warc_locations.py and writes $1/found_locations.txt
5 |
6 | # Exit as soon as any command fails
7 | set -e
8 | set -o pipefail
9 |
10 | # Directory in which this script is stored
11 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
12 |
13 | PREFIX=$(dirname ${1}/x | awk -F '/' '{print $NF}')
14 |
15 | zcat ${1}/*.links.gz | \
16 | python ${DIR}/add_warc_locations.py --prefix=${PREFIX}/ /home/achim/stats/found_urls.txt \
17 | > ${1}/found_locations.txt
18 | touch ${1}/found_locations.done
19 |
--------------------------------------------------------------------------------
/baseline/ngrams.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | # ngrams = defaultdict(lambda: defaultdict:)
6 |
7 | if __name__ == "__main__":
8 | import argparse
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('-n', type=int, default=4)
11 | args = parser.parse_args(sys.argv[1:])
12 |
13 | for line in sys.stdin:
14 | filename, text = line.split("\t", 1)
15 | text = text.strip().split()
16 | for start in range(len(text) - args.n + 1):
17 | sys.stdout.write(
18 | "%s\t%s\n" % (" ".join(text[start:start + args.n]), filename))
19 |
--------------------------------------------------------------------------------
/metadata/leveldb/Makefile:
--------------------------------------------------------------------------------
1 | CXXFLAGS = -O2 -Wall -fmessage-length=0
2 |
3 | default: all
4 |
5 | insertkv: insertkv.o
6 | $(CXX) -o $@ $^ -pg -L/home/buck/net/build/leveldb -ltcmalloc -lleveldb -lpthread -lsnappy -Wl,-rpath=/home/buck/net/build/leveldb
7 |
8 | updatekv: updatekv.o
9 | $(CXX) -o $@ $^ -pg -L/home/buck/net/build/leveldb -ltcmalloc -lleveldb -lpthread -lsnappy -ljsoncpp -Wl,-rpath=/home/buck/net/build/leveldb
10 |
11 |
12 | %.o : %.cc *.h Makefile
13 | @echo "***" $< "***"
14 | $(CXX) $(CXXFLAGS) -I/home/buck/net/build/leveldb/include -c $< -o $@
15 |
16 | .PHONY : all clean
17 | all: insertkv updatekv
18 |
19 | clean:
20 | rm -f insertkv updatekv
21 |
--------------------------------------------------------------------------------
/baseline/dedupe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
 3 | # Usage: dedupe.sh <input_dir> <output_dir> <src_extension> <tgt_extension>
4 |
5 | # Location of dedupe binary built from https://github.com/kpu/preprocess
6 | dedupebin=~/preprocess/bin/dedupe
7 |
8 | if [ ! -d $1 ]
9 | then
10 | echo "$1 is not a folder."
11 | exit
12 | fi
13 |
14 | if [ -e $2 ]
15 | then
16 | if [ ! -d $2 ]
17 | then
18 | echo "$2 is not a directory."
19 | exit
20 | fi
21 | else
22 | mkdir -p $2
23 | fi
24 |
25 | for insrc in $1/*.$3
26 | do
27 | intgt=${insrc%.$3}.$4
28 | outsrc=$2/${insrc##*/}
29 | outtgt=$2/${intgt##*/}
30 | $dedupebin $insrc $intgt $outsrc $outtgt
31 | done
32 |
33 |
--------------------------------------------------------------------------------
/dicts/dict_convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Converts a bilingual dictionary from Eduard Barbu's format
5 | # to a word-based dictionary usable as a dictionary for Bitextor's
6 | # sentence aligner
7 | #
8 | # Usage: python dict_convert.py < input_dict > output_dict
9 |
10 | import sys
11 |
12 | # TBD: output language identifiers here; read from command line?
13 | for line in sys.stdin:
14 | line = line.rstrip('\r\n')
15 | entry = line.split('@#@')
16 | source = entry[0].split()
17 | if len(source) != 1:
18 | continue
19 | target = entry[1].split()
20 | if len(target) != 1:
21 | continue
22 | print source[0]+'\t'+target[0]
23 |
24 |
--------------------------------------------------------------------------------
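
The header comment above describes the conversion: each input entry holds a source and a target phrase separated by '@#@', and only single-word pairs are kept as tab-separated lines for Bitextor's sentence aligner. A tiny worked example with invented entries, run through the same filtering logic:

    # Invented sample entries in the '@#@'-separated input format.
    sample = [
        "house@#@maison",
        "good morning@#@bonjour",  # multi-word source: skipped
        "dog@#@chien",
    ]

    for entry in sample:
        source, target = entry.split("@#@")
        if len(source.split()) == 1 and len(target.split()) == 1:
            print("%s\t%s" % (source, target))

    # Output:
    # house    maison
    # dog      chien
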
/crawlertest/httrack.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
 3 | if [ $# != 2 ]; then
4 | echo "Usage: $0 start-url output-directory"
5 | echo "* start-url: initial seed url"
6 | echo "* output-directory: write logs and downloads to this dir"
7 | exit
8 | fi
9 |
10 | httrack \
11 | --connection-per-second=20 \
12 | --sockets=10 \
13 | --keep-alive \
14 | --disable-security-limits \
15 | --max-rate=500000 \
16 | --display \
17 | --verbose \
18 | --advanced-progressinfo \
19 | --continue \
20 | --robots=0 \
21 | --urlhack \
22 | --index=0 \
23 | -m \
24 | -F 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36' \
25 | -#L500000000 \
26 | --skeleton \
27 | --path=$2 \
28 | $1
29 |
--------------------------------------------------------------------------------
/docaligner/numpy_text2npz.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import numpy as np
4 | import sys
5 |
6 | if __name__ == "__main__":
7 | import argparse
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('infile', help='input text format matrix')
10 | parser.add_argument('-outfile', help='output npz file')
11 | args = parser.parse_args(sys.argv[1:])
12 |
13 | if args.infile.endswith('npz'):
14 | m = np.load(args.infile)['m']
15 | else:
16 | m = np.loadtxt(args.infile)
17 | print "Loaded ", args.infile, " of shape ", m.shape
18 | if args.outfile:
19 |         np.savez(args.outfile, m=m)
20 | print "Wrote ", args.outfile, " of shape ", m.shape
21 |
--------------------------------------------------------------------------------
/baseline/text2langstats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | from collections import defaultdict
6 |
7 | magic_number = "df6fa1abb58549287111ba8d776733e9"
8 |
9 |
10 | langstats = defaultdict(int)
11 |
12 | lang = None
13 | for line in sys.stdin:
14 | if line.startswith(magic_number):
15 | # df6fa1abb58549287111ba8d776733e9
16 | # http://www.achpr.org/about/documentation-centre/ language:en
17 | # offset:200 bytes: 3424
18 | lang = line.split()[2].split(":")[-1]
19 | continue
20 | langstats[lang] += len(line.decode("utf-8").strip())
21 |
22 | for lang, num_bytes in langstats.items():
23 | sys.stdout.write("%s\t%d\n" % (lang, num_bytes))
24 |
--------------------------------------------------------------------------------
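
Both this script and monolingual/collect_lang.py key on the langsplit header line documented in the inline comment above: the magic number, the source URL, and space-separated key:value fields such as language:en. A minimal parsing sketch under that assumption (the comment shows "bytes: 3424" with a stray space; the sketch expects key:value tokens without internal spaces):

    MAGIC = "df6fa1abb58549287111ba8d776733e9"

    def parse_langsplit_header(line):
        """Return (url, {field: value}) for a header line, or None otherwise."""
        if not line.startswith(MAGIC):
            return None
        parts = line.split()
        fields = dict(p.split(":", 1) for p in parts[2:] if ":" in p)
        return parts[1], fields

    # parse_langsplit_header(
    #     "df6fa1abb58549287111ba8d776733e9 http://www.achpr.org/ language:en offset:200")
    # returns ("http://www.achpr.org/", {"language": "en", "offset": "200"})
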
/metadata/extract_links.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Downloads $1 which should be a CommonCrawl wat file,
 4 | # extracts links, sorts by domain name and xzips the result
5 |
6 | # Exit as soon as any command fails
7 | set -e
8 |
9 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'`
10 | OUTFILE=${FILENAME/warc.wat.gz/links.xz}
11 | TMPDIR=./tmp/`hostname`
12 | mkdir -p ${TMPDIR}
13 |
14 | # Directory in which this script is stored
15 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
16 |
17 | curl -s --retry 5 $1 | gzip -cd | ${DIR}/links_from_wat.py | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} --parallel=2 | uniq | /home/buck/net/build/pxz/pxz -T 2 -9 -e > ${OUTFILE} && touch ${OUTFILE/links.xz/done}
18 |
--------------------------------------------------------------------------------
/metadata/rocksdb/Makefile:
--------------------------------------------------------------------------------
1 | CXXFLAGS = -O3 -Wall -Wextra -Wsign-compare -Wshadow -Wno-unused-parameter -std=c++11 -fmessage-length=0 -Wfatal-errors -I/home/buck/net/build/rocksdb/include/
2 |
3 | default: all
4 |
5 | insertkv: insertkv.o
6 | $(CXX) -o $@ $^ -static -L/home/buck/net/build/rocksdb -lrocksdb -lrt -lz -lbz2 -lpthread -lsnappy -Wl,-rpath=/home/buck/net/build/rocksdb
7 |
8 | updatekv: updatekv.o
9 | $(CXX) -o $@ $^ -static -L/home/buck/net/build/rocksdb -lrocksdb -lrt -lz -lbz2 -lpthread -lsnappy -ljsoncpp -Wl,-rpath=/home/buck/net/build/rocksdb
10 |
11 |
12 | %.o : %.cc *.h Makefile
13 | @echo "***" $< "***"
14 | $(CXX) $(CXXFLAGS) -c $< -o $@
15 |
16 | .PHONY : all clean
17 | all: insertkv updatekv
18 |
19 | clean:
20 | rm -f insertkv updatekv
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.4.1
2 | cffi==0.9.2
3 | chardet==2.3.0
4 | CherryPy==3.7.0
5 | cld2-cffi==0.1.1
6 | cryptography==0.9
7 | Cython==0.23.1
8 | enum34==1.0.4
9 | html5lib==0.99999
10 | idna==2.0
11 | ipaddress==1.0.7
12 | ipython==3.1.0
13 | jsonrpc==1.2
14 | jsonrpclib==0.1.3
15 | langid==1.1.4.dev0
16 | leveldb==0.193
17 | line-profiler==1.0
18 | lxml==3.4.4
19 | munkres==1.0.7
20 | ndg-httpsclient==0.4.0
21 | nltk==3.0.2
22 | numpy==1.9.2
23 | pexpect==3.3
24 | pyasn1==0.1.7
25 | pycparser==2.14
26 | pyOpenSSL==0.15.1
27 | pyrocksdb==0.4
28 | python-Levenshtein==0.12.0
29 | regex==2015.9.15
30 | requests==2.7.0
31 | scipy==0.15.1
32 | simhash==1.6.2
33 | simplejson==3.8.0
34 | six==1.9.0
35 | tldextract==1.6
36 | Unidecode==0.4.18
37 | urltools==0.3.2
38 | xmltodict==0.9.2
39 |
--------------------------------------------------------------------------------
/Results/results-ev12.txt:
--------------------------------------------------------------------------------
1 | Files Downloaded:
2 | -----------------------------------------------------
3 | 65.xml Philosophy: Simplicity pays off http://tekstwerk.com/philosophy
4 | 60.xml Wir über uns: tekstwerk•com kennenlernen http://tekstwerk.com/de/wir-ueber-uns
5 | 52.xml Nieuws http://tekstwerk.com/de/news
6 | 63.xml Nieuws http://tekstwerk.com/news
7 | 19.xml Introduction: Learn more about tekstwerk•com http://tekstwerk.com/en/introduction
8 | 10.xml Portfolio http://www.tekstwerk.com/en/portfolio
9 | 33.xml Philosophie: Einfachheit lohnt http://www.tekstwerk.com/de/philosophie
10 | 55.xml Portfolio http://tekstwerk.com/de/portfolio
11 |
12 | Files Mapped:
13 | -----------------------------------------------------
14 | 10.xml 55.xml
15 | 52.xml 63.xml
16 | 65.xml 33.xml
17 | 19.xml 60.xml
18 |
--------------------------------------------------------------------------------
/docaligner/minmaxstd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import numpy as np
4 | import sys
5 | import gzip
6 |
7 | if __name__ == "__main__":
8 | import argparse
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('infile', help='input text format matrix')
11 | parser.add_argument('-outfile', help='output npz file')
12 | args = parser.parse_args(sys.argv[1:])
13 |
14 | fh = open(args.infile, 'r')
15 | if args.infile.endswith('.gz'):
16 | fh = gzip.open(args.infile)
17 | m = np.load(fh)
18 |
19 | print "Loaded ", args.infile, " of shape ", m.shape
20 | print "Std\t", np.std(m)
21 | print "Min\t", np.min(m)
22 | print "Max\t", np.max(m)
23 | print "Mean\t", np.average(m)
24 | print "Median\t", np.median(m)
25 |
--------------------------------------------------------------------------------
/monolingual/collect_lang.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import argparse
5 |
6 | magic_number = 'df6fa1abb58549287111ba8d776733e9'
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('-lang', help='language code')
10 | args = parser.parse_args()
11 |
12 | buf = []
13 | keep = False
14 | for line in sys.stdin:
15 | if line.startswith(magic_number):
16 | if buf:
17 | assert keep is True
18 | sys.stdout.write("".join(buf))
19 |
20 | keep = False
21 | buf = []
22 |
23 | if "language:%s" % args.lang in line.strip().split():
24 | keep = True
25 |
26 | if keep:
27 | buf.append(line)
28 |
29 | if buf:
30 | assert keep is True
31 | sys.stdout.write("".join(buf))
32 |
--------------------------------------------------------------------------------
/metadata/url_classifier/filter_languages.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 |
6 | if __name__ == "__main__":
7 | import argparse
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--valid', type=argparse.FileType(),
10 | help='file containing valid label, one per line')
11 | parser.add_argument('--default',
12 | help='replacement for invalid labels')
13 | args = parser.parse_args(sys.argv[1:])
14 |
15 | valid = set([l.strip() for l in args.valid])
16 |
17 | for line in sys.stdin:
18 | label, feats = line.split("\t", 1)
19 | if label not in valid:
20 | label = args.default
21 | if label:
22 | sys.stdout.write("%s\t%s" % (label, feats))
23 |
--------------------------------------------------------------------------------
/metadata/dump_keys.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import rocksdb
5 |
6 | if __name__ == "__main__":
7 | import argparse
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('db',
10 | help='path to rocksdb')
11 | parser.add_argument(
12 | '-outfile', help='output file', type=argparse.FileType('w'),
13 | default=sys.stdout)
14 | args = parser.parse_args()
15 |
16 | opts = rocksdb.Options()
17 | opts.create_if_missing = False
18 | opts.max_open_files = 100
19 | opts.num_levels = 6
20 | db = rocksdb.DB(args.db, opts, read_only=True)
21 | it = db.iterkeys()
22 | it.seek_to_first()
23 | for key in it:
24 | tld, url, crawl = key.split(" ", 2)
25 | args.outfile.write(url + "\n")
26 |
--------------------------------------------------------------------------------
/metadata/extract_pdflinks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Downloads $1 which should be a CommonCrawl wat file,
 4 | # extracts PDF links, sorts by domain name and xzips the result
5 |
6 | # Exit as soon as any command fails
7 | set -e
8 | set -o pipefail
9 |
10 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'`
11 | OUTFILE=${FILENAME/warc.wat.gz/pdflinks.xz}
12 |
13 | # don't let temporary sort files fill up local /tmp
14 | TMPDIR=./tmp/`hostname`
15 | mkdir -p ${TMPDIR}
16 |
17 | # Directory in which this script is stored
18 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
19 |
20 | if [ ! -f ${OUTFILE/.xz/.done} ]; then
21 | curl -s --retry 5 $1 | \
22 | gzip -cd | \
23 | ${DIR}/links_from_wat.py -pdf | \
24 | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} -u --parallel=2 | \
25 | xz -9 -e \
26 | > ${OUTFILE}
27 | touch ${OUTFILE/.xz/.done}
28 | fi
29 |
30 |
31 |
--------------------------------------------------------------------------------
/html_convert/header.h:
--------------------------------------------------------------------------------
1 | #ifndef HEADER_H_
2 | #define HEADER_H_
3 |
 4 | #include <string>
5 |
6 | #include "string_util.h"
7 |
8 | namespace {
9 |
10 | using std::string;
11 |
12 | class Header {
13 | public:
14 | explicit Header(const string& header) {
15 | for (const auto& value : StringUtil::Split(header, ' ')) {
16 | if (value.find("tld:") == 0) {
17 | tld_ = value.substr(4);
18 | } else if (value.find("uri:") == 0) {
19 | uri_ = value.substr(4);
20 | } else if (value.find("encoding:") == 0) {
21 | encoding_ = value.substr(9);
22 | }
23 | }
24 | }
25 |
26 | const string get_tld() const { return tld_; }
27 | const string get_uri() const { return uri_; }
28 | const string get_encoding() const { return encoding_; }
29 |
30 | private:
31 | string uri_;
32 | string tld_;
33 | string encoding_;
34 | };
35 |
36 | } // namespace
37 |
38 | #endif /* HEADER_H_ */
39 |
--------------------------------------------------------------------------------
/docaligner/htmlprocessor.py:
--------------------------------------------------------------------------------
1 | from HTMLParser import HTMLParser
2 |
3 |
4 | class HTMLSequencer(HTMLParser):
5 |
6 | def __init__(self, length_function, growth_function):
7 | HTMLParser.__init__(self)
8 | self.sequence = []
9 | self.length_function = length_function
10 | self.growth_function = growth_function
11 |
12 | def handle_starttag(self, tag, attrs):
13 | self.sequence.append("<%s>" % tag)
14 |
15 | def handle_endtag(self, tag):
16 |         self.sequence.append("</%s>" % tag)
17 |
18 | def handle_data(self, data):
19 | if not data.strip():
20 | return
21 | n = self.length_function(data)
22 |
23 | for n in range(int(self.growth_function(n))):
24 | self.sequence.append("%d" % n)
25 |
26 | def get_result(self):
27 | return self.sequence
28 |
29 | def reset(self):
30 | HTMLParser.reset(self)
31 | self.sequence = []
32 |
--------------------------------------------------------------------------------
/metadata/extract_location.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Downloads $1 which should be a CommonCrawl wat file,
 4 | # extracts per-URL location metadata (no links), sorts by domain name and xzips the result
5 |
6 | # Exit as soon as any command fails
7 | set -e
8 | set -o pipefail
9 |
10 | FILENAME=`echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF) }'`
11 | OUTFILE=${FILENAME/warc.wat.gz/meta.xz}
12 |
13 | # don't let temporary sort files fill up local /tmp
14 | TMPDIR=./tmp/`hostname`
15 | mkdir -p ${TMPDIR}
16 |
17 | # Directory in which this script is stored
18 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
19 |
20 | if [ ! -f ${OUTFILE/.xz/.done} ]; then
21 | curl -s --retry 5 $1 | \
22 | gzip -cd | \
23 | ${DIR}/links_from_wat.py -nolinks | \
24 | sort -t" " -S500M -k1,1 --compress-program=pigz --temporary-directory=${TMPDIR} --parallel=2 | \
25 | uniq | \
26 | /home/buck/net/build/pxz/pxz -T 2 -9 -e \
27 | > ${OUTFILE}
28 | touch ${OUTFILE/.xz/.done}
29 | fi
30 |
31 |
32 |
--------------------------------------------------------------------------------
/metadata/lang_stats/old2new_stats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | import tldextract
6 |
7 | from cld2helper import read_cld2_languages
8 |
9 |
10 | def get_domain(netloc):
11 | extract = tldextract.extract(netloc)
12 | return ".".join((extract.domain, extract.suffix)).encode('idna')
13 |
14 |
15 | if __name__ == "__main__":
16 | import argparse
17 |
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument('infile', nargs='?', type=argparse.FileType('r'))
20 | args = parser.parse_args()
21 |
22 | name2code, code2name = read_cld2_languages(args.infile)
23 |
24 | for line in sys.stdin:
25 | domain, language, num_bytes = line.split()
26 | assert language in code2name
27 | language = code2name[language]
28 | # domain = get_domain(domain)
29 |
30 | sys.stdout.write("%s %s %d\n" % (domain, language, int(num_bytes)))
31 |
32 |
33 | # en.wikipedia.org xx-Kali 274
34 |
--------------------------------------------------------------------------------
/docaligner/page.py:
--------------------------------------------------------------------------------
1 | class Page(object):
2 |
3 | def __init__(self, url, html, text, mime_type,
4 | encoding, french, english, english_mt):
5 | self.url = url
6 | self.html = html
7 | self.text = text
8 | self.mime_type = mime_type
9 | self.encoding = encoding
10 | self.french = french
11 | self.english = english
12 | self.english_mt = english_mt
13 |
14 | def __str__(self):
15 | res = []
16 | res.append("--Page--")
17 | res.append("url : %s" % self.url)
18 | res.append("html : %s" % self.html)
19 | res.append("text : %s" % self.text.encode('utf-8'))
20 | res.append("mime_type : %s" % self.mime_type)
21 | res.append("encoding : %s" % self.encoding)
22 | res.append("french : %s" % self.french.encode('utf-8'))
23 | res.append("english : %s" % self.english.encode('utf-8'))
24 | res.append("english_mt : %s" % self.english_mt.encode('utf-8'))
25 | return "\n".join(res)
26 |
--------------------------------------------------------------------------------
/crawlertest/bitextor/map_urls.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | import argparse
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('mapping', help='mapping from filename to url',
9 | type=argparse.FileType('r'))
10 | args = parser.parse_args(sys.argv[1:])
11 |
12 | mapping = {}
13 | for line in args.mapping:
14 | filename, url = line.strip().split()
15 | assert filename not in mapping, "Repeated value: %s\n" % line
16 | mapping[filename] = url
17 |
18 | for line in sys.stdin:
19 | filesource, filetarget = line.strip().split()
20 | if filesource in mapping:
21 | if filetarget in mapping:
22 | print mapping[filesource] + "\t" + mapping[filetarget]
23 | else:
24 | sys.stderr.write(
25 | "Target file mapping not found:" + filetarget + "\n")
26 | else:
27 | sys.stderr.write(
28 | "Source file mapping not found:" + filesource + "\n")
29 |
--------------------------------------------------------------------------------
/crawlertest/filename2url.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | Extract original URL from httrack webdir
6 | """
7 |
8 | import sys
9 | import re
10 |
11 | # Example line we're looking for:
12 | #