&indices, char delim, Functor &callback) {
52 | const char *begin = str.data();
53 | const char *const end = str.data() + str.size();
54 | unsigned int index = 0;
55 | for (const FieldRange f : indices) {
56 | for (; index < f.begin; ++index) {
57 | begin = std::find(begin, end, delim) + 1;
58 | if (begin >= end) return;
59 | }
60 | if (f.end == FieldRange::kInfiniteEnd) {
61 | callback(util::StringPiece(begin, end - begin));
62 | return;
63 | }
64 | const char *old_begin = begin;
65 | for (; index < f.end; ++index) {
66 | const char *found = std::find(begin, end, delim);
67 | begin = found + 1;
68 | if (begin >= end) {
69 | callback(util::StringPiece(old_begin, end - old_begin));
70 | return;
71 | }
72 | }
73 | callback(util::StringPiece(old_begin, begin - old_begin - 1));
74 | }
75 | return;
76 | }
77 |
78 | // Callback that receives the parts of the input that relate to the key and chains them into one 64-bit hash.
79 | class HashCallback {
80 | public:
81 | explicit HashCallback(uint64_t seed = 47849374332489ULL) : hash_(seed) /* Be different from deduper */ {}
82 | // Hash one key piece; the running hash is passed as the third argument (presumably the seed), so successive pieces are chained.
83 | void operator()(util::StringPiece key) {
84 | hash_ = util::MurmurHashNative(key.data(), key.size(), hash_);
85 | }
86 | // Current value: the constructor seed if no piece was received, otherwise the chained result.
87 | uint64_t Hash() const { return hash_; }
88 |
89 | private:
90 | uint64_t hash_; // Running hash, initialised from the constructor seed.
91 | };
92 |
93 | } // namespace preprocess
94 |
--------------------------------------------------------------------------------
/preprocess/gigaword_extract.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #Extract sentences from gigaword but don't process them. Argument 1 is the language passed to the sentence splitter.
3 | set -e -o pipefail
4 | BINDIR="$(dirname "$0")"
5 | if [ ${#1} != 2 ]; then #The language must be exactly two characters long.
6 |   echo "Expected language on the command line." 1>&2
7 |   exit 1
8 | fi
9 | "$BINDIR"/gigaword_unwrap | "$BINDIR"/../moses/ems/support/split-sentences.perl -l "$1" |fgrep -v "<P>" #An empty pattern would match (and so remove) every line; filter only the <P> paragraph markers the splitter emits.
10 |
--------------------------------------------------------------------------------
/preprocess/heuristics.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #More preprocessing. This assumes that process_unicode is run with at minimum --flatten 1 --normalize 1 first.
3 | #Reads lines from STDIN and prints the cleaned lines to STDOUT. Option: -l LANGUAGE (default en).
4 | use strict;
5 | use utf8;
6 |
7 | binmode STDIN, ":utf8";
8 | binmode STDOUT, ":utf8";
9 | binmode STDERR, ":utf8";
10 |
11 | my $language = "en";
12 |
13 | while (@ARGV) {
14 |   $_ = shift;
15 |   /^-l$/ && ($language = shift, next);
16 | }
17 |
18 | while(my $eline = <STDIN>)
19 | {
20 |   chomp $eline;
21 |   $eline = " $eline ";
22 |
23 |   #Normalize long chains of underscores to just two.
24 |   $eline =~ s/_\s*_[\s_]*/ __ /g;
25 |
26 |   #Silja dropped * entirely. I keep one. Bullet points are converted to * by a Chris Dyer rule in process_unicode.
27 |   $eline =~ s/\*\s*\*[\s\*]*/ * /g;
28 |   #Silja, originally for prepgigaword-silja.pl
29 |   $eline =~ s/#+//g;
30 |   $eline =~ s/[\!]+/!/g;
31 |   $eline =~ s/!([^ ])/! $1/g;
32 |   $eline =~ s/\.([^\s\d.])/. $1/g;
33 |   $eline =~ s/\+(\D)/+ $1/g;
34 |   $eline =~ s/(\D)\+/$1 +/g;
35 |   $eline =~ s/,(\D)/, $1/g;
36 |   $eline =~ s/(\s)-([^\s\d\-])/$1- $2/g;
37 |   $eline =~ s/^ *-- *//g;
38 |   #The next rule was botching ellipses. . .
39 |   #$eline =~ s/\.\./ . /g;
40 |   #Language tests below use eq/ne. The numeric == and != turn both strings into 0, so every == test was true and the != test never was.
41 |   #Greg
42 |   #Gigaword apw does this.
43 |   $eline =~ s/ dlrs / \$ /g;
44 |   if ($language eq "fr") {
45 |     $eline =~ s/([^ -]+)-t-(je|j'|tu|il|elle|on|nous|vous|ils|elles|me|m'|te|t'|le|l'|la|les|lui|leur|moi|toi|eux|elles|ce|c'|ça|ceci|cela|qui|ci|là) /\1 -t-\2 /gi;
46 |     $eline =~ s/([^ -]+)-(je|j'|tu|il|elle|on|nous|vous|ils|elles|me|m'|te|t'|le|l'|la|les|lui|leur|moi|toi|eux|elles|ce|c'|ça|ceci|cela|qui|ci|là) /\1 -\2 /gi;
47 |     $eline =~ s/\s+(qu|c|d|l|j|s|n|m|lorsqu|puisqu)\s+'\s+/ \1' /gi;
48 |     $eline =~ s/\s+aujourd\s*'\s*hui\s+/ aujourd'hui /gi;
49 |   }
50 |
51 |   #Chris Dyer, t2.perl
52 |   if ($language eq "en") {
53 |     $eline =~ s/ élite / elite /gi;
54 |     $eline =~ s/ (s|at) & (t|p) / $1&$2 /ig;
55 |     $eline =~ s/ (full|half|part) - (time) / $1-$2 /ig;
56 |     $eline =~ s/ (vis|viz) - (.|..) - (vis|viz) / vis-à-vis /ig;
57 |     $eline =~ s/ (short|long|medium|one|half|two|on|off|in|post|ex|multi|de|mid|co|inter|intra|anti|re|pre|e|non|pro|self) - / $1- /ig;
58 |
59 |     #kheafiel
60 |     $eline =~ s/ (ca|are|do|could|did|does|do|had|has|have|is|must|need|should|was|were|wo|would)n 't / \1n't /gi;
61 |   }
62 |   $eline =~ s/ ([AaEe][Ll]) - / \1-/g;
63 |
64 |   if ($language ne "de") {
65 |     #Take out any "words" that are longer than 50 chars
66 |     $eline =~ s/\S{50,}/-/g;
67 |   }
68 |
69 |   $eline =~ s/\.\s*\.\s*\.\s*[\.\s]*/ ... /g;
70 |   $eline =~ s/!\s*![!\s]*/ ! /g;
71 |   $eline =~ s/\?\s*\?[\?\s]*/ ? /g;
72 |   $eline =~ s/ ' s / 's /g;
73 |   #cut multiple hyphens down to one and space separate it (single hyphens are not space separated)
74 |   $eline =~ s/([^-])--+([^-])/$1 - $2/g;
75 |
76 |   #Delete excess spaces:
77 |   $eline =~ s/\s+/ /g;
78 |   $eline =~ s/^\s+//;
79 |   $eline =~ s/\s+$//;
80 |
81 |   print "$eline\n";
82 | }
83 |
84 |
--------------------------------------------------------------------------------
/preprocess/idf_main.cc:
--------------------------------------------------------------------------------
1 | /* Computes inverse document frequency for each token seen in the input. A document is a line. */
2 | #include "util/file_piece.hh"
3 | #include "util/murmur_hash.hh"
4 | #include "util/pool.hh"
5 | #include "util/probing_hash_table.hh"
6 | #include "util/tokenize_piece.hh"
7 | #include "util/file_stream.hh"
8 |
9 | #include
10 | #include
11 |
12 | struct Entry { // One hash-table record per distinct token.
13 | typedef uint64_t Key;
14 | uint64_t hash; // Table key; main() sets it to the MurmurHashNative of the token bytes.
15 |
16 | uint64_t GetKey() const { return hash; }
17 | void SetKey(uint64_t to) { hash = to; }
18 |
19 | // Should be allocated from pool to ensure survival.
20 | util::StringPiece str;
21 |
22 | uint64_t document_count; // Number of input lines in which the token has been seen.
23 | };
24 |
25 | int main() { // Treats every input line as a document and writes "token idf" for each distinct token.
26 | uint64_t documents = 0;
27 | util::Pool strings;
28 | util::AutoProbing words; // NOTE(review): template arguments here and on the casts below appear lost in extraction - confirm against the real file.
29 | Entry ent;
30 | ent.document_count = 1; // A token inserted for the first time has been seen in exactly one line.
31 | for (util::StringPiece line : util::FilePiece(0)) {
32 | ++documents;
33 | std::unordered_set seen_in_line; // Hashes already handled for this line, so a repeated token counts once per line.
34 | for (util::TokenIter it(line, util::kSpaces); it; ++it) {
35 | ent.hash = util::MurmurHashNative(it->data(), it->size());
36 | if (seen_in_line.insert(ent.hash).second) {
37 | // Newly seen in this line.
38 | util::AutoProbing::MutableIterator words_it;
39 | if (words.FindOrInsert(ent, words_it)) {
40 | ++(words_it->document_count); // Token was already in the table: one more line contains it.
41 | } else {
42 | char *data = static_cast(strings.Allocate(it->size())); // New token: keep a copy of its bytes in the pool.
43 | memcpy(data, it->data(), it->size());
44 | words_it->str = util::StringPiece(data, it->size());
45 | }
46 | }
47 | }
48 | }
49 | double documents_log = std::log(static_cast(documents));
50 | util::FileStream out(1);
51 | for (util::AutoProbing::ConstIterator i = words.RawBegin(); i != words.RawEnd(); ++i) {
52 | if (i->GetKey()) { // Presumably a zero key marks an unused slot - TODO confirm.
53 | double count = static_cast(i->document_count);
54 | double idf = documents_log - std::log(count); // idf = log(documents / document_count)
55 | out << i->str << ' ' << idf << '\n';
56 | }
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/preprocess/mmhsum_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/murmur_hash.hh"
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | int main(int argc, char *argv[]) { // Prints, in hex, a MurmurHashNative of stdin computed chunk by chunk; takes no arguments.
9 | if (argc > 1) {
10 | std::cerr << "Usage: [stdin] " << argv[0] << std::endl;
11 | return 1;
12 | }
13 |
14 | constexpr size_t bufferSize = 1024*1024; // Chunk size: 1 MiB.
15 | std::vector buffer(bufferSize); // NOTE(review): the element type appears lost in extraction; read() below needs a char buffer.
16 | uint64_t chained_hash = 0;
17 |
18 | while (std::cin)
19 | {
20 | std::cin.read(&buffer[0], bufferSize);
21 | if(std::cin.bad()){
22 | std::cerr << "Error trying to read from stdin\n";
23 | return 1;
24 | }
25 | size_t count = std::cin.gcount(); // Bytes actually read by the last read() call.
26 | if (!count)
27 | break;
28 | chained_hash = util::MurmurHashNative(&buffer[0], count, chained_hash); // The previous result is passed as the third argument, chaining the chunks.
29 | }
30 | std::cout << std::hex << chained_hash << '\n';
31 | }
32 |
--------------------------------------------------------------------------------
/preprocess/order_independent_hash_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/murmur_hash.hh"
2 | #include "util/file_piece.hh"
3 |
4 | int main() { // Prints the sum of the MurmurHash64A values of all lines read from file descriptor 0.
5 |   util::FilePiece input(0);
6 |   uint64_t total = 0;
7 |   for (util::StringPiece row : input) total += util::MurmurHash64A(row.data(), row.size());
8 |   // Unsigned addition is commutative, so the total does not depend on line order.
9 |   std::cout << total << std::endl;
10 | }
11 |
--------------------------------------------------------------------------------
/preprocess/parallel.hh:
--------------------------------------------------------------------------------
1 | #ifndef PREPROCESS_PARALLEL__
2 | #define PREPROCESS_PARALLEL__
3 |
4 | #include "util/file_stream.hh"
5 | #include "util/file_piece.hh"
6 |
7 | #include
8 | #include
9 | #include
10 |
11 | #include
12 |
13 | namespace preprocess {
14 | // Runs Pass as a line filter. No files: stdin to stdout. Four files (in0 in1 out0 out1): a line pair is kept only if both sides pass. Returns 0 on success, 1 on bad usage, 2 when the two inputs have different line counts.
15 | template <class Pass, class... PassArguments> int FilterParallel(const std::vector<std::string> &files, PassArguments&&... pass_construct) {
16 |   uint64_t input = 0, output = 0;
17 |   if (files.empty()) {
18 |     Pass pass(std::forward<PassArguments>(pass_construct)...);
19 |     util::StringPiece line;
20 |     util::FilePiece in(0, NULL, &std::cerr);
21 |     util::FileStream out(1);
22 |     while (true) {
23 |       try {
24 |         line = in.ReadLine();
25 |       } catch (const util::EndOfFileException &e) { break; }
26 |       ++input;
27 |       if (pass(line)) {
28 |         out << line << '\n';
29 |         ++output;
30 |       }
31 |     }
32 |   } else if (files.size() == 4) {
33 |     Pass pass0(std::forward<PassArguments>(pass_construct)...), pass1(std::forward<PassArguments>(pass_construct)...); // NOTE(review): the same arguments are forwarded twice; an rvalue argument may already be moved-from when pass1 is built.
34 |     util::StringPiece line0, line1;
35 |     util::FilePiece in0(files[0].c_str(), &std::cerr), in1(files[1].c_str());
36 |     util::FileStream out0(util::CreateOrThrow(files[2].c_str())), out1(util::CreateOrThrow(files[3].c_str()));
37 |     while (true) {
38 |       try { line0 = in0.ReadLine(); } catch (const util::EndOfFileException &e) { break; }
39 |       try { line1 = in1.ReadLine(); } catch (const util::EndOfFileException &e) { // Second input ran out first: report it like the opposite imbalance instead of letting the exception escape.
40 |         std::cerr << "Input is not balanced: " << files[1] << " ended before " << files[0] << std::endl;
41 |         return 2; }
42 |       ++input;
43 |       if (pass0(line0) && pass1(line1)) {
44 |         out0 << line0 << '\n';
45 |         out1 << line1 << '\n';
46 |         ++output;
47 |       }
48 |     }
49 |     try { // The first input is exhausted; any further line in the second one means the files are not parallel.
50 |       line1 = in1.ReadLine();
51 |       std::cerr << "Input is not balanced: " << files[1] << " has " << line1 << std::endl;
52 |       return 2;
53 |     } catch (const util::EndOfFileException &e) {}
54 |   } else {
55 |     std::cerr <<
56 |       "To filter from stdin to stdout, run without an argument.\n"
57 |       "To filter parallel files, run in0 in1 out0 out1\n";
58 |     return 1;
59 |   }
60 |   std::cerr << "Kept " << output << " / " << input << " = " << (static_cast<double>(output) / static_cast<double>(input)) << std::endl; // NOTE(review): the cast type was lost in extraction; double is assumed.
61 |   return 0;
62 | }
63 |
64 | } // namespace preprocess
65 | #endif
66 |
--------------------------------------------------------------------------------
/preprocess/process_unicode_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/utf8.hh"
2 | #include "util/utf8_icu.hh"
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 |
11 | #include
12 | #include
13 | #include
14 |
15 | using U_ICU_NAMESPACE::UnicodeString;
16 |
17 | namespace {
18 | struct Options { // Command-line settings filled in by ParseArgs.
19 | std::string language;
20 | bool lower;
21 | bool flatten;
22 | bool normalize;
23 | };
24 | void ParseArgs(int argc, char *argv[], Options &out) { // Parses argv into `out`; the declared defaults are language "en" and all three switches off.
25 | namespace po = boost::program_options;
26 | po::options_description desc("Unicode treatment options");
27 | desc.add_options()
28 | ("language,l", po::value(&out.language)->default_value("en"), "Language (only applies to flatten)")
29 | ("lower", po::bool_switch(&out.lower)->default_value(false), "Convert to lowercase")
30 | ("flatten", po::bool_switch(&out.flatten)->default_value(false), "Canonicalize some characters for English")
31 | ("normalize", po::bool_switch(&out.normalize)->default_value(false), "Normalize Unicode format");
32 | po::variables_map vm;
33 | po::store(po::parse_command_line(argc, argv, desc), vm);
34 | po::notify(vm); // notify() is the step that writes the parsed values into the bound members of `out`.
35 | }
36 | } // namespace
37 | // Reads lines from stdin as UTF-8, applies the requested lower, flatten and normalize steps in that order, and prints each result.
38 | int main(int argc, char *argv[]) {
39 |   Options opt;
40 |   ParseArgs(argc, argv, opt);
41 |   util::Flatten flatten(opt.language);
42 |   std::string line;
43 |   UnicodeString str[2];
44 |   // cur holds the current text and tmp is scratch; each two-argument step writes into tmp and the pointers are swapped.
45 |   UnicodeString *cur = &str[0], *tmp = &str[1];
46 |   while (getline(std::cin, line)) {
47 |     *cur = UnicodeString::fromUTF8(line);
48 |     if (opt.lower) cur->toLower();
49 |     if (opt.flatten) {
50 |       flatten.Apply(*cur, *tmp);
51 |       std::swap(cur, tmp);
52 |     }
53 |     if (opt.normalize) {
54 |       util::Normalize(*cur, *tmp);
55 |       std::swap(cur, tmp);
56 |     }
57 |     // Print through cur, not str[0]: after an odd number of swaps the result lives in str[1].
58 |     std::cout << *cur << '\n';
59 |   }
60 | }
61 |
--------------------------------------------------------------------------------
/preprocess/remove_invalid_utf8_base64_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/file_stream.hh"
2 | #include "util/file_piece.hh"
3 | #include "util/utf8.hh"
4 |
5 | #include "base64.hh"
6 |
7 | int main() { // Echoes each base64 line whose decoded bytes pass util::IsUTF8; any other line is replaced by the base64 encoding of "".
8 |   util::FilePiece input(0);
9 |   util::FileStream output(1);
10 |   std::string payload;
11 |   std::string blank_record;
12 |   preprocess::base64_encode("", blank_record);
13 |   for (util::StringPiece record; input.ReadLineOrEOF(record); ) {
14 |     preprocess::base64_decode(record, payload);
15 |     // Exactly one line is written per line read, so the line count is preserved.
16 |     if (!util::IsUTF8(payload)) {
17 |       output << blank_record << '\n';
18 |     } else {
19 |       output << record << '\n';
20 |     }
21 |   }
22 | }
23 |
--------------------------------------------------------------------------------
/preprocess/remove_invalid_utf8_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/file_stream.hh"
2 | #include "util/file_piece.hh"
3 | #include "util/utf8.hh"
4 |
5 | int main() { // Copies stdin to stdout, dropping every line that fails util::IsUTF8.
6 |   util::FilePiece input(0);
7 |   util::FileStream output(1);
8 |   util::StringPiece current;
9 |   while (input.ReadLineOrEOF(current)) {
10 |     // Guard clause: rejected lines are skipped and leave no trace in the output.
11 |     if (!util::IsUTF8(current)) continue;
12 |     output << current << '\n';
13 |   }
14 | }
15 |
--------------------------------------------------------------------------------
/preprocess/remove_long_lines_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/file_stream.hh"
2 | #include "util/file_piece.hh"
3 |
4 | #include
5 | #include
6 |
7 | #include
8 |
9 | int main(int argc, char *argv[]) { // Copies stdin to stdout, dropping lines longer than the limit (2000 bytes unless given as the only argument).
10 | std::size_t limit;
11 | if (argc == 1) {
12 | limit = 2000;
13 | } else if (argc == 2) {
14 | limit = boost::lexical_cast(argv[1]); // NOTE(review): the target type of the cast appears lost in extraction.
15 | } else {
16 | std::cerr << "Usage: " << argv[0] << " [length limit in bytes]" << std::endl;
17 | return 1;
18 | }
19 | util::FilePiece f(0, NULL, &std::cerr);
20 | util::FileStream out(1);
21 | try {
22 | while (true) {
23 | util::StringPiece l = f.ReadLine();
24 | if (l.size() <= limit) { // The limit is inclusive.
25 | out << l << '\n';
26 | }
27 | }
28 | } catch (const util::EndOfFileException &e) {} // The end-of-file exception is what terminates the loop.
29 | }
30 |
--------------------------------------------------------------------------------
/preprocess/resplit.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -o pipefail
3 | BINDIR="$(dirname "$0")"
4 | #Argument 1 is language. Runs stdin through the sentence splitter again and writes the sentences to stdout.
5 | l="$1"
6 | if [ ${#l} == 0 ]; then
7 |   echo "Argument is language" 1>&2
8 |   exit 1
9 | fi
10 | sed 's/^/<P>\n/' | "$BINDIR"/../moses/ems/support/split-sentences.perl -l "$l" |fgrep -vx "<P>" #A <P> marker line is put before every input line (presumably so the splitter keeps lines apart - confirm); lines that are exactly <P> are removed again afterwards.
11 |
--------------------------------------------------------------------------------
/preprocess/shard_main.cc:
--------------------------------------------------------------------------------
1 | #include "preprocess/fields.hh"
2 | #include "util/buffered_stream.hh"
3 | #include "util/threaded_buffered_stream.hh"
4 | #include "util/file_piece.hh"
5 | #include "util/fixed_array.hh"
6 | #include "util/murmur_hash.hh"
7 |
8 | #include
9 | #include
10 |
11 | #include
12 | #include
13 |
14 | namespace preprocess {
15 | // Settings for the shard tool, filled in by ParseArgs.
16 | struct Options {
17 |   std::vector<FieldRange> key_fields;
18 |   char delim;
19 |   std::vector<std::string> outputs;
20 |   util::WriteCompressed::Compression compression;
21 | };
22 | // Parses the command line into `out`. Prints usage and exits with status 1 when run without arguments or with --help; reports inconsistent output options or an unknown compression name through the UTIL_THROW macros.
23 | void ParseArgs(int argc, char *argv[], Options &out) {
24 |   namespace po = boost::program_options;
25 |   po::options_description desc("Arguments");
26 |   std::string fields;
27 |   std::string prefix;
28 |   std::string compression_string;
29 |   unsigned int number;
30 |
31 |   desc.add_options()
32 |     ("help,h", po::bool_switch(), "Show this help message")
33 |     ("fields,f", po::value(&fields)->default_value("1-"), "Fields to use for key like cut -f")
34 |     ("delim,d", po::value(&out.delim)->default_value('\t'), "Field delimiter")
35 |     ("prefix,p", po::value(&prefix), "Prefix and count of outputs")
36 |     ("number,n", po::value(&number), "Number of shards")
37 |     ("output,o", po::value(&out.outputs)->multitoken(), "Output file names (or just list them without -o)")
38 |     ("compress,c", po::value(&compression_string)->default_value("none"), "Compression. One of none, gzip, or bzip2");
39 |
40 |   po::positional_options_description pd;
41 |   pd.add("output", -1);
42 |
43 |   po::variables_map vm;
44 |   po::store(po::command_line_parser(argc, argv).options(desc).positional(pd).run(), vm);
45 |   if (argc == 1 || vm["help"].as<bool>()) {
46 |     std::cerr <<
47 |       "Shards stdin into multiple files by the hash of the key.\n" <<
48 |       "Output is specified as --prefix prefix --number n or just listing file names.\n" <<
49 |       desc <<
50 |       "Examples:\n" <<
51 |       argv[0] << " a b #Shards stdin to files a and b using the whole line as key.\n" <<
52 |       argv[0] << " a b c #Shards stdin to files a, b, and c using the whole line as key.\n" <<
53 |       argv[0] << " -f 1 a b #Shards stdin to files a and b using tab-delimited field 1.\n" <<
54 |       argv[0] << " -d ' ' -f 1 a b #Shards stdin to files a and b using space-delimited field 1." << std::endl;
55 |     exit(1);
56 |   }
57 |   po::notify(vm);
58 |
59 |   ParseFields(fields.c_str(), out.key_fields);
60 |   DefragmentFields(out.key_fields);
61 |
62 |   if (out.outputs.empty()) {
63 |     UTIL_THROW_IF2(!vm.count("prefix"), "Specify outputs using --output or e.g. --prefix pre --number 2");
64 |     UTIL_THROW_IF2(!vm.count("number"), "--prefix specified but we need to know how many shards with -n");
65 |     UTIL_THROW_IF2(!number, "--number must be at least 1"); // Zero would wrap number - 1 below and leave no outputs, so main would take a hash modulo 0.
66 |     unsigned int digits = 0; // How many digits will be in the 0-indexed representation?
67 |     for (unsigned int compare = number - 1; compare; ++digits, compare /= 10) {}
68 |     std::ostringstream stream;
69 |     stream << std::setfill('0') << std::setw(digits);
70 |     for (unsigned int i = 0; i < number; ++i) {
71 |       stream << std::setw(digits) << i;
72 |       out.outputs.push_back(prefix + stream.str());
73 |       stream.str(std::string());
74 |       stream.clear();
75 |     }
76 |   } else {
77 |     UTIL_THROW_IF2(vm.count("prefix"), "Specify --prefix or --output");
78 |     UTIL_THROW_IF2(vm.count("number") && number != out.outputs.size(), "Number of outputs does not match");
79 |   }
80 |   if (compression_string == "none") {
81 |     out.compression = util::WriteCompressed::NONE;
82 |   } else if (compression_string == "gzip") {
83 |     out.compression = util::WriteCompressed::GZIP;
84 |   } else if (compression_string == "bzip2") {
85 |     out.compression = util::WriteCompressed::BZIP;
86 |   } else {
87 |     UTIL_THROW(util::Exception, "Unknown compression algorithm " << compression_string);
88 |   }
89 | }
90 |
91 | } // namespace preprocess
92 |
93 | int main(int argc, char *argv[]) { // Writes each stdin line to the output selected by the hash of its key fields.
94 | preprocess::Options options;
95 | preprocess::ParseArgs(argc, argv, options);
96 | uint64_t shard_count = options.outputs.size();
97 |
98 | util::FilePiece in(0);
99 | util::StringPiece line;
100 | util::FixedArray > out(options.outputs.size()); // NOTE(review): the element type appears lost in extraction - confirm against the real file.
101 | std::string output(argv[1]); // NOTE(review): never read below; looks like a leftover.
102 | for (const std::string &o : options.outputs) {
103 | out.push_back(util::CreateOrThrow(o.c_str()), options.compression);
104 | }
105 | while (in.ReadLineOrEOF(line)) {
106 | preprocess::HashCallback cb; // Fresh callback per line, so every line starts from the same default seed.
107 | preprocess::RangeFields(line, options.key_fields, options.delim, cb);
108 | out[cb.Hash() % shard_count] << line << '\n'; // The whole line is written, not just the key.
109 | }
110 | return 0;
111 | }
112 |
--------------------------------------------------------------------------------
/preprocess/substitute_main.cc:
--------------------------------------------------------------------------------
1 | #include "preprocess/fields.hh"
2 | #include "util/file_stream.hh"
3 | #include "util/file_piece.hh"
4 | #include "util/murmur_hash.hh"
5 | #include "util/pool.hh"
6 | #include "util/probing_hash_table.hh"
7 | #include
8 |
9 | struct Entry { // Hash-table record: a 64-bit key plus the value stored for it.
10 | typedef uint64_t Key;
11 | Key key; // main() sets this to the MurmurHashNative of the sentence fields.
12 | uint64_t GetKey() const { return key; }
13 | void SetKey(uint64_t to) { key = to; }
14 | util::StringPiece value; // main() points this at a copy held in its string pool.
15 | };
16 |
17 | class RecordCallback { // Callback that stores each piece it receives into consecutive slots of a caller-provided array.
18 | public:
19 | RecordCallback(util::StringPiece *to) : i_(to) {}
20 |
21 | void operator()(util::StringPiece str) {
22 | *(i_++) = str; // No bounds check: the caller must provide enough slots.
23 | }
24 |
25 | const util::StringPiece *Position() const { return i_; } // One past the last slot written so far.
26 |
27 | private:
28 | util::StringPiece *i_;
29 | };
30 |
31 | int main() { // For tab-separated lines: fields 2-3 (0-based) form the key, and field 4 is replaced by the value first seen for that key.
32 | std::vector fields; // NOTE(review): the element type appears lost in extraction (presumably preprocess::FieldRange).
33 | fields.resize(4);
34 | util::StringPiece segments[4];
35 | fields[0].begin = 0;
36 | fields[0].end = 2;
37 | util::StringPiece &sentences = segments[1];
38 | fields[1].begin = 2;
39 | fields[1].end = 4;
40 | util::StringPiece &value = segments[2];
41 | fields[2].begin = 4;
42 | fields[2].end = 5;
43 | util::StringPiece &after = segments[3];
44 | fields[3].begin = 5;
45 | fields[3].end = preprocess::FieldRange::kInfiniteEnd;
46 |
47 | util::Pool string_pool;
48 | util::FileStream out(1);
49 |
50 | typedef util::AutoProbing Table; // NOTE(review): template arguments appear lost in extraction.
51 | Table table;
52 | for (util::StringPiece line : util::FilePiece(0)) {
53 | RecordCallback cb(segments);
54 | preprocess::RangeFields(line, fields, '\t', cb);
55 | UTIL_THROW_IF2(cb.Position() != segments + 4, "Did not get all fields in line " << line); // All four ranges must have been reported.
56 | Entry entry;
57 | entry.key = util::MurmurHashNative(sentences.data(), sentences.size());
58 | Table::MutableIterator it;
59 | if (table.FindOrInsert(entry, it)) {
60 | out << util::StringPiece(line.data(), sentences.data() + sentences.size() - line.data()); // Key seen before: copy the line up to the end of the sentence fields...
61 | out << '\t' << it->value << '\t'; // ...then the stored value in place of this line's own...
62 | out << after; // ...then the remaining fields.
63 | } else {
64 | char *mem = static_cast(memcpy(string_pool.Allocate(value.size()), value.data(), value.size())); // First occurrence: keep a pooled copy of the value.
65 | it->value = util::StringPiece(mem, value.size());
66 | out << line; // The first occurrence is passed through unchanged.
67 | }
68 | out << '\n';
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/preprocess/subtract_lines_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/file_piece.hh"
2 | #include "util/file_stream.hh"
3 | #include "util/murmur_hash.hh"
4 | #include "util/probing_hash_table.hh"
5 |
6 | #include
7 |
8 | struct Entry { // Hash-table record holding only a 64-bit key; main() stores the hash of a line in it.
9 | typedef uint64_t Key;
10 | uint64_t key;
11 | uint64_t GetKey() const { return key; }
12 | void SetKey(uint64_t to) { key = to; }
13 | };
14 |
15 | int main(int argc, char *argv[]) { // Copies stdin to stdout, skipping lines whose hash matches a line of the file named by argv[1].
16 | if (argc != 2) {
17 | std::cerr << "Usage: " << argv[0] << " subtract output\n"
18 | "Copies from stdin to stdout, skipping lines that appear in `subtract`.\n"
19 | "The subtraction is approximate, based on the hash of the line.\n"
20 | "This is set subtraction. All copies of a line are removed.\n";
21 | return 1;
22 | }
23 | util::AutoProbing table; // NOTE(review): template arguments appear lost in extraction.
24 | // Load subtraction into table.
25 | for (util::StringPiece line : util::FilePiece(argv[1])) {
26 | Entry entry;
27 | entry.key = util::MurmurHashNative(line.data(), line.size(), 1);
28 | util::AutoProbing::MutableIterator it;
29 | table.FindOrInsert(entry, it); // The result is ignored: duplicates in the subtract file are harmless.
30 | }
31 | util::FileStream out(1);
32 | for (util::StringPiece line : util::FilePiece(0)) {
33 | uint64_t key = util::MurmurHashNative(line.data(), line.size(), 1); // Same third argument (1) as when loading, so equal lines hash equally.
34 | util::AutoProbing::ConstIterator it;
35 | if (!table.Find(key, it)) {
36 | out << line << '\n';
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/preprocess/tests/cache/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | . "$(dirname "$0")"/../vars
3 | diff <("$BIN"/cache cat <"$CUR"/input) "$CUR"/input #With default options, cache wrapped around cat must reproduce the input.
4 | diff <("$BIN"/cache -t " " -k 1 cat <"$CUR"/input) "$CUR"/space_expected #Presumably space-delimited with field 1 as key; compared against the stored expectation.
5 |
6 |
--------------------------------------------------------------------------------
/preprocess/tests/cache/space_ref.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | lines = {}
4 | for l in sys.stdin:
5 | key = l[0:-1].split(' ')[0]
6 | if key in lines:
7 | sys.stdout.write(lines[key])
8 | else:
9 | lines[key] = l
10 | sys.stdout.write(l)
11 |
--------------------------------------------------------------------------------
/preprocess/tests/dedupe/columns:
--------------------------------------------------------------------------------
1 | 1 a
2 | 2 a
3 | 3 b
4 | 4 a
5 | 5 a
6 | 6 b
7 | 7 b
8 |
--------------------------------------------------------------------------------
/preprocess/tests/dedupe/columns.out:
--------------------------------------------------------------------------------
1 | 1 a
2 | 3 b
3 |
--------------------------------------------------------------------------------
/preprocess/tests/dedupe/ref.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | lines = set()
4 | for l in sys.stdin:
5 | if l not in lines:
6 | lines.add(l)
7 | sys.stdout.write(l)
8 |
--------------------------------------------------------------------------------
/preprocess/tests/dedupe/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | . "$(dirname "$0")"/../vars
3 | diff <("$BIN/dedupe" <"$CUR/input") "$CUR/expected" #stdin to stdout mode
4 | "$BIN"/dedupe "$CUR"/input <(rev "$CUR"/input) "$TMP"/output0 "$TMP"/output1 #Four-file mode; the second input is the first with every line reversed.
5 | diff "$CUR"/expected "$TMP"/output0
6 | diff <(rev "$CUR"/expected) "$TMP"/output1
7 | rm "$TMP"/output0 "$TMP"/output1
8 | diff <("$BIN"/dedupe -f 2 -d " " <"$CUR"/columns) "$CUR"/columns.out #Key on space-delimited field 2 only.
9 |
--------------------------------------------------------------------------------
/preprocess/tests/foldfilter/input:
--------------------------------------------------------------------------------
1 | ../../../COPYING
--------------------------------------------------------------------------------
/preprocess/tests/foldfilter/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | . "$(dirname "$0")"/../vars
3 | #GPL has short columns
4 | diff <("$BIN/foldfilter" cat <"$CUR"/input) "$CUR"/input
5 | diff <("$BIN/foldfilter" -w 10 cat <"$CUR"/input) "$CUR"/input #Even with -w 10 the final output must equal the input.
6 | "$BIN/foldfilter" -w 10 tee "$TMP/fold10" <"$CUR"/input >/dev/null #tee records what the wrapped command receives.
7 | # Line breaks are not great with leading space but it does work
8 | diff "$TMP/fold10" "$CUR/fold10.expected"
9 | rm "$TMP/fold10"
10 |
--------------------------------------------------------------------------------
/preprocess/tests/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #Runs run.sh in every test directory, reports each failure on stderr, and exits non-zero if any test failed.
3 | CURRENT="$(dirname "$0")"
4 | set -eo pipefail
5 | status=0
6 | for i in "$CURRENT"/*/; do
7 |   "${i}"run.sh || { echo "FAILURE: ${i}" 1>&2; status=1; } #Keep going so that all failures are listed.
8 | done
9 | exit $status
10 |
--------------------------------------------------------------------------------
/preprocess/tests/shard/input:
--------------------------------------------------------------------------------
1 | ../../../README.md
--------------------------------------------------------------------------------
/preprocess/tests/shard/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | . "$(dirname "$0")"/../vars
3 | "$BIN/shard" "$TMP"/test_a "$TMP"/test_b <"$CUR"/input
4 | diff <(sort "$TMP"/test_a "$TMP"/test_b) <(sort "$CUR"/input) #The shards together must hold exactly the input lines; order is ignored.
5 | "$BIN/shard" --prefix "$TMP"/test --number 4 <"$CUR"/input
6 | diff <(sort "$TMP"/test{0,1,2,3}) <(sort "$CUR"/input)
7 | "$BIN/shard" --prefix "$TMP"/test -c gzip --number 4 <"$CUR"/input
8 | diff <(zcat "$TMP"/test{0,1,2,3} |sort) <(sort "$CUR"/input) #Same check after decompressing the gzip shards.
9 | "$BIN/shard" --prefix "$TMP"/test -c bzip2 --number 4 <"$CUR"/input
10 | diff <(bzcat "$TMP"/test{0,1,2,3} |sort) <(sort "$CUR"/input)
11 | rm "$TMP"/test_a "$TMP"/test_b "$TMP"/test{0,1,2,3}
12 |
--------------------------------------------------------------------------------
/preprocess/tests/vars:
--------------------------------------------------------------------------------
1 | set -eo pipefail
2 | CUR="$(dirname "$0")" #Directory of the script named by $0; the run.sh scripts source this file.
3 | BIN="${BIN:-"$CUR"/../../../build/bin}" #Location of the binaries under test; an existing BIN value takes precedence.
4 | TMP="$CUR" #Scratch files go into the test's own directory.
5 |
--------------------------------------------------------------------------------
/preprocess/text.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -o pipefail
3 | BINDIR="$(dirname "$0")"
4 | #Argument 1 is language, argument 2 is lowercase (1) or not (0)
5 | l="$1"
6 | if [ ${#l} != 2 ]; then #The language must be exactly two characters long.
7 | echo "Usage: \"$0 language lowercase\" where lowercase is 0 or 1." 1>&2
8 | exit 1
9 | fi
10 | if [ "$2" != 1 ] && [ "$2" != 0 ]; then
11 | echo "Second argument (lowercase) should be 0 or 1" 1>&2
12 | exit 1
13 | fi
14 | #The trailing if picks the last pipeline stage: the second process_unicode (--lower) only runs when lowercasing.
15 | "$BINDIR"/process_unicode --language $l --flatten --normalize |"$BINDIR"/../moses/tokenizer/tokenizer.perl -l $l | "$BINDIR"/heuristics.perl -l $l | if [ "$2" == 1 ]; then
16 | "$BINDIR"/../moses/tokenizer/normalize-punctuation.perl $l | "$BINDIR"/process_unicode --language $l --lower
17 | else
18 | "$BINDIR"/../moses/tokenizer/normalize-punctuation.perl $l
19 | fi
20 |
--------------------------------------------------------------------------------
/preprocess/train_case_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/file_stream.hh"
2 | #include "util/file_piece.hh"
3 | #include "util/murmur_hash.hh"
4 | #include "util/mutable_vocab.hh"
5 | #include "util/tokenize_piece.hh"
6 | #include "util/utf8.hh"
7 | #include "util/utf8_icu.hh"
8 |
9 | #include
10 |
11 | #include
12 |
13 | namespace {
14 | void SplitLine(util::FilePiece &from, std::vector &to) { // Replaces the contents of `to` with the space-separated tokens of the next line of `from`. NOTE(review): template arguments appear lost in extraction.
15 | to.clear();
16 | for (util::TokenIter i(from.ReadLine(), ' '); i; ++i) {
17 | to.push_back(*i);
18 | }
19 | }
20 |
21 | class Recorder { // Counts how often each cased target word occurs for a (cased source, lowercased target) pair.
22 | public:
23 | void Add(util::StringPiece source, util::StringPiece target) {
24 | util::ToLower(target, lowered_);
25 | uint64_t key = util::MurmurHash64A(lowered_.data(), lowered_.size(), util::MurmurHash64A(source.data(), source.size())); // Hash of the lowercased target, with the hash of the source as third argument.
26 | ++map_[key][vocab_.FindOrInsert(target)];
27 | }
28 |
29 | void Dump() { // Writes one line per key to fd 1: the key, then a tab-separated list of "cased_target count".
30 | util::FileStream out(1);
31 | for (Map::const_iterator i = map_.begin(); i != map_.end(); ++i) {
32 | out << boost::lexical_cast(i->first);
33 | for (std::unordered_map::const_iterator j = i->second.begin(); j != i->second.end(); ++j) {
34 | out << '\t' << vocab_.String(j->first) << ' ' << j->second;
35 | }
36 | out << '\n';
37 | }
38 | }
39 |
40 | private:
41 | util::MutableVocab vocab_;
42 |
43 | std::string lowered_; // Scratch buffer reused by every Add call.
44 |
45 | // map_[hash(lowered_target, hash(cased_source))][cased_target] = count(cased_source, cased_target)
46 | typedef std::unordered_map > Map;
47 | Map map_;
48 | };
49 |
50 | } // namespace
51 |
52 | int main(int argc, char *argv[]) { // Reads an alignment file next to its source and target text, records aligned word pairs, and dumps the counts.
53 | if (argc != 4) {
54 | std::cerr << "Usage: " << argv[0] << " alignment source target\n";
55 | return 1;
56 | }
57 | util::FilePiece align(argv[1], &std::cerr), source_file(argv[2]), target_file(argv[3]);
58 | std::vector source_words, target_words;
59 | Recorder recorder;
60 | std::size_t sentence = 0, discarded = 0;
61 | for (; ; ++sentence) {
62 | try {
63 | SplitLine(source_file, source_words);
64 | } catch (const util::EndOfFileException &e) { break; } // End of the source file ends the run.
65 | SplitLine(target_file, target_words);
66 | // parse comment line
67 | // "# sentence pair (0) source length"
68 | for (unsigned int i = 0; i < 6; ++i) {
69 | align.ReadDelimited();
70 | }
71 | unsigned long from_length = align.ReadULong();
72 | align.ReadDelimited(); align.ReadDelimited(); // target length
73 | unsigned long to_length = align.ReadULong();
74 | align.ReadLine(); // comment line ending
75 |
76 | align.ReadLine(); // uncased sentence
77 | util::StringPiece word(align.ReadDelimited());
78 | UTIL_THROW_IF2("NULL" != word, "Expected NULL at the beginning, not " << word);
79 |
80 | if (from_length != source_words.size() || to_length != target_words.size()) { // Lengths disagree with the text files: skip this sentence.
81 | align.ReadLine(); // Complete line.
82 | ++discarded;
83 | continue;
84 | }
85 |
86 | while ("})" != align.ReadDelimited()) {} // Skip the alignment set that follows NULL.
87 | for (unsigned long from = 0; align.ReadWordSameLine(word); ++from) {
88 | align.ReadWordSameLine(word);
89 | UTIL_THROW_IF2(word != "({", "Expected ({ not " << word);
90 | UTIL_THROW_IF2(from >= source_words.size(), "Index " << from << " too high for source text at sentence " << sentence);
91 | for (align.SkipSpaces(); align.peek() != '}'; align.SkipSpaces()) {
92 | unsigned long to = align.ReadULong() - 1 /* NULL word */;
93 | UTIL_THROW_IF2(to >= target_words.size(), "Index " << to << " too high for target text");
94 | // Throw out beginning of sentence.
95 | if (from != 0 && to != 0) {
96 | recorder.Add(source_words[from], target_words[to]);
97 | }
98 | }
99 | UTIL_THROW_IF2(align.ReadDelimited() != "})", "Expected })");
100 | }
101 | align.ReadLine(); // Complete line.
102 | }
103 | std::cerr << "Discarded " << discarded << "/" << sentence << std::endl;
104 | recorder.Dump();
105 | }
106 |
--------------------------------------------------------------------------------
/preprocess/unescape_html.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | binmode(STDIN, ":utf8");
3 | binmode(STDOUT, ":utf8");
4 |
5 | use HTML::Entities;
6 | use utf8;
7 |
8 | while() {
9 | $str = decode_entities($_);
10 | $str =~ s// /g;
11 | print $str;
12 | }
13 |
--------------------------------------------------------------------------------
/preprocess/vocab_main.cc:
--------------------------------------------------------------------------------
1 | #include "util/file_piece.hh"
2 | #include "util/file_stream.hh"
3 | #include "util/murmur_hash.hh"
4 | #include "util/probing_hash_table.hh"
5 |
6 | #include
7 |
8 | #include
9 |
10 | #include
11 |
12 | struct Entry { // Hash table entry that stores nothing but a 64-bit key.
13 | typedef uint64_t Key;
14 | uint64_t key;
15 | uint64_t GetKey() const { return key; }
16 | void SetKey(uint64_t to) { key = to; }
17 | };
18 |
19 |
20 | int main() { // Reads delimited words from file descriptor 0 and writes each distinct one, NUL-terminated, to file descriptor 1.
21 | bool delimiters[256];
22 | memset(delimiters, 0, sizeof(delimiters));
23 | delimiters['\0'] = true;
24 | delimiters['\t'] = true;
25 | delimiters['\r'] = true;
26 | delimiters['\n'] = true;
27 | delimiters[' '] = true;
28 |
29 | util::AutoProbing seen; // Holds only hashes (see Entry), so two words with equal hashes count as one.
30 |
31 | util::FilePiece in(0, "stdin", &std::cerr);
32 | util::FileStream out(1);
33 |
34 | util::AutoProbing::MutableIterator it;
35 | Entry entry;
36 |
37 | try { while (true) {
38 | util::StringPiece word = in.ReadDelimited(delimiters);
39 | entry.SetKey(util::MurmurHashNative(word.data(), word.size()));
40 | if (!seen.FindOrInsert(entry, it)) { // Presumably false means the key was newly inserted; confirm in util/probing_hash_table.hh.
41 | out << word << '\0';
42 | }
43 | } } catch (const util::EndOfFileException &e) {} // The end-of-file exception is what ends the loop.
44 | }
45 |
--------------------------------------------------------------------------------
/preprocess/warc.cc:
--------------------------------------------------------------------------------
1 | #include "preprocess/warc.hh"
2 |
3 | #include "util/exception.hh"
4 | #include "util/file.hh"
5 | #include "util/compress.hh"
6 |
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | namespace preprocess {
13 |
14 | bool ReadMore(util::ReadCompressed &reader, std::string &out) { // Append up to kRead more bytes from reader to out; returns false when nothing could be read and out was empty.
15 | const std::size_t kRead = 4096;
16 | std::size_t had = out.size();
17 | out.resize(out.size() + kRead); // Grow first so Read can write straight into the string.
18 | std::size_t got = reader.Read(&out[had], out.size() - had);
19 | if (!got) {
20 | // End of file
21 | UTIL_THROW_IF(had, util::EndOfFileException, "Unexpected end of file inside header"); // Hitting end of file with bytes already in out is an error.
22 | return false; // NOTE(review): had is 0 here, yet out keeps the kRead bytes added by the resize above.
23 | }
24 | out.resize(had + got); // Trim to the bytes actually read.
25 | return true;
26 | }
27 |
28 | class HeaderReader { // Returns successive lines from out, calling ReadMore to extend out when no newline is buffered yet.
29 | public:
30 | HeaderReader(util::ReadCompressed &reader, std::string &out)
31 | : reader_(reader), out_(out), consumed_(0) {}
32 |
33 | bool Line(util::StringPiece &line) { // Sets line to the next line without its LF or CRLF; false if ReadMore returns false first.
34 | std::size_t newline_start = consumed_;
35 | std::size_t newline;
36 | while (std::string::npos == (newline = out_.find('\n', newline_start))) {
37 | newline_start = out_.size(); // Everything buffered so far has no newline; search only the new bytes next time.
38 | if (!ReadMore(reader_, out_)) return false;
39 | }
40 | // The line is [consumed, newline). A blank line indicates header end.
41 | line = util::StringPiece(out_.data() + consumed_, newline - consumed_); // A view into out_, not a copy.
42 | // Remove carriage return if present.
43 | if (!line.empty() && line.data()[line.size() - 1] == '\r') {
44 | line = util::StringPiece(line.data(), line.size() - 1);
45 | }
46 | consumed_ = newline + 1;
47 | return true;
48 | }
49 |
50 | std::size_t Consumed() const { return consumed_; } // Offset in out_ just past the newline of the last line returned.
51 |
52 | private:
53 | util::ReadCompressed &reader_;
54 | std::string &out_;
55 |
56 | std::size_t consumed_;
57 | };
58 |
59 | bool WARCReader::Read(std::string &out) { // Reads one record (header, content, trailing CRLF CRLF) into out; false if no header line could be read.
60 | std::swap(overhang_, out); // Start from the bytes read past the end of the previous record.
61 | overhang_.clear();
62 | out.reserve(32768);
63 | HeaderReader header(reader_, out);
64 | util::StringPiece line;
65 | if (!header.Line(line)) return false;
66 | UTIL_THROW_IF(line != "WARC/1.0", util::Exception, "Expected WARC/1.0 header but got `" << line << '\'');
67 | std::size_t length = 0;
68 | bool seen_content_length = false;
69 | const char kContentLength[] = "Content-Length:";
70 | const std::size_t kContentLengthLength = sizeof(kContentLength) - 1;
71 | while (!line.empty()) { // Read header lines up to and including the blank line that ends the header.
72 | UTIL_THROW_IF(!header.Line(line), util::EndOfFileException, "WARC ended in header.");
73 | if (line.size() >= kContentLengthLength && !strncasecmp(line.data(), kContentLength, kContentLengthLength)) {
74 | UTIL_THROW_IF2(seen_content_length, "Two Content-Length headers?");
75 | seen_content_length = true;
76 | char *end;
77 | length = std::strtoll(line.data() + kContentLengthLength, &end, 10); // NOTE(review): the signed result is stored in a size_t, so a negative value would wrap to a huge length.
78 | // TODO: tolerate whitespace?
79 | UTIL_THROW_IF2(end != line.data() + line.size(), "Content-Length parse error in `" << line << '\'');
80 | }
81 | }
82 | UTIL_THROW_IF2(!seen_content_length, "No Content-Length: header in " << out);
83 | std::size_t total_length = header.Consumed() + length + 4 /* CRLF CRLF after data as specified in the standard. */;
84 |
85 | if (total_length < out.size()) { // Already read past this record: keep the excess for the next call.
86 | overhang_.assign(out.data() + total_length, out.size() - total_length);
87 | out.resize(total_length);
88 | } else { // Otherwise keep reading until the whole record is in out.
89 | std::size_t start = out.size();
90 | out.resize(total_length);
91 | while (start != out.size()) {
92 | std::size_t got = reader_.Read(&out[start], out.size() - start);
93 | UTIL_THROW_IF(!got, util::EndOfFileException, "Unexpected end of file while reading content of length " << length);
94 | start += got;
95 | }
96 | }
97 | // Check CRLF CRLF.
98 | UTIL_THROW_IF2(util::StringPiece(out.data() + out.size() - 4, 4) != util::StringPiece("\r\n\r\n", 4), "End of WARC record missing CRLF CRLF");
99 | return true;
100 | }
101 |
102 | } // namespace preprocess
103 |
--------------------------------------------------------------------------------
/preprocess/warc.hh:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "util/compress.hh"
4 |
5 | #include
6 |
7 | namespace preprocess {
8 |
9 | class WARCReader { // Reads WARC records one at a time through util::ReadCompressed.
10 | public:
11 | explicit WARCReader(int fd) : reader_(fd) {} // fd is handed to util::ReadCompressed.
12 |
13 | bool Read(std::string &out); // Defined in warc.cc.
14 |
15 | private:
16 | util::ReadCompressed reader_;
17 |
18 | std::string overhang_; // Carried between Read calls; swapped into out at the start of each call.
19 | };
20 |
21 | } // namespace preprocess
22 |
--------------------------------------------------------------------------------
/util/buffered_stream.hh:
--------------------------------------------------------------------------------
1 | /* A buffered output stream.
2 | * The Writer class has this interface.
3 | * class Writer {
4 | * private:
5 | * void write(const void *data, size_t amount);
6 | * void flush();
7 | * };
8 | */
9 | #ifndef UTIL_BUFFERED_STREAM_H
10 | #define UTIL_BUFFERED_STREAM_H
11 |
12 | #include "util/fake_ostream.hh"
13 | #include "util/file.hh"
14 | #include "util/scoped.hh"
15 |
16 | #include
17 | #include
18 |
19 | #include
20 |
21 | namespace util {
22 |
23 | template class BufferedStream : public FakeOStream > {
24 | public:
25 | const std::size_t kBufferSize = std::max(8192, kToStringMaxBytes); // Declared before buf_, so it is initialized before the constructor uses it.
26 | template explicit BufferedStream(Args&&... args)
27 | : buf_(kBufferSize),
28 | current_(static_cast(buf_.get())),
29 | end_(current_ + kBufferSize),
30 | writer_(std::forward(args)...) {} // All constructor arguments are forwarded to the Writer.
31 |
32 | /* The source of the move is left in an unusable state that can only be destroyed. */
33 | #if __cplusplus >= 201103L
34 | BufferedStream(BufferedStream &&from) noexcept : buf_(std::move(from.buf_)), current_(from.current_), end_(from.end_) { // NOTE(review): writer_ is absent from this initializer list, so it is not moved from from.writer_ - confirm this is intended.
35 | from.end_ = reinterpret_cast(from.buf_.get()); // NOTE(review): reads from.buf_ after it was moved from above.
36 | from.current_ = from.end_; // With current_ == end_ the source holds no pending bytes.
37 | }
38 | #endif
39 |
40 | ~BufferedStream() {
41 | flush();
42 | }
43 |
44 | BufferedStream &flush() { // Hand buffered bytes to the writer, then flush the writer itself.
45 | SpillBuffer();
46 | writer_.flush();
47 | return *this;
48 | }
49 |
50 | // For writes of arbitrary size.
51 | BufferedStream &write(const void *data, std::size_t length) {
52 | if (UTIL_LIKELY(current_ + length <= end_)) { // Fits in the remaining buffer space: just copy.
53 | std::memcpy(current_, data, length);
54 | current_ += length;
55 | return *this;
56 | }
57 | SpillBuffer();
58 | if (current_ + length <= end_) { // Fits in the now-empty buffer.
59 | std::memcpy(current_, data, length);
60 | current_ += length;
61 | } else { // Larger than the whole buffer: pass it to the writer directly.
62 | writer_.write(data, length);
63 | }
64 | return *this;
65 | }
66 |
67 | private:
68 | friend class FakeOStream >;
69 | // For writes directly to buffer guaranteed to have amount < buffer size.
70 | char *Ensure(std::size_t amount) { // Returns the current write position after making sure amount bytes are available there.
71 | if (UTIL_UNLIKELY(current_ + amount > end_)) {
72 | SpillBuffer();
73 | assert(current_ + amount <= end_);
74 | }
75 | return current_;
76 | }
77 |
78 | void AdvanceTo(char *to) { // Records that the buffer is now filled up to to.
79 | current_ = to;
80 | assert(current_ <= end_);
81 | }
82 |
83 | void SpillBuffer() { // Pass any buffered bytes to the writer and reset the write position to the buffer start.
84 | if (current_ != buf_.get()) {
85 | writer_.write(buf_.get(), current_ - (char*)buf_.get());
86 | current_ = static_cast(buf_.get());
87 | }
88 | }
89 |
90 | util::scoped_malloc buf_;
91 | char *current_, *end_; // Next write position and one past the end of buf_.
92 | Writer writer_;
93 | };
94 |
95 | } // namespace util
96 |
97 | #endif
98 |
--------------------------------------------------------------------------------
/util/cat_compressed_main.cc:
--------------------------------------------------------------------------------
1 | // Like cat but interprets compressed files.
2 | #include "util/file.hh"
3 | #include "util/read_compressed.hh"
4 |
5 | #include
6 | #include
7 |
8 | namespace {
9 | const std::size_t kBufSize = 16384;
10 | void Copy(util::ReadCompressed &from, int to) { // Copy everything readable from from to file descriptor to, kBufSize bytes at a time.
11 | util::scoped_malloc buffer(util::MallocOrThrow(kBufSize));
12 | while (std::size_t amount = from.Read(buffer.get(), kBufSize)) { // A Read result of 0 ends the loop.
13 | util::WriteOrThrow(to, buffer.get(), amount);
14 | }
15 | }
16 | } // namespace
17 |
18 | int main(int argc, char *argv[]) { // Copies stdin, or each named file in order, to file descriptor 1 through util::ReadCompressed.
19 | // Lane Schwartz likes -h and --help
20 | for (int i = 1; i < argc; ++i) {
21 | char *arg = argv[i];
22 | if (!strcmp(arg, "--")) break; // Stop looking for help flags after "--".
23 | if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
24 | std::cerr <<
25 | "A cat implementation that interprets compressed files.\n"
26 | "Usage: " << argv[0] << " [file1] [file2] ...\n"
27 | "If no file is provided, then stdin is read.\n";
28 | return 1;
29 | }
30 | }
31 |
32 | try {
33 | if (argc == 1) {
34 | util::ReadCompressed in(0);
35 | Copy(in, 1);
36 | } else {
37 | for (int i = 1; i < argc; ++i) { // NOTE(review): every argument, including a literal "--", is opened as a file name here.
38 | util::ReadCompressed in(util::OpenReadOrThrow(argv[i]));
39 | Copy(in, 1);
40 | }
41 | }
42 | } catch (const std::exception &e) {
43 | std::cerr << e.what() << std::endl;
44 | return 2;
45 | }
46 | return 0;
47 | }
48 |
--------------------------------------------------------------------------------
/util/compress.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_COMPRESS_H
2 | #define UTIL_COMPRESS_H
3 |
4 | #include "util/exception.hh"
5 | #include "util/file.hh"
6 | #include "util/scoped.hh"
7 |
8 | #include
9 | #include
10 | #include
11 |
12 | namespace util {
13 |
14 | class CompressedException : public Exception {
15 | public:
16 | CompressedException() throw();
17 | virtual ~CompressedException() throw();
18 | };
19 |
20 | class GZException : public CompressedException {
21 | public:
22 | GZException() throw();
23 | ~GZException() throw();
24 | };
25 |
26 | class BZException : public CompressedException {
27 | public:
28 | BZException() throw();
29 | ~BZException() throw();
30 | };
31 |
32 | class XZException : public CompressedException {
33 | public:
34 | XZException() throw();
35 | ~XZException() throw();
36 | };
37 |
38 | class ReadCompressed;
39 |
40 | class ReadBase {
41 | public:
42 | virtual ~ReadBase() {}
43 |
44 | virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;
45 |
46 | protected:
47 | static void ReplaceThis(ReadBase *with, ReadCompressed &thunk);
48 |
49 | ReadBase *Current(ReadCompressed &thunk);
50 |
51 | static uint64_t &ReadCount(ReadCompressed &thunk);
52 | };
53 |
54 | class ReadCompressed {
55 | public:
56 | static const std::size_t kMagicSize = 6;
57 | // Must have at least kMagicSize bytes.
58 | static bool DetectCompressedMagic(const void *from);
59 |
60 | // Takes ownership of fd.
61 | explicit ReadCompressed(int fd);
62 |
63 | // Try to avoid using this. Use the fd instead.
64 | // There is no decompression support for istreams.
65 | explicit ReadCompressed(std::istream &in);
66 |
67 | // Must call Reset later.
68 | ReadCompressed();
69 |
70 | // Takes ownership of fd.
71 | void Reset(int fd);
72 |
73 | // Same advice as the constructor.
74 | void Reset(std::istream &in);
75 |
76 | std::size_t Read(void *to, std::size_t amount);
77 |
78 | // Repeatedly call read to fill a buffer unless EOF is hit.
79 | // Return number of bytes read.
80 | std::size_t ReadOrEOF(void *const to, std::size_t amount);
81 |
82 | uint64_t RawAmount() const { return raw_amount_; }
83 |
84 | private:
85 | friend class ReadBase;
86 |
87 | scoped_ptr internal_;
88 |
89 | uint64_t raw_amount_;
90 | };
91 |
92 | class WriteBase {
93 | public:
94 | virtual ~WriteBase();
95 |
96 | virtual void write(const void *data, std::size_t amount) = 0;
97 |
98 | virtual void flush() = 0;
99 |
100 | protected:
101 | WriteBase();
102 | };
103 |
104 | /* Currently xzip is missing */
105 | class WriteCompressed {
106 | public:
107 | enum Compression { NONE, GZIP, BZIP, XZIP };
108 | // Takes ownership of fd.
109 | explicit WriteCompressed(int fd, Compression compression);
110 |
111 | ~WriteCompressed();
112 |
113 | void write(const void *data, std::size_t amount);
114 |
115 | void flush();
116 |
117 | private:
118 | scoped_ptr backend_;
119 | };
120 |
121 | // Very basic gzip compression support. Normally this would involve streams
122 | // but I needed the compression in the thread with fused output.
123 | void GZCompress(StringPiece from, std::string &to, int level = 9);
124 |
125 | } // namespace util
126 |
127 | #endif // UTIL_COMPRESS_H
128 |
--------------------------------------------------------------------------------
/util/double-conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # This CMake file was created by Lane Schwartz
2 |
3 | # Explicitly list the source files for this subdirectory
4 | #
5 | # If you add any source files to this subdirectory
6 | # that should be included in the kenlm library,
7 | # (this excludes any unit test files)
8 | # you should add them to the following list:
9 | #
10 | # In order to allow CMake files in the parent directory
11 | # to see this variable definition, we set PARENT_SCOPE.
12 | #
13 | # In order to set correct paths to these files
14 | # when this variable is referenced by CMake files in the parent directory,
15 | # we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
16 | #
17 | set(PREPROCESS_UTIL_DOUBLECONVERSION_SOURCE
18 | ${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc
19 | ${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc
20 | ${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc
21 | ${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc
22 | ${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc
23 | ${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc
24 | ${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc
25 | ${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc
26 | PARENT_SCOPE)
27 |
28 |
--------------------------------------------------------------------------------
/util/double-conversion/Jamfile:
--------------------------------------------------------------------------------
1 | fakelib double-conversion : [ glob *.cc ] : : : . ;
2 |
--------------------------------------------------------------------------------
/util/double-conversion/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2006-2011, the V8 project authors. All rights reserved.
2 | Redistribution and use in source and binary forms, with or without
3 | modification, are permitted provided that the following conditions are
4 | met:
5 |
6 | * Redistributions of source code must retain the above copyright
7 | notice, this list of conditions and the following disclaimer.
8 | * Redistributions in binary form must reproduce the above
9 | copyright notice, this list of conditions and the following
10 | disclaimer in the documentation and/or other materials provided
11 | with the distribution.
12 | * Neither the name of Google Inc. nor the names of its
13 | contributors may be used to endorse or promote products derived
14 | from this software without specific prior written permission.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/util/double-conversion/bignum-dtoa.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | // * Redistributions of source code must retain the above copyright
7 | // notice, this list of conditions and the following disclaimer.
8 | // * Redistributions in binary form must reproduce the above
9 | // copyright notice, this list of conditions and the following
10 | // disclaimer in the documentation and/or other materials provided
11 | // with the distribution.
12 | // * Neither the name of Google Inc. nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | #ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_
29 | #define DOUBLE_CONVERSION_BIGNUM_DTOA_H_
30 |
31 | #include "utils.h"
32 |
33 | namespace double_conversion {
34 |
35 | enum BignumDtoaMode {
36 | // Return the shortest correct representation.
37 | // For example the output of 0.299999999999999988897 is (the less accurate but
38 | // correct) 0.3.
39 | BIGNUM_DTOA_SHORTEST,
40 | // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats.
41 | BIGNUM_DTOA_SHORTEST_SINGLE,
42 | // Return a fixed number of digits after the decimal point.
43 | // For instance fixed(0.1, 4) becomes 0.1000
44 | // If the input number is big, the output will be big.
45 | BIGNUM_DTOA_FIXED,
46 | // Return a fixed number of digits, no matter what the exponent is.
47 | BIGNUM_DTOA_PRECISION
48 | };
49 |
50 | // Converts the given double 'v' to ascii.
51 | // The result should be interpreted as buffer * 10^(point-length).
52 | // The buffer will be null-terminated.
53 | //
54 | // The input v must be > 0 and different from NaN, and Infinity.
55 | //
56 | // The output depends on the given mode:
57 | // - SHORTEST: produce the least amount of digits for which the internal
58 | // identity requirement is still satisfied. If the digits are printed
59 | // (together with the correct exponent) then reading this number will give
60 | // 'v' again. The buffer will choose the representation that is closest to
61 | // 'v'. If there are two at the same distance, then the number is rounded up.
62 | // In this mode the 'requested_digits' parameter is ignored.
63 | // - FIXED: produces digits necessary to print a given number with
64 | // 'requested_digits' digits after the decimal point. The produced digits
65 | // might be too short in which case the caller has to fill the gaps with '0's.
66 | // Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2.
67 | // Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns
68 | // buffer="2", point=0.
69 | // Note: the length of the returned buffer has no meaning wrt the significance
70 | // of its digits. That is, just because it contains '0's does not mean that
71 | // any other digit would not satisfy the internal identity requirement.
72 | // - PRECISION: produces 'requested_digits' where the first digit is not '0'.
73 | // Even though the length of produced digits usually equals
74 | // 'requested_digits', the function is allowed to return fewer digits, in
75 | // which case the caller has to fill the missing digits with '0's.
76 | // Halfway cases are again rounded up.
77 | // 'BignumDtoa' expects the given buffer to be big enough to hold all digits
78 | // and a terminating null-character.
79 | void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits,
80 | Vector buffer, int* length, int* point);
81 |
82 | } // namespace double_conversion
83 |
84 | #endif // DOUBLE_CONVERSION_BIGNUM_DTOA_H_
85 |
--------------------------------------------------------------------------------
/util/double-conversion/cached-powers.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | // * Redistributions of source code must retain the above copyright
7 | // notice, this list of conditions and the following disclaimer.
8 | // * Redistributions in binary form must reproduce the above
9 | // copyright notice, this list of conditions and the following
10 | // disclaimer in the documentation and/or other materials provided
11 | // with the distribution.
12 | // * Neither the name of Google Inc. nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | #ifndef DOUBLE_CONVERSION_CACHED_POWERS_H_
29 | #define DOUBLE_CONVERSION_CACHED_POWERS_H_
30 |
31 | #include "diy-fp.h"
32 |
33 | namespace double_conversion {
34 |
35 | class PowersOfTenCache {
36 | public:
37 |
38 | // Not all powers of ten are cached. The decimal exponent of two neighboring
39 | // cached numbers will differ by kDecimalExponentDistance.
40 | static const int kDecimalExponentDistance;
41 |
42 | static const int kMinDecimalExponent;
43 | static const int kMaxDecimalExponent;
44 |
45 | // Returns a cached power-of-ten with a binary exponent in the range
46 | // [min_exponent; max_exponent] (boundaries included).
47 | static void GetCachedPowerForBinaryExponentRange(int min_exponent,
48 | int max_exponent,
49 | DiyFp* power,
50 | int* decimal_exponent);
51 |
52 | // Returns a cached power of ten x ~= 10^k such that
53 | // k <= decimal_exponent < k + kCachedPowersDecimalDistance.
54 | // The given decimal_exponent must satisfy
55 | // kMinDecimalExponent <= requested_exponent, and
56 | // requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
57 | static void GetCachedPowerForDecimalExponent(int requested_exponent,
58 | DiyFp* power,
59 | int* found_exponent);
60 | };
61 |
62 | } // namespace double_conversion
63 |
64 | #endif // DOUBLE_CONVERSION_CACHED_POWERS_H_
65 |
--------------------------------------------------------------------------------
/util/double-conversion/diy-fp.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | // * Redistributions of source code must retain the above copyright
7 | // notice, this list of conditions and the following disclaimer.
8 | // * Redistributions in binary form must reproduce the above
9 | // copyright notice, this list of conditions and the following
10 | // disclaimer in the documentation and/or other materials provided
11 | // with the distribution.
12 | // * Neither the name of Google Inc. nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 |
29 | #include "diy-fp.h"
30 | #include "utils.h"
31 |
32 | namespace double_conversion {
33 |
34 | void DiyFp::Multiply(const DiyFp& other) {
35 | // Simply "emulates" a 128 bit multiplication.
36 | // However: the resulting number only contains 64 bits. The least
37 | // significant 64 bits are only used for rounding the most significant 64
38 | // bits.
39 | const uint64_t kM32 = 0xFFFFFFFFU;
40 | uint64_t a = f_ >> 32; // High and low 32-bit halves of both significands.
41 | uint64_t b = f_ & kM32;
42 | uint64_t c = other.f_ >> 32;
43 | uint64_t d = other.f_ & kM32;
44 | uint64_t ac = a * c;
45 | uint64_t bc = b * c;
46 | uint64_t ad = a * d;
47 | uint64_t bd = b * d;
48 | uint64_t tmp = (bd >> 32) + (ad & kM32) + (bc & kM32);
49 | // By adding 1U << 31 to tmp we round the final result.
50 | // Halfway cases will be rounded up.
51 | tmp += 1U << 31;
52 | uint64_t result_f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32);
53 | e_ += other.e_ + 64; // Keeping only the upper 64 bits of the product adds 64 to the exponent.
54 | f_ = result_f;
55 | }
56 |
57 | } // namespace double_conversion
58 |
--------------------------------------------------------------------------------
/util/double-conversion/diy-fp.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | // * Redistributions of source code must retain the above copyright
7 | // notice, this list of conditions and the following disclaimer.
8 | // * Redistributions in binary form must reproduce the above
9 | // copyright notice, this list of conditions and the following
10 | // disclaimer in the documentation and/or other materials provided
11 | // with the distribution.
12 | // * Neither the name of Google Inc. nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | #ifndef DOUBLE_CONVERSION_DIY_FP_H_
29 | #define DOUBLE_CONVERSION_DIY_FP_H_
30 |
31 | #include "utils.h"
32 |
33 | namespace double_conversion {
34 |
35 | // This "Do It Yourself Floating Point" class implements a floating-point number
36 | // with a uint64 significand and an int exponent. Normalized DiyFp numbers will
37 | // have the most significant bit of the significand set.
38 | // Multiplication and Subtraction do not normalize their results.
39 | // DiyFp are not designed to contain special doubles (NaN and Infinity).
40 | class DiyFp {
41 | public:
42 | static const int kSignificandSize = 64;  // Bit width of the uint64_t significand f_.
43 |
44 | DiyFp() : f_(0), e_(0) {}  // Zero significand and zero exponent.
45 | DiyFp(uint64_t f, int e) : f_(f), e_(e) {}  // Stores f and e as given, without normalizing.
46 |
47 | // this = this - other.
48 | // The exponents of both numbers must be the same and the significand of this
49 | // must be at least as big as the significand of other (both are ASSERTed).
50 | // The result will not be normalized.
51 | void Subtract(const DiyFp& other) {
52 | ASSERT(e_ == other.e_);
53 | ASSERT(f_ >= other.f_);
54 | f_ -= other.f_;
55 | }
56 |
57 | // Returns a - b.
58 | // The exponents of both numbers must be the same and the significand of a
59 | // must be at least as big as that of b. The result will not be normalized.
60 | static DiyFp Minus(const DiyFp& a, const DiyFp& b) {
61 | DiyFp result = a;
62 | result.Subtract(b);
63 | return result;
64 | }
65 |
66 |
67 | // this = this * other.
68 | void Multiply(const DiyFp& other);  // Only declared here; the definition is not in this header.
69 |
70 | // Returns a * b; neither argument is modified.
71 | static DiyFp Times(const DiyFp& a, const DiyFp& b) {
72 | DiyFp result = a;
73 | result.Multiply(b);
74 | return result;
75 | }
76 |
77 | void Normalize() {  // Shifts f_ left until its top bit is set, lowering e_ by one per bit shifted.
78 | ASSERT(f_ != 0);  // With f_ == 0 the shift loops below would never terminate.
79 | uint64_t f = f_;
80 | int e = e_;
81 |
82 | // This method is mainly called for normalizing boundaries. In general
83 | // boundaries need to be shifted by 10 bits. We thus optimize for this case.
84 | const uint64_t k10MSBits = UINT64_2PART_C(0xFFC00000, 00000000);  // Mask for the ten most significant bits, going by its name.
85 | while ((f & k10MSBits) == 0) {  // Coarse steps: ten bits at a time while the masked bits are all clear.
86 | f <<= 10;
87 | e -= 10;
88 | }
89 | while ((f & kUint64MSB) == 0) {  // Fine steps: one bit at a time until the masked bit is set.
90 | f <<= 1;
91 | e--;
92 | }
93 | f_ = f;
94 | e_ = e;
95 | }
96 |
97 | static DiyFp Normalize(const DiyFp& a) {  // Returns a normalized copy; a itself is left untouched.
98 | DiyFp result = a;
99 | result.Normalize();
100 | return result;
101 | }
102 |
103 | uint64_t f() const { return f_; }  // Significand.
104 | int e() const { return e_; }  // Exponent.
105 |
106 | void set_f(uint64_t new_value) { f_ = new_value; }
107 | void set_e(int new_value) { e_ = new_value; }
108 |
109 | private:
110 | static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000);  // Most significant bit, going by the name; the macro comes from utils.h.
111 |
112 | uint64_t f_;
113 | int e_;
114 | };
115 |
116 | } // namespace double_conversion
117 |
118 | #endif // DOUBLE_CONVERSION_DIY_FP_H_
119 |
--------------------------------------------------------------------------------
/util/double-conversion/fast-dtoa.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | // * Redistributions of source code must retain the above copyright
7 | // notice, this list of conditions and the following disclaimer.
8 | // * Redistributions in binary form must reproduce the above
9 | // copyright notice, this list of conditions and the following
10 | // disclaimer in the documentation and/or other materials provided
11 | // with the distribution.
12 | // * Neither the name of Google Inc. nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | #ifndef DOUBLE_CONVERSION_FAST_DTOA_H_
29 | #define DOUBLE_CONVERSION_FAST_DTOA_H_
30 |
31 | #include "utils.h"
32 |
33 | namespace double_conversion {
34 |
35 | enum FastDtoaMode {  // Passed as the mode argument of FastDtoa() declared below.
36 | // Computes the shortest representation of the given input. The returned
37 | // result will be the most accurate number of this length. Longer
38 | // representations might be more accurate.
39 | FAST_DTOA_SHORTEST,
40 | // Same as FAST_DTOA_SHORTEST but for single-precision floats.
41 | FAST_DTOA_SHORTEST_SINGLE,
42 | // Computes a representation where the precision (number of digits) is
43 | // given as input (the requested_digits argument of FastDtoa). The precision
44 | // is independent of the decimal point.
45 | };
46 |
47 | // FastDtoa will produce at most kFastDtoaMaximalLength digits. This does not
48 | // include the terminating '\0' character.
49 | static const int kFastDtoaMaximalLength = 17;
50 | // Same for single-precision numbers.
51 | static const int kFastDtoaMaximalSingleLength = 9;
52 |
53 | // Provides a decimal representation of v.
54 | // The result should be interpreted as buffer * 10^(point - length).
55 | //
56 | // Precondition:
57 | // * v must be a strictly positive finite double.
58 | //
59 | // Returns true if it succeeds, otherwise the result can not be trusted.
60 | // There will be *length digits inside the buffer followed by a null terminator.
61 | // If the function returns true and mode equals
62 | // - FAST_DTOA_SHORTEST, then
63 | // the parameter requested_digits is ignored.
64 | // The result satisfies
65 | // v == (double) (buffer * 10^(point - length)).
66 | // The digits in the buffer are the shortest representation possible. E.g.
67 | // if 0.099999999999 and 0.1 represent the same double then "1" is returned
68 | // with point = 0.
69 | // The last digit will be closest to the actual v. That is, even if several
70 | // digits might correctly yield 'v' when read again, the buffer will contain
71 | // the one closest to v.
72 | // - FAST_DTOA_PRECISION, then
73 | // the buffer contains requested_digits digits.
74 | // the difference v - (buffer * 10^(point-length)) is closest to zero for
75 | // all possible representations of requested_digits digits.
76 | // If there are two values that are equally close, then FastDtoa returns
77 | // false.
78 | // For both modes the buffer must be large enough to hold the result.
79 | bool FastDtoa(double d,
80 | FastDtoaMode mode,
81 | int requested_digits,
82 | Vector buffer,
83 | int* length,
84 | int* decimal_point);
85 |
86 | } // namespace double_conversion
87 |
88 | #endif // DOUBLE_CONVERSION_FAST_DTOA_H_
89 |
--------------------------------------------------------------------------------
/util/double-conversion/fixed-dtoa.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | // * Redistributions of source code must retain the above copyright
7 | // notice, this list of conditions and the following disclaimer.
8 | // * Redistributions in binary form must reproduce the above
9 | // copyright notice, this list of conditions and the following
10 | // disclaimer in the documentation and/or other materials provided
11 | // with the distribution.
12 | // * Neither the name of Google Inc. nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | #ifndef DOUBLE_CONVERSION_FIXED_DTOA_H_
29 | #define DOUBLE_CONVERSION_FIXED_DTOA_H_
30 |
31 | #include "utils.h"
32 |
33 | namespace double_conversion {
34 |
35 | // Produces digits necessary to print a given number with
36 | // 'fractional_count' digits after the decimal point.
37 | // The buffer must be big enough to hold the result plus one terminating null
38 | // character.
39 | //
40 | // The produced digits might be too short in which case the caller has to fill
41 | // the gaps with '0's.
42 | // Example: FastFixedDtoa(0.001, 5, ...) is allowed to return buffer = "1", and
43 | // decimal_point = -2.
44 | // Halfway cases are rounded towards +/-Infinity (away from 0). The call
45 | // FastFixedDtoa(0.15, 2, ...) thus returns buffer = "2", decimal_point = 0.
46 | // The returned buffer may contain digits that would be truncated from the
47 | // shortest representation of the input.
48 | //
49 | // This method only works for some parameters. If it can't handle the input it
50 | // returns false. The output is null-terminated when the function succeeds.
51 | bool FastFixedDtoa(double v, int fractional_count,
52 | Vector buffer, int* length, int* decimal_point);
53 |
54 | } // namespace double_conversion
55 |
56 | #endif // DOUBLE_CONVERSION_FIXED_DTOA_H_
57 |
--------------------------------------------------------------------------------
/util/double-conversion/strtod.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | // * Redistributions of source code must retain the above copyright
7 | // notice, this list of conditions and the following disclaimer.
8 | // * Redistributions in binary form must reproduce the above
9 | // copyright notice, this list of conditions and the following
10 | // disclaimer in the documentation and/or other materials provided
11 | // with the distribution.
12 | // * Neither the name of Google Inc. nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | #ifndef DOUBLE_CONVERSION_STRTOD_H_
29 | #define DOUBLE_CONVERSION_STRTOD_H_
30 |
31 | #include "utils.h"
32 |
33 | namespace double_conversion {
34 |
35 | // The buffer must only contain digits in the range [0-9]. It must not
36 | // contain a dot or a sign. It must not start with '0', and must not be empty.
37 | double Strtod(Vector buffer, int exponent);
38 |
39 | // The buffer must only contain digits in the range [0-9]. It must not
40 | // contain a dot or a sign. It must not start with '0', and must not be empty.
41 | float Strtof(Vector buffer, int exponent);
42 |
43 | } // namespace double_conversion
44 |
45 | #endif // DOUBLE_CONVERSION_STRTOD_H_
46 |
--------------------------------------------------------------------------------
/util/ersatz_progress.cc:
--------------------------------------------------------------------------------
1 | #include "util/ersatz_progress.hh"
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | namespace util {
9 |
10 | namespace { const unsigned char kWidth = 100; }
11 |
12 | const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
13 |
14 | ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), stones_written_(0), out_(NULL) {}  // Silent progress (out_ is null). stones_written_ is zeroed too, so copying it in the move constructor never reads an indeterminate value.
15 |
16 | ErsatzProgress::~ErsatzProgress() {
17 | if (out_) Finished();  // Only an instance that still has a stream is brought to completion on destruction.
18 | }
19 |
20 | ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
21 | : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {  // First milestone after 1/kWidth of the work.
22 | if (!out_) {
23 | next_ = std::numeric_limits::max();  // No stream: push the next milestone out as far as possible.
24 | return;
25 | }
26 | if (!message.empty()) *out_ << message << '\n';  // Optional caption on its own line.
27 | *out_ << kProgressBanner;  // Ruler line; the '*' characters are written on the line after it.
28 | }
29 |
30 | void ErsatzProgress::Milestone() {
31 | if (!out_) { current_ = 0; return; }  // Silent: restart the count instead of printing.
32 | if (!complete_) return;  // A zero total would mean dividing by zero below.
33 | unsigned char stone = std::min(static_cast(kWidth), (current_ * kWidth) / complete_);  // Stars that should be visible by now, capped at kWidth.
34 |
35 | for (; stones_written_ < stone; ++stones_written_) {  // Catch up on stars not yet written.
36 | (*out_) << '*';
37 | }
38 | if (stone == kWidth) {
39 | (*out_) << std::endl;  // Bar is full: end the line (std::endl also flushes).
40 | next_ = std::numeric_limits::max();
41 | out_ = NULL;  // No further output, including from the destructor.
42 | } else {
43 | next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);  // Count at which the next star is due, rounded up; next_ never moves backwards.
44 | }
45 | }
46 |
47 | } // namespace util
48 |
--------------------------------------------------------------------------------
/util/ersatz_progress.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_ERSATZ_PROGRESS_H
2 | #define UTIL_ERSATZ_PROGRESS_H
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | // Ersatz version of boost::progress so core language model doesn't depend on
9 | // boost. Also adds option to print nothing.
10 |
11 | namespace util {
12 |
13 | extern const char kProgressBanner[];
14 |
15 | class ErsatzProgress {
16 | public:
17 | // No output.
18 | ErsatzProgress();
19 |
20 | // Progress over `complete` units of work, written to `to`, with an optional `message` printed first.
21 | // Null means no output. The null value is useful for passing along the ostream pointer from another caller.
22 | explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
22 |
23 | #if __cplusplus >= 201103L
24 | ErsatzProgress(ErsatzProgress &&from) noexcept : current_(from.current_), next_(from.next_), complete_(from.complete_), stones_written_(from.stones_written_), out_(from.out_) {
25 | from.out_ = nullptr;  // The moved-from object has no stream, so its destructor will not call Finished().
26 | from.next_ = (uint64_t)-1;  // Largest uint64_t value.
27 | }
28 | #endif
29 |
30 | ~ErsatzProgress();
31 |
32 | ErsatzProgress &operator++() {  // Counts one unit; Milestone() runs once the count reaches next_.
33 | if (++current_ >= next_) Milestone();
34 | return *this;
35 | }
36 |
37 | ErsatzProgress &operator+=(uint64_t amount) {  // Counts `amount` units at once.
38 | if ((current_ += amount) >= next_) Milestone();
39 | return *this;
40 | }
41 |
42 | void Set(uint64_t to) {  // Jumps to an absolute count.
43 | if ((current_ = to) >= next_) Milestone();
44 | }
45 |
46 | void Finished() {  // Jumps to complete_, the total passed to the constructor.
47 | Set(complete_);
48 | }
49 |
50 | private:
51 | void Milestone();
52 |
53 | uint64_t current_, next_, complete_;  // Units counted so far, count at which Milestone() next runs, total units.
54 | unsigned char stones_written_;  // Number of '*' characters already written.
55 | std::ostream *out_;  // Destination stream; null means print nothing.
56 |
57 | // noncopyable: the copy operations are declared private.
58 | ErsatzProgress(const ErsatzProgress &other);
59 | ErsatzProgress &operator=(const ErsatzProgress &other);
60 | };
61 |
62 | } // namespace util
63 |
64 | #endif // UTIL_ERSATZ_PROGRESS_H
65 |
--------------------------------------------------------------------------------
/util/exception.cc:
--------------------------------------------------------------------------------
1 | #include "util/exception.hh"
2 |
3 | #ifdef __GXX_RTTI
4 | #include
5 | #endif
6 |
7 | #include
8 | #include
9 |
10 | #if defined(_WIN32) || defined(_WIN64)
11 | #include
12 | #include
13 | #endif
14 |
15 | namespace util {
16 |
17 | Exception::Exception() throw() {}  // Nothing to set up here; SetLocation() below fills in the location text later.
18 | Exception::~Exception() throw() {}
19 |
20 | void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) {
21 | /* Prepends "file:line in func threw name because `condition'.\n" to what_.
22 | * The child class might have set some text, but we want this to come first.
23 | * Another option would be passing this information to the constructor, but
24 | * then child classes would have to accept constructor arguments and pass them down.
25 | * The func and condition parts are skipped when the corresponding pointer is null. */
26 | std::string old_text;
27 | what_.swap(old_text);  // Move any existing message aside so the location comes first.
28 | what_ << file << ':' << line;
29 | if (func) what_ << " in " << func << " threw ";
30 | if (child_name) {
31 | what_ << child_name;
32 | } else {
33 | #ifdef __GXX_RTTI
34 | what_ << typeid(*this).name();  // Must be *this: typeid(this) names the pointer type, never the exception's own type.
35 | #else
36 | what_ << "an exception";
37 | #endif
38 | }
39 | if (condition) {
40 | what_ << " because `" << condition << '\'';
41 | }
42 | what_ << ".\n";
43 | what_ << old_text;  // Re-append whatever the child class had already written.
44 | }
45 |
46 | namespace {
47 |
48 | #ifdef __GNUC__
49 | const char *HandleStrerror(int ret, const char *buf) __attribute__ ((unused));
50 | const char *HandleStrerror(const char *ret, const char * /*buf*/) __attribute__ ((unused));
51 | #endif
52 | // The caller passes strerror_r's return value, so its type selects the overload; at least one of these functions will not be called.
53 | #ifdef __clang__
54 | #pragma clang diagnostic push
55 | #pragma clang diagnostic ignored "-Wunused-function"
56 | #endif
57 | // The XOPEN version: an int status. Zero yields buf; anything else yields NULL.
58 | const char *HandleStrerror(int ret, const char *buf) {
59 | if (!ret) return buf;
60 | return NULL;
61 | }
62 |
63 | // The GNU version: the return value is already the message pointer, so buf is ignored.
64 | const char *HandleStrerror(const char *ret, const char * /*buf*/) {
65 | return ret;
66 | }
67 | #ifdef __clang__
68 | #pragma clang diagnostic pop
69 | #endif
70 | } // namespace
71 |
72 | ErrnoException::ErrnoException() throw() : errno_(errno) {  // Records errno as it stands at construction.
73 | char buf[200];
74 | buf[0] = 0;  // Start as an empty string.
75 | #if defined(sun) || defined(_WIN32) || defined(_WIN64)
76 | const char *add = strerror(errno);
77 | #else
78 | const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);  // Accepts either return type of strerror_r.
79 | #endif
80 |
81 | if (add) {
82 | *this << add << ' ';  // Start the message with the error description, followed by a space.
83 | }
84 | }
85 |
86 | ErrnoException::~ErrnoException() throw() {}
87 |
88 | OverflowException::OverflowException() throw() {}
89 | OverflowException::~OverflowException() throw() {}
90 |
91 | #if defined(_WIN32) || defined(_WIN64)
92 | WindowsException::WindowsException() throw() {
93 | unsigned int last_error = GetLastError();  // Saved up front; GetLastError() is called again below if formatting fails.
94 | char error_msg[256] = "";
95 | if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) {
96 | *this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". ";  // Formatting failed: report both codes.
97 | } else {
98 | *this << "Windows error " << last_error << ": " << error_msg;
99 | }
100 | }
101 | WindowsException::~WindowsException() throw() {}
102 | #endif
103 |
104 | } // namespace util
105 |
--------------------------------------------------------------------------------
/util/fake_ostream.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_FAKE_OSTREAM_H
2 | #define UTIL_FAKE_OSTREAM_H
3 |
4 | #include "util/float_to_string.hh"
5 | #include "util/integer_to_string.hh"
6 | #include "util/string_piece.hh"
7 |
8 | #include
9 | #include
10 |
11 | #include
12 |
13 | namespace util {
14 |
15 | /* Like std::ostream but without being incredibly slow.
16 | * Supports most of the built-in types except for long double.
17 | *
18 | * The FakeOStream class is intended to be inherited from. The inheriting class
19 | * should provide:
20 | * public:
21 | * Derived &flush();
22 | * Derived &write(const void *data, std::size_t length);
23 | *
24 | * private: or protected:
25 | * friend class FakeOStream;
26 | * char *Ensure(std::size_t amount);
27 | * void AdvanceTo(char *to);
28 | *
29 | * The Ensure function makes enough space for an in-place write and returns
30 | * where to write. The AdvanceTo function happens after the write, saying how
31 | * much was actually written.
32 | *
33 | * Precondition:
34 | * amount <= kToStringMaxBytes for in-place writes.
35 | */
36 | template class FakeOStream {
37 | public:
38 | FakeOStream() {}
39 |
40 | // This also covers std::string and char*. The bytes are handed to the derived class's write().
41 | Derived &operator<<(StringPiece str) {
42 | return C().write(str.data(), str.size());
43 | }
44 |
45 | // Handle integers by size and signedness: Coerce names, as To, the fixed-width type an integer is converted to before printing.
46 | private:
47 | template struct EnableIfKludge {
48 | typedef Derived type;
49 | };
50 | template ::is_signed, bool IsInteger = std::numeric_limits::is_integer> struct Coerce {};
51 |
52 | template struct Coerce { typedef uint16_t To; };
53 | template struct Coerce { typedef uint32_t To; };
54 | template struct Coerce { typedef uint64_t To; };
55 |
56 | template struct Coerce { typedef int16_t To; };
57 | template struct Coerce { typedef int32_t To; };
58 | template struct Coerce { typedef int64_t To; };
59 | public:
60 | template typename EnableIfKludge::To>::type &operator<<(const From value) {
61 | return CallToString(static_cast::To>(value));
62 | }
63 |
64 | // Character types that get copied as bytes instead of displayed as integers.
65 | Derived &operator<<(char val) { return put(val); }
66 | Derived &operator<<(signed char val) { return put(static_cast(val)); }
67 | Derived &operator<<(unsigned char val) { return put(static_cast(val)); }
68 |
69 | Derived &operator<<(bool val) { return put(val + '0'); }  // Writes the single character '0' or '1'.
70 | // enums will fall back to int but are not caught by the template.
71 | Derived &operator<<(int val) { return CallToString(static_cast::To>(val)); }
72 |
73 | Derived &operator<<(float val) { return CallToString(val); }
74 | Derived &operator<<(double val) { return CallToString(val); }
75 |
76 | // This is here to catch all the other pointer types.
77 | Derived &operator<<(const void *value) { return CallToString(value); }
78 | // This is here because the above line also catches const char*.
79 | Derived &operator<<(const char *value) { return *this << StringPiece(value); }
80 | Derived &operator<<(char *value) { return *this << StringPiece(value); }
81 |
82 | Derived &put(char val) {  // Writes one byte in place.
83 | char *c = C().Ensure(1);  // Ask the derived class for room for one byte.
84 | *c = val;
85 | C().AdvanceTo(++c);  // Report the position just past the byte written.
86 | return C();
87 | }
88 |
89 | char widen(char val) const { return val; }  // Returns its argument unchanged.
90 |
91 | private:
92 | // References to derived class for convenience.
93 | Derived &C() {
94 | return *static_cast(this);
95 | }
96 |
97 | const Derived &C() const {
98 | return *static_cast(this);
99 | }
100 |
101 | // This is separate to prevent an infinite loop if the compiler considers
102 | // types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
103 | // Reserves ToStringBuf's kBytes via Ensure, lets ToString write there, then passes ToString's return value to AdvanceTo.
104 | template Derived &CallToString(const T value) {
104 | C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf::kBytes)));
105 | return C();
106 | }
107 | };
108 |
109 | } // namespace
110 |
111 | #endif // UTIL_FAKE_OSTREAM_H
112 |
--------------------------------------------------------------------------------
/util/file_stream.hh:
--------------------------------------------------------------------------------
1 | /* Like std::ofstream but without being incredibly slow. Backed by a raw fd that it owns.
2 | * Supports most of the built-in types except for long double.
3 | */
4 | #ifndef UTIL_FILE_STREAM_H
5 | #define UTIL_FILE_STREAM_H
6 |
7 | #include "util/buffered_stream.hh"
8 | #include "util/file.hh"
9 |
10 | #include
11 |
12 | namespace util {
13 |
14 | typedef BufferedStream FileStream;
15 |
16 | } // namespace
17 |
18 | #endif
19 |
--------------------------------------------------------------------------------
/util/float_to_string.cc:
--------------------------------------------------------------------------------
1 | #include "util/float_to_string.hh"
2 |
3 | #include "util/double-conversion/double-conversion.h"
4 | #include "util/double-conversion/utils.h"
5 |
6 | namespace util {
7 | namespace {
8 | const double_conversion::DoubleToStringConverter kConverter(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0);  // Shared by both ToString overloads below. NOTE(review): "inf"/"NaN" look like the infinity/NaN spellings and 'e' the exponent character; confirm the numeric arguments against the DoubleToStringConverter constructor documentation.
9 | } // namespace
10 |
11 | char *ToString(double value, char *to) {  // Formats value into the caller's buffer at `to`.
12 | double_conversion::StringBuilder builder(to, ToStringBuf::kBytes);  // The builder writes directly into `to`, with ToStringBuf's kBytes as its size.
13 | kConverter.ToShortest(value, &builder);
14 | return &to[builder.position()];  // Pointer at the builder's current position, presumably just past the last character written.
15 | }
16 |
17 | char *ToString(float value, char *to) {  // Single-precision counterpart of the double overload.
18 | double_conversion::StringBuilder builder(to, ToStringBuf::kBytes);  // The builder writes directly into `to`, with ToStringBuf's kBytes as its size.
19 | kConverter.ToShortestSingle(value, &builder);
20 | return &to[builder.position()];  // Pointer at the builder's current position, presumably just past the last character written.
21 | }
22 |
23 | } // namespace util
24 |
--------------------------------------------------------------------------------
/util/float_to_string.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_FLOAT_TO_STRING_H
2 | #define UTIL_FLOAT_TO_STRING_H
3 |
4 | // Just for ToStringBuf
5 | #include "util/integer_to_string.hh"
6 |
7 | namespace util {
8 |
9 | template <> struct ToStringBuf {
10 | // DoubleToStringConverter::kBase10MaximalLength + 1 for null paranoia. NOTE(review): the converter in float_to_string.cc is built with an exponent character, so output may also carry a sign, a decimal point and an exponent; confirm that 19 bytes covers the longest ToShortest result.
11 | static const unsigned kBytes = 19;
12 | };
13 |
14 | // Single wasn't documented in double conversion, so be conservative and
15 | // say the same as double.
16 | template <> struct ToStringBuf {
17 | static const unsigned kBytes = 19;
18 | };
19 |
20 | char *ToString(double value, char *to);
21 | char *ToString(float value, char *to);
22 |
23 | } // namespace util
24 |
25 | #endif // UTIL_FLOAT_TO_STRING_H
26 |
--------------------------------------------------------------------------------
/util/have.hh:
--------------------------------------------------------------------------------
1 | /* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. */
2 | #ifndef UTIL_HAVE
3 | #define UTIL_HAVE
4 |
5 | #ifndef HAVE_BOOST
6 | //#define HAVE_BOOST
7 | #endif
8 |
9 | #endif // UTIL_HAVE
10 |
--------------------------------------------------------------------------------
/util/integer_to_string.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_INTEGER_TO_STRING_H
2 | #define UTIL_INTEGER_TO_STRING_H
3 | #include
4 | #include
5 |
6 | namespace util {
7 |
8 | /* These functions convert integers to strings and return the end pointer.
9 | */
10 | char *ToString(uint32_t value, char *to);
11 | char *ToString(uint64_t value, char *to);
12 |
13 | // Implemented as wrappers to above
14 | char *ToString(int32_t value, char *to);
15 | char *ToString(int64_t value, char *to);
16 |
17 | // Calls the 32-bit versions for now.
18 | char *ToString(uint16_t value, char *to);
19 | char *ToString(int16_t value, char *to);
20 |
21 | char *ToString(const void *value, char *to);
22 |
23 | inline char *ToString(bool value, char *to) {  // Writes '0' or '1' and, like the overloads above, returns the end pointer.
24 | *to++ = '0' + value;  // value converts to 0 or 1, giving the character '0' or '1'.
25 | return to;
26 | }
27 |
28 | // How many bytes to reserve in the buffer for these strings:
29 | // g++ 4.9.1 doesn't work with this:
30 | // static const std::size_t kBytes = 5;
31 | // So use enum.
32 | template struct ToStringBuf;  // Declared only; each specialization below supplies kBytes.
33 | template <> struct ToStringBuf {
34 | enum { kBytes = 1 };
35 | };
36 | template <> struct ToStringBuf {
37 | enum { kBytes = 5 };
38 | };
39 | template <> struct ToStringBuf {
40 | enum { kBytes = 6 };
41 | };
42 | template <> struct ToStringBuf {
43 | enum { kBytes = 10 };
44 | };
45 | template <> struct ToStringBuf {
46 | enum { kBytes = 11 };
47 | };
48 | template <> struct ToStringBuf {
49 | enum { kBytes = 20 };
50 | };
51 | template <> struct ToStringBuf {
52 | // Not a typo. 2^63 has 19 digits; presumably the twentieth byte is for the minus sign.
53 | enum { kBytes = 20 };
54 | };
55 |
56 | template <> struct ToStringBuf {
57 | // Either 18 on 64-bit or 10 on 32-bit: two characters per pointer byte plus two more.
58 | enum { kBytes = sizeof(const void*) * 2 + 2 };
59 | };
60 |
61 | // Maximum over this and float.
62 | enum { kToStringMaxBytes = 20 };
63 |
64 | } // namespace util
65 |
66 | #endif // UTIL_INTEGER_TO_STRING_H
67 |
--------------------------------------------------------------------------------
/util/integer_to_string_test.cc:
--------------------------------------------------------------------------------
1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
2 | #include "util/integer_to_string.hh"
3 | #include "util/string_piece.hh"
4 |
5 | #define BOOST_TEST_MODULE IntegerToStringTest
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | namespace util {
12 | namespace {
13 |
14 | template void TestValue(const T value) {  // Checks ToString(value) against boost::lexical_cast and against the declared buffer size.
15 | char buf[ToStringBuf::kBytes];
16 | StringPiece result(buf, ToString(value, buf) - buf);  // ToString returns the end pointer, so the difference is the length written.
17 | BOOST_REQUIRE_GE(static_cast(ToStringBuf::kBytes), result.size());  // The output must fit in the buffer size ToStringBuf declares.
18 | if (value) {
19 | BOOST_CHECK_EQUAL(boost::lexical_cast(value), result);
20 | } else {
21 | // Platforms can do void * as 0x0 or 0. Either spelling is accepted for any zero value.
22 | BOOST_CHECK(result == "0x0" || result == "0");
23 | }
24 | }
25 |
26 | template void TestCorners() {  // Runs TestValue on the boundary values of T.
27 | TestValue(std::numeric_limits::min());
28 | TestValue(std::numeric_limits::max());
29 | TestValue((T)0);
30 | TestValue((T)-1);  // For an unsigned T this cast wraps around to the largest value.
31 | TestValue((T)1);
32 | }
33 |
34 | BOOST_AUTO_TEST_CASE(Corners) {
35 | TestCorners();
36 | TestCorners();
37 | TestCorners();
38 | TestCorners();
39 | TestCorners();
40 | TestCorners();
41 | TestCorners