├── bench ├── clear_cache.sh ├── compare_files.py ├── convert_files.sh ├── test_disk.sh ├── convert_benchmark_files.sh ├── test_readcsv.R ├── run_experiment.sh ├── run_pyspark_experiment.sh ├── convert.py ├── test_readr.R ├── test_fread.R ├── convert_to_binary.py ├── compile_log_files.py └── generate_experiments.py ├── ci ├── osx_install.sh └── linux_install.sh ├── src ├── generic │ ├── encoding.hpp │ ├── parse_params.hpp │ ├── quote_adjustment_worker.hpp │ └── chunker.hpp ├── paratext_internal.hpp ├── util │ ├── unicode.hpp │ └── safe_string_output.hpp ├── csv │ ├── parallel.hpp │ ├── rowbased_loader.hpp │ ├── rowbased_worker.hpp │ ├── header_parser.hpp │ ├── colbased_chunk.hpp │ └── colbased_worker.hpp ├── paratext_internal.i ├── paratext_internal.cpp ├── diagnostic │ ├── memcopy.hpp │ ├── newline_counter.hpp │ └── parse_and_sum.hpp └── python │ ├── processor.hpp │ ├── python.i │ └── numpy_helper.hpp ├── .travis.yml ├── python ├── paratext │ ├── helpers.py │ ├── serial.py │ └── testing.py └── setup.py ├── tests └── test_paratext.py ├── LICENSE └── README.md /bench/clear_cache.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Ensure all writes are synced to disk. 
4 | sudo bash -c "sync || sync || sync || sync" 5 | # Clear the caches 6 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 7 | -------------------------------------------------------------------------------- /ci/osx_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | brew update 4 | 5 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 6 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh; 7 | else 8 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 9 | fi 10 | 11 | echo "clang++" > .cxx.choice 12 | -------------------------------------------------------------------------------- /bench/compare_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import paratext 4 | import json 5 | 6 | for fn in ["mnist8m", "mnist", "messy", "messy2", "car", "float1", "float2", "float3", "float4"]: 7 | if os.path.exists(fn + ".csv"): 8 | result = paratext.internal_compare(fn) 9 | fid = open(fn + "-compare.json", "w") 10 | json.dumps(fid) 11 | fid.close() 12 | -------------------------------------------------------------------------------- /ci/linux_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #apt-get update -qq 4 | #apt-get install -qq g++-4.8 5 | #add-apt-repository -y ppa:ubuntu-toolchain-r/test 6 | 7 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 8 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 9 | else 10 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 11 | fi 12 | 13 | echo "g++-4.8" > .cxx.choice 14 | 15 | -------------------------------------------------------------------------------- /bench/convert_files.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | 3 | # Convert original CSV files to different formats. 4 | 5 | echo mnist 6 | python convert.py mnist.csv mnist.feather mnist.hdf5 mnist.pkl mnist.npy 7 | echo messy 8 | python convert.py messy.csv messy.feather messy.pkl messy.npy 9 | echo messy2 10 | python convert.py messy2.csv messy2.feather messy2.pkl messy2.npy 11 | echo mnist8m 12 | python convert.py mnist8m.csv mnist8m.feather mnist8m.hdf5 mnist8m.pkl mnist8m.npy 13 | echo car 14 | python convert.py car.csv car.feather car.pkl car.npy 15 | echo floats 16 | python convert.py floats.csv floats.feather floats.pkl floats.npy floats.hdf5 17 | 18 | -------------------------------------------------------------------------------- /bench/test_disk.sh: -------------------------------------------------------------------------------- 1 | # Tests a filesystem's warm and cold performance over time. Run the directory where the device is mounted. 2 | # Throttled disks should result in variable results. Otherwise, the results should be closer to constant. 3 | 4 | device="$1" 5 | log="$2" 6 | 7 | for i in $(seq 1 1000); 8 | do 9 | disk_results="$(sudo bash -c "hdparm -Tt ${device} | grep Timing | sed -e 's/.*=//g' | sed -e 's/ MB\/sec//g'")" 10 | disk_A="$(echo $disk_results | cut -d' ' -f1)" 11 | disk_B="$(echo $disk_results | cut -d' ' -f2)" 12 | sudo bash -c "free && sync && echo 3 > /proc/sys/vm/drop_caches && free" 13 | run_experiment.py - cmd=countnl disk_state="cold" filename="floats.csv" diskA="$disk_A" diskB="$disk_B" log="$log" num_threads=32 14 | run_experiment.py - cmd=countnl disk_state="warm" filename="floats.csv" log="$log" num_threads=32 15 | done 16 | 17 | -------------------------------------------------------------------------------- /bench/convert_benchmark_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Convert original CSV files to different formats. 
4 | 5 | echo mnist 6 | python convert_to_binary.py mnist.csv 1 mnist.feather mnist.hdf5 mnist.pkl mnist.npy 7 | echo messy 8 | python convert_to_binary.py messy.csv 1 messy.feather messy.pkl messy.npy 9 | echo messy2 10 | python convert_to_binary.py messy2.csv 1 messy2.feather messy2.pkl messy2.npy 11 | echo mnist8m 12 | python convert_to_binary.py mnist8m.csv 1 mnist8m.feather mnist8m.hdf5 mnist8m.pkl mnist8m.npy 13 | echo car 14 | python convert_to_binary.py car.csv 1 car.feather car.pkl car.npy 15 | echo floats 16 | python convert_to_binary.py floats.csv 0 floats.feather floats.pkl floats.npy floats.hdf5 17 | python convert_to_binary.py floats2.csv 0 floats2.feather floats2.pkl floats2.npy floats2.hdf5 18 | python convert_to_binary.py floats3.csv 0 floats3.feather floats3.pkl floats3.npy floats3.hdf5 19 | python convert_to_binary.py floats4.csv 0 floats4.feather floats4.pkl floats4.npy floats4.hdf5 20 | 21 | -------------------------------------------------------------------------------- /bench/test_readcsv.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # test_readcsv.R in.csv out.json 4 | # 5 | # Loads the file in.csv into an R data frame. Sums its numeric 6 | # columns. Outputs the run times and memory usage to the JSON file 7 | # out.json. 
8 | 9 | require("rjson", quietly=TRUE) 10 | 11 | memory_usage <- function(){ 12 | return(strtoi(system(paste("ps -o rss ", Sys.getpid(), "| tail -1"), intern=TRUE))*1024) 13 | } 14 | 15 | args <- commandArgs(TRUE) 16 | filename <- args[1] 17 | result_filename <- args[2] 18 | load_tic <- Sys.time() 19 | df = read.csv(filename) 20 | load_toc <- Sys.time() 21 | load_time <- as.double(difftime(load_toc, load_tic, units="secs")) 22 | 23 | mem <- memory_usage() 24 | 25 | sum_tic <- Sys.time() 26 | s <- colSums(Filter(is.numeric, df)) 27 | s2 <- apply(Filter(function(x){!is.numeric(x)}, df), 2, function(x){sum(nchar(x))}) 28 | sum_toc <- Sys.time() 29 | sum_time <- as.double(difftime(sum_toc, sum_tic, units="secs")) 30 | 31 | results = list(cmd = "R-readcsv", load_time = load_time, mem = mem, sum_time = sum_time) 32 | json = rjson::toJSON(results) 33 | write(json, result_filename) 34 | -------------------------------------------------------------------------------- /bench/run_experiment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | json_file="${1:-}" 4 | num_trials="${2:-1}" 5 | did="${3:-normal}" 6 | log_path="${4:-normal}" 7 | 8 | if [ "$json_file" == "" ]; 9 | then 10 | echo "usage: run_experiment.sh json_filename [num_trials:1]" 11 | exit 1 12 | fi 13 | 14 | echo "Starting ${num_trials} trials on ${json_file}" 15 | 16 | if [ "$(grep warm $json_file | wc -l)" == "1" ] 17 | then 18 | echo warm: $json_file 19 | cat $json_file 20 | # First do a cold run and throw away the log. 
21 | sudo bash -c "sync || sync || sync || sync" 22 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 23 | run_experiment.py "$json_file" log="/dev/null" did="$did" 24 | 25 | # Now do x trials 26 | for trials in $(seq 1 $num_trials); do 27 | run_experiment.py "$json_file" did="$did" log_path="$log_path" 28 | done 29 | else 30 | echo cold: $json_file 31 | cat $json_file 32 | for trials in $(seq 1 $num_trials); do 33 | free 34 | sudo bash -c "sync || sync || sync || sync" 35 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 36 | free 37 | sleep 1 38 | run_experiment.py "$json_file" did="$did" log_path="$log_path" 39 | done 40 | fi 41 | -------------------------------------------------------------------------------- /src/generic/encoding.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef PARATEXT_ENCODING_HPP 29 | #define PARATEXT_ENCODING_HPP 30 | 31 | namespace ParaText { 32 | typedef enum {UNKNOWN_BYTES, UNICODE_UTF8, ASCII} Encoding; 33 | 34 | struct as_raw_bytes { 35 | std::string val; 36 | }; 37 | 38 | struct as_utf8 { 39 | std::string val; 40 | }; 41 | 42 | } 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /bench/run_pyspark_experiment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | json_file="${1:-}" 4 | num_trials="${2:-1}" 5 | did="${3:-normal}" 6 | log_path="${4:-normal}" 7 | 8 | if [ "$json_file" == "" ]; 9 | then 10 | echo "usage: run_experiment.sh json_filename [num_trials:1]" 11 | exit 1 12 | fi 13 | 14 | echo "Starting ${num_trials} trials on ${json_file}" 15 | 16 | SPARK_OPTIONS="--driver-memory 300G --executor-memory 300G --num-executors 32 --conf spark.driver.maxResultSize=10g --packages com.databricks:spark-csv_2.11:1.4.0" 17 | 18 | if [ "$(grep warm $json_file | wc -l)" == "1" ] 19 | then 20 | echo warm: $json_file 21 | cat $json_file 22 | # First do a cold run and throw away the log. 
23 | sudo bash -c "sync || sync || sync || sync" 24 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 25 | spark-submit $SPARK_OPTIONS $(which run_experiment.py) "$json_file" log="/dev/null" did="$did" 26 | 27 | # Now do x trials 28 | for trials in $(seq 1 $num_trials); do 29 | spark-submit $SPARK_OPTIONS $(which run_experiment.py) "$json_file" did="$did" log_path="$log_path" 30 | done 31 | else 32 | echo cold: $json_file 33 | cat $json_file 34 | for trials in $(seq 1 $num_trials); do 35 | free 36 | sudo bash -c "sync || sync || sync || sync" 37 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 38 | free 39 | sleep 1 40 | spark-submit $SPARK_OPTIONS $(which run_experiment.py) "$json_file" did="$did" log_path="$log_path" 41 | done 42 | fi 43 | 44 | -------------------------------------------------------------------------------- /src/paratext_internal.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Licensed to the Apache Software Foundation (ASF) under one 3 | or more contributor license agreements. See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. The ASF licenses this file 6 | to you under the Apache License, Version 2.0 (the 7 | "License"); you may not use this file except in compliance 8 | with the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, 13 | software distributed under the License is distributed on an 14 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | KIND, either express or implied. See the License for the 16 | specific language governing permissions and limitations 17 | under the License. 18 | 19 | Copyright (C) wise.io, Inc. 2016. 
20 | */ 21 | #ifndef PARATEXT_PARACSV_HPP 22 | #define PARATEXT_PARACSV_HPP 23 | 24 | #include "generic/parse_params.hpp" 25 | 26 | size_t get_num_cores(); 27 | 28 | std::string as_quoted_string(const std::string &s, bool do_not_escape_newlines = false); 29 | 30 | ParaText::as_raw_bytes get_random_string(size_t length, long seed, long min = 0, long max = 255); 31 | ParaText::as_utf8 get_random_string_utf8(size_t num_sequences, long seed, bool include_null = true); 32 | 33 | size_t get_string_length(const std::string &s); 34 | 35 | bool are_strings_equal(const std::string &x, const std::string &y); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c++ 2 | 3 | env: 4 | - PY_VERSION=2.7 5 | - PY_VERSION=3.5 6 | 7 | os: 8 | - linux 9 | - osx 10 | 11 | addons: 12 | apt: 13 | sources: 14 | - sourceline: 'ppa:ubuntu-toolchain-r/test' 15 | packages: 16 | - g++-4.8 17 | 18 | before_install: 19 | - echo "before_install" 20 | - echo $VIRTUAL_ENV 21 | - export PATH="$HOME/miniconda/bin:$PATH" 22 | - df -h 23 | - date 24 | - pwd 25 | - uname -a 26 | - python -V 27 | - which g++ 28 | - g++ --version 29 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo /bin/bash ci/osx_install.sh; fi 30 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo /bin/bash ci/linux_install.sh; fi 31 | 32 | # command to install dependencies 33 | install: 34 | - export CXX="$(cat .cxx.choice)" 35 | - ls $(dirname $(which g++))/g++* 36 | - echo "install" 37 | - bash miniconda.sh -b -p $HOME/miniconda 38 | - export PATH="$HOME/miniconda/bin:$PATH" 39 | - hash -r 40 | - conda config --set always_yes yes --set changeps1 no 41 | - conda update -q conda 42 | # Useful for debugging any issues with conda 43 | - conda info -a 44 | - conda install python=$PY_VERSION nose 45 | # Replace dep1 dep2 ... 
with your dependencies 46 | - conda create -q -n test-environment python=$PY_VERSION swig=3.0.8 pandas numpy 47 | - source activate test-environment 48 | - cd python 49 | - python setup.py build install 50 | - cd .. 51 | - pwd 52 | 53 | before_script: 54 | - export PY_PREFIX=$(python -c "import sys; print(sys.prefix)") 55 | - echo $PY_PREFIX 56 | - export PYTHONPATH=$PY_PREFIX/lib/python$PY_VERSION/site-packages:$PYTHONPATH 57 | - echo $PYTHONPATH 58 | 59 | # command to run tests 60 | script: nosetests -s --failure-detail --with-xunit tests/test_paratext.py 61 | -------------------------------------------------------------------------------- /bench/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas 4 | import pickle 5 | import feather 6 | import h5py 7 | import numpy as np 8 | import scipy.io as sio 9 | import os 10 | import sys 11 | 12 | def convert_feather(df, output_filename): 13 | feather.write_dataframe(df, output_filename) 14 | 15 | def convert_hdf5(df, output_filename): 16 | X = df.values 17 | f = h5py.File(output_filename, "w") 18 | ds=f.create_dataset("mydataset", X.shape, dtype=X.dtype) 19 | ds[...] 
= X 20 | 21 | def convert_npy(df, output_filename): 22 | X = df.values 23 | np.save(output_filename, X) 24 | 25 | def convert_pkl(df, output_filename): 26 | fid = open(output_filename, "wb") 27 | pickle.dump(df, fid) 28 | fid.close() 29 | 30 | def convert_mat(df, output_filename): 31 | dd = {key: df[key].values.flatten() for key in df.keys()} 32 | sio.savemat(output_filename, dd) 33 | 34 | input_filename = sys.argv[1] 35 | output_filenames = sys.argv[2:] 36 | 37 | if not input_filename.endswith(".csv"): 38 | print "input must be a CSV file (by extension)" 39 | sys.exit(1) 40 | 41 | df = paratext.load_csv_to_pandas(input_filename, allow_quoted_newlines=True) 42 | 43 | for output_filename in output_filenames: 44 | _, extension = os.path.splitext(output_filename) 45 | if extension == ".hdf5": 46 | convert_hdf5(df, output_filename) 47 | elif extension == ".feather": 48 | convert_feather(df, output_filename) 49 | elif extension == ".pkl": 50 | convert_pkl(df, output_filename) 51 | elif extension == ".npy": 52 | convert_npy(df, output_filename) 53 | elif extension == ".mat": 54 | convert_mat(df, output_filename) 55 | else: 56 | print "skipping '%s'; invalid output format '%s'" % (output_filename, extension) 57 | -------------------------------------------------------------------------------- /bench/test_readr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # test_datatable.R in.csv out.json 4 | # 5 | # Loads the file in.csv into an R data frame with fread, sums its numeric 6 | # columns, and outputs the run times and memory usage to the JSON file 7 | # out.json. 
8 | 9 | require("readr", quietly=TRUE) 10 | 11 | "OlsonNames" = function () 12 | { 13 | if (.Platform$OS.type == "windows") 14 | tzdir <- Sys.getenv("TZDIR", file.path(R.home("share"), 15 | "zoneinfo")) 16 | else { 17 | tzdirs <- c(Sys.getenv("TZDIR"), file.path(R.home("share"), 18 | "zoneinfo"), "/usr/share/zoneinfo", "/usr/share/lib/zoneinfo", 19 | "/usr/lib/zoneinfo", "/usr/local/etc/zoneinfo", "/etc/zoneinfo", 20 | "/usr/etc/zoneinfo") 21 | tzdirs <- tzdirs[file.exists(tzdirs)] 22 | if (!length(tzdirs)) { 23 | warning("no Olson database found") 24 | return(character()) 25 | } 26 | else tzdir <- tzdirs[1] 27 | } 28 | x <- list.files(tzdir, recursive = TRUE) 29 | grep("^[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", x, value = TRUE) 30 | } 31 | 32 | memory_usage <- function(){ 33 | return(strtoi(system(paste("ps -o rss ", Sys.getpid(), "| tail -1"), intern=TRUE))*1024) 34 | } 35 | 36 | args <- commandArgs(TRUE) 37 | filename <- args[1] 38 | result_filename <- args[2] 39 | load_tic <- Sys.time() 40 | df = read_csv(filename) 41 | load_toc <- Sys.time() 42 | load_time <- as.double(difftime(load_toc, load_tic, units="secs")) 43 | 44 | mem <- memory_usage() 45 | 46 | sum_tic <- Sys.time() 47 | s <- colSums(Filter(is.numeric, df)) 48 | s2 <- apply(Filter(function(x){!is.numeric(x)}, df), 2, function(x){sum(nchar(x))}) 49 | sum_toc <- Sys.time() 50 | sum_time <- as.double(difftime(sum_toc, sum_tic, units="secs")) 51 | 52 | results = list(cmd = "R-readr", load_time = load_time, mem = mem, sum_time = sum_time) 53 | json = rjson::toJSON(results) 54 | write(json, result_filename) 55 | -------------------------------------------------------------------------------- /bench/test_fread.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # test_fread.R in.csv out.json 4 | # 5 | # Loads the file in.csv into an R data frame with fread, sums its numeric 6 | # columns, and outputs the run times and memory usage to the JSON file 7 
| # out.json. 8 | 9 | require("data.table", quietly=TRUE) 10 | require("rjson", quietly=TRUE) 11 | 12 | "OlsonNames" = function () 13 | { 14 | if (.Platform$OS.type == "windows") 15 | tzdir <- Sys.getenv("TZDIR", file.path(R.home("share"), 16 | "zoneinfo")) 17 | else { 18 | tzdirs <- c(Sys.getenv("TZDIR"), file.path(R.home("share"), 19 | "zoneinfo"), "/usr/share/zoneinfo", "/usr/share/lib/zoneinfo", 20 | "/usr/lib/zoneinfo", "/usr/local/etc/zoneinfo", "/etc/zoneinfo", 21 | "/usr/etc/zoneinfo") 22 | tzdirs <- tzdirs[file.exists(tzdirs)] 23 | if (!length(tzdirs)) { 24 | warning("no Olson database found") 25 | return(character()) 26 | } 27 | else tzdir <- tzdirs[1] 28 | } 29 | x <- list.files(tzdir, recursive = TRUE) 30 | grep("^[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", x, value = TRUE) 31 | } 32 | 33 | memory_usage <- function(){ 34 | return(strtoi(system(paste("ps -o rss ", Sys.getpid(), "| tail -1"), intern=TRUE))*1024) 35 | } 36 | 37 | args <- commandArgs(TRUE) 38 | filename <- args[1] 39 | result_filename <- args[2] 40 | load_tic <- Sys.time() 41 | df = fread(filename) 42 | load_toc <- Sys.time() 43 | load_time <- as.double(difftime(load_toc, load_tic, units="secs")) 44 | 45 | mem <- memory_usage() 46 | 47 | sum_tic <- Sys.time() 48 | s <- colSums(Filter(is.numeric, df)) 49 | s <- s + apply(Filter(function(x){!is.numeric(x)}, df), 2, function(x){sum(nchar(x))}) 50 | sum_toc <- Sys.time() 51 | sum_time <- as.double(difftime(sum_toc, sum_tic, units="secs")) 52 | 53 | results = list(cmd = "R-fread", load_time = load_time, mem = mem, sum_time = sum_time) 54 | json = rjson::toJSON(results) 55 | write(json, result_filename) 56 | -------------------------------------------------------------------------------- /src/generic/parse_params.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 
5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef PARATEXT_PARSE_PARAMS_HPP 29 | #define PARATEXT_PARSE_PARAMS_HPP 30 | 31 | #include 32 | #include 33 | #include "generic/encoding.hpp" 34 | 35 | namespace ParaText { 36 | 37 | typedef enum {ROW_BASED, COL_BASED} ParserType; 38 | typedef enum {NONE, SNAPPY, MSGPACK} Compression; 39 | 40 | typedef enum {CATEGORICAL, NUMERIC, TEXT, UNKNOWN} Semantics; 41 | 42 | template 43 | struct TagEncoding {}; 44 | 45 | struct ColumnInfo { 46 | std::string name; 47 | Semantics semantics; 48 | }; 49 | 50 | struct ParseParams { 51 | ParseParams() : no_header(false), number_only(false), convert_null_to_space(true), block_size(32768), num_threads(16), allow_quoted_newlines(false), max_level_name_length(std::numeric_limits::max()), max_levels(std::numeric_limits::max()), compression(Compression::NONE), parser_type(ParserType::COL_BASED) {} 52 | bool no_header; 53 | bool number_only; 54 | bool compute_sum; 55 | bool convert_null_to_space; 56 | size_t block_size; 57 | size_t num_threads; 58 | bool allow_quoted_newlines; 59 | size_t max_level_name_length; 60 | size_t max_levels; 61 | Compression 
compression; 62 | ParserType parser_type; 63 | }; 64 | 65 | } 66 | #endif 67 | -------------------------------------------------------------------------------- /src/util/unicode.hpp: -------------------------------------------------------------------------------- 1 | #ifndef WISEIO_UNICODE_HPP 2 | #define WISEIO_UNICODE_HPP 3 | 4 | #define UNI_REPLACEMENT_CHAR (WUTF32)0x0000FFFD 5 | #define UNI_MAX_BMP (WUTF32)0x0000FFFF 6 | #define UNI_MAX_UTF16 (WUTF32)0x0010FFFF 7 | #define UNI_MAX_UTF32 (WUTF32)0x7FFFFFFF 8 | #define UNI_MAX_LEGAL_UTF32 (WUTF32)0x0010FFFF 9 | 10 | #define UNI_SUR_HIGH_START (WUTF32)0xD800 11 | #define UNI_SUR_HIGH_END (WUTF32)0xDBFF 12 | #define UNI_SUR_LOW_START (WUTF32)0xDC00 13 | #define UNI_SUR_LOW_END (WUTF32)0xDFFF 14 | 15 | namespace WiseIO { 16 | 17 | template 18 | int convert_utf32_to_utf8(InputIterator start, 19 | InputIterator end, 20 | OutputIterator out, 21 | bool strict = false) { 22 | 23 | typedef unsigned long WUTF32; 24 | typedef unsigned char WUTF8; 25 | static const WUTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 26 | int result = 0; 27 | unsigned char buf[4] = {0,0,0,0}; 28 | for (InputIterator it = start; it != end; it++) { 29 | WUTF32 ch = *it; 30 | unsigned short bytesToWrite = 0; 31 | const WUTF32 byteMask = 0xBF; 32 | const WUTF32 byteMark = 0x80; 33 | if (strict) { 34 | /* UTF-16 surrogate values are illegal in UTF-32 */ 35 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 36 | result = 1; 37 | break; 38 | } 39 | } 40 | /* 41 | * Figure out how many bytes the result will require. Turn any 42 | * illegally large UTF32 things (> Plane 17) into replacement chars. 
43 | */ 44 | if (ch < (WUTF32)0x80) { bytesToWrite = 1; 45 | } else if (ch < (WUTF32)0x800) { bytesToWrite = 2; 46 | } else if (ch < (WUTF32)0x10000) { bytesToWrite = 3; 47 | } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 48 | } else { bytesToWrite = 3; 49 | ch = UNI_REPLACEMENT_CHAR; 50 | result = 1; 51 | break; 52 | } 53 | switch (bytesToWrite) { /* note: everything falls through. */ 54 | case 4: buf[3] = (WUTF8)((ch | byteMark) & byteMask); ch >>= 6; 55 | case 3: buf[2] = (WUTF8)((ch | byteMark) & byteMask); ch >>= 6; 56 | case 2: buf[1] = (WUTF8)((ch | byteMark) & byteMask); ch >>= 6; 57 | case 1: buf[0] = (WUTF8) (ch | firstByteMark[bytesToWrite]); 58 | } 59 | for(int i = 0; i < bytesToWrite; i++) { 60 | *(out++) = buf[i]; 61 | } 62 | } 63 | return result; 64 | } 65 | } 66 | #endif 67 | -------------------------------------------------------------------------------- /python/paratext/helpers.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | # Copyright (C) Wise.io, Inc. 2016. 
19 | 20 | def make_messy_frame(num_rows, num_cols, num_cats, num_ints): 21 | fid = open("/etc/dictionaries-common/words") 22 | words=[line.strip() for line in fid.readlines()] 23 | perm = np.random.permutation(num_cols) 24 | num_catints = num_cats + num_ints 25 | float_ids = perm[num_catints:] 26 | int_ids = perm[num_cats:num_catints] 27 | cat_ids = perm[0:num_cats] 28 | d = {} 29 | dtypes = {} 30 | for col in cat_ids: 31 | X = np.zeros((num_rows,), dtype=np.object); 32 | for row in xrange(0, num_rows): 33 | num_newlines = np.random.randint(3,7) 34 | num_commas = np.random.randint(3,7) 35 | X[row] = "" 36 | tricky_delims = np.asarray(["\n"] * num_newlines + [","] * num_commas) 37 | np.random.shuffle(tricky_delims) 38 | for delim in tricky_delims: 39 | X[row] += string.join(random.sample(words, 5), ' ') 40 | X[row] += delim 41 | X[row] += string.join(random.sample(words, 5), ' ') 42 | d[col] = X 43 | dtypes[col] = 'string' 44 | for col in float_ids: 45 | d[col] = np.random.randn(num_rows) 46 | dtypes[col] = 'float' 47 | min_int = [0, -2**7, 0 , -2**15, 0, -2**31, 0, -2**62] 48 | max_int = [2**8, 2**7, 2**16, 2**15, 2**32, 2**31, 2**62, 2**62] 49 | dtypes_int = ["uint8", "int8", "uint16", "int16", "uint32", "int32", "uint64", "int64"] 50 | for col in int_ids: 51 | j = np.random.randint(0, len(min_int)) 52 | d[col] = np.random.randint(min_int[j], max_int[j], num_rows) 53 | dtypes[col] = dtypes_int[j] 54 | return d, dtypes 55 | -------------------------------------------------------------------------------- /src/csv/parallel.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | ParaText: parallel text reading 3 | Copyright (C) 2016. wise.io, Inc. 4 | 5 | Licensed to the Apache Software Foundation (ASF) under one 6 | or more contributor license agreements. See the NOTICE file 7 | distributed with this work for additional information 8 | regarding copyright ownership. 
The ASF licenses this file 9 | to you under the Apache License, Version 2.0 (the 10 | "License"); you may not use this file except in compliance 11 | with the License. You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, 16 | software distributed under the License is distributed on an 17 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | KIND, either express or implied. See the License for the 19 | specific language governing permissions and limitations 20 | under the License. 21 | */ 22 | 23 | /* 24 | These functions by Guillem Blanco are taken from WiseML. 25 | */ 26 | 27 | #ifndef PARATEXT_PARALLEL_HPP 28 | #define PARATEXT_PARALLEL_HPP 29 | 30 | #include 31 | 32 | /* 33 | * Same as std::for_each but fun must have signature: void f(Iterator). 34 | */ 35 | template 36 | inline F for_each_it(Iterator first, Iterator last, F &&fun) { 37 | for (; first != last; ++first) 38 | fun(first); 39 | return std::move(fun); 40 | } 41 | 42 | /* 43 | * Distributes the application of F in [first, last) among different threads. 44 | */ 45 | template 46 | F parallel_for_each(Iterator first, Iterator last, size_t suggested_num_threads, F &&f) { 47 | using namespace std::placeholders; 48 | 49 | const std::size_t num_elements = std::distance(first, last); 50 | if (num_elements == 0) { 51 | return std::move(f); 52 | } 53 | const size_t num_threads = 54 | std::min(std::max(1UL, suggested_num_threads), num_elements); 55 | const std::size_t elements_thread = num_elements / num_threads; 56 | const std::size_t excess = num_elements % num_threads; 57 | 58 | /* Thread pool */ 59 | std::vector thread_pool; 60 | thread_pool.reserve(num_threads); 61 | 62 | /* Spawn threads */ 63 | Iterator it = first; 64 | for (std::size_t thread_id = 0; thread_id < num_threads; ++thread_id) { 65 | const std::size_t step = elements_thread + (thread_id < excess ? 
1 : 0); 66 | thread_pool 67 | .emplace_back([ it, step, thread_id, f = std::forward(f) ]() { 68 | for_each_it(it, it + step, std::bind(f, _1, thread_id)); 69 | }); 70 | it += step; 71 | } 72 | 73 | /* Join threads */ 74 | for (auto &&thread : thread_pool) { 75 | thread.join(); 76 | } 77 | return std::move(f); 78 | } 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /bench/convert_to_binary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | # Copyright (C) Wise.io, Inc. 2016. 21 | 22 | import pandas 23 | import pickle 24 | import feather 25 | import h5py 26 | import numpy as np 27 | import scipy.io as sio 28 | import os 29 | import sys 30 | 31 | def convert_feather(df, output_filename): 32 | feather.write_dataframe(df, output_filename) 33 | 34 | def convert_hdf5(df, output_filename): 35 | X = df.values 36 | f = h5py.File(output_filename, "w") 37 | ds=f.create_dataset("mydataset", X.shape, dtype=X.dtype) 38 | ds[...] 
import pandas
import pickle
import numpy as np
import os
import sys


def convert_feather(df, output_filename):
    """Write `df` to `output_filename` in Feather format."""
    # Imported lazily so the script is usable without every optional backend.
    import feather
    feather.write_dataframe(df, output_filename)


def convert_hdf5(df, output_filename):
    """Write the values of `df` to an HDF5 file as dataset "mydataset"."""
    import h5py
    X = df.values
    f = h5py.File(output_filename, "w")
    try:
        ds = f.create_dataset("mydataset", X.shape, dtype=X.dtype)
        ds[...] = X
    finally:
        # The original never closed the handle, so data could stay unflushed.
        f.close()


def convert_npy(df, output_filename):
    """Write the values of `df` as a NumPy .npy array."""
    np.save(output_filename, df.values)


def convert_pkl(df, output_filename):
    """Pickle the DataFrame itself (not just its values)."""
    # `with` guarantees the file is closed even if pickling fails.
    with open(output_filename, "wb") as fid:
        pickle.dump(df, fid)


def convert_mat(df, output_filename):
    """Write each column of `df` as a flattened variable in a MATLAB file."""
    import scipy.io as sio
    dd = {key: df[key].values.flatten() for key in df.keys()}
    sio.savemat(output_filename, dd)


# Maps output-file extension -> converter function; used by main().
_CONVERTERS = {
    ".hdf5": convert_hdf5,
    ".feather": convert_feather,
    ".pkl": convert_pkl,
    ".npy": convert_npy,
    ".mat": convert_mat,
}


def main():
    """CLI: convert_to_binary.py input.csv has_header out1 [out2 ...]"""
    input_filename = sys.argv[1]
    has_header = int(sys.argv[2])
    output_filenames = sys.argv[3:]

    if not input_filename.endswith(".csv"):
        # print() call syntax (not the Python 2 statement) keeps this 2/3
        # compatible; the original used `print "..."` which is a syntax
        # error under Python 3.
        print("input must be a CSV file (by extension)")
        sys.exit(1)

    if has_header:
        df = pandas.read_csv(input_filename)
    else:
        df = pandas.read_csv(input_filename, header=None)

    for output_filename in output_filenames:
        _, extension = os.path.splitext(output_filename)
        converter = _CONVERTERS.get(extension)
        if converter is None:
            print("skipping '%s'; invalid output format '%s'"
                  % (output_filename, extension))
        else:
            converter(df, output_filename)


if __name__ == "__main__":
    main()
You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | %module paratext_internal 29 | 30 | #if defined(SWIGPYTHON) 31 | %include "python/python.i" 32 | #else 33 | #warning "no SWIG typemaps defined for the target language" 34 | #endif 35 | 36 | //%include "std_string.i" 37 | %include "std_vector.i" 38 | %include "std_pair.i" 39 | 40 | %ignore ParaText::CSV::ColBasedPopulator::get_type_index() const; 41 | %ignore ParaText::CSV::StringVectorPopulator::get_type_index() const; 42 | %ignore ParaText::CSV::ColBasedLoader::get_type_index(size_t) const; 43 | %ignore ParaText::CSV::ColBasedIterator::operator++(); 44 | %ignore ParaText::CSV::ColBasedIterator::operator++(int); 45 | 46 | namespace std { 47 | %template(vectori) std::vector; 48 | } 49 | 50 | ///////// Generic Header 51 | %include "paratext_internal.hpp" 52 | %{ 53 | #include "paratext_internal.hpp" 54 | %} 55 | 56 | ///////// Parsing Parameters 57 | %include "generic/parse_params.hpp" 58 | %{ 59 | #include "generic/parse_params.hpp" 60 | %} 61 | 62 | %include "generic/encoding.hpp" 63 | %{ 64 | #include "generic/encoding.hpp" 65 | %} 66 | 67 | //////// CSV-loading Stuff 68 | 69 | %include "csv/colbased_loader.hpp" 70 | %{ 71 | #include "csv/colbased_loader.hpp" 72 | %} 73 | 74 | %include "diagnostic/memcopy.hpp" 75 | %{ 76 | #include "diagnostic/memcopy.hpp" 77 | %} 78 | 79 | %include "diagnostic/newline_counter.hpp" 80 | %{ 81 | #include "diagnostic/newline_counter.hpp" 82 | %} 83 | 84 | %include "diagnostic/parse_and_sum.hpp" 85 | %{ 86 | #include 
"diagnostic/parse_and_sum.hpp" 87 | %} 88 | 89 | %include "util/safe_string_output.hpp" 90 | %{ 91 | #include "util/safe_string_output.hpp" 92 | %} 93 | 94 | #if defined(PARATEXT_ROWBASED_CSV) 95 | %include "csv/rowbased_loader.hpp" 96 | %{ 97 | #include "csv/rowbased_loader.hpp" 98 | %} 99 | #endif 100 | -------------------------------------------------------------------------------- /src/paratext_internal.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Licensed to the Apache Software Foundation (ASF) under one 3 | or more contributor license agreements. See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. The ASF licenses this file 6 | to you under the Apache License, Version 2.0 (the 7 | "License"); you may not use this file except in compliance 8 | with the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, 13 | software distributed under the License is distributed on an 14 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | KIND, either express or implied. See the License for the 16 | specific language governing permissions and limitations 17 | under the License. 
18 | */ 19 | 20 | #include "paratext_internal.hpp" 21 | #include "util/strings.hpp" 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | //#include 29 | //#include 30 | 31 | size_t get_num_cores() { 32 | return std::thread::hardware_concurrency(); 33 | } 34 | 35 | std::string as_quoted_string(const std::string &s, bool do_not_escape_newlines) { 36 | return get_quoted_string(s.begin(), s.end(), true, do_not_escape_newlines); 37 | } 38 | 39 | ParaText::as_raw_bytes get_random_string(size_t length, long seed, long min, long max) { 40 | std::string output; 41 | if (seed == 0) { 42 | seed = std::chrono::system_clock::now().time_since_epoch().count(); 43 | } 44 | std::default_random_engine e1(seed); 45 | std::uniform_int_distribution byte_range(min, max); 46 | for (size_t i = 0; i < length; i++) { 47 | output.push_back(byte_range(e1)); 48 | } 49 | ParaText::as_raw_bytes retval; 50 | retval.val = output; 51 | return retval; 52 | } 53 | 54 | ParaText::as_utf8 get_random_string_utf8(size_t num_sequences, long seed, bool include_null) { 55 | std::string output; 56 | if (seed == 0) { 57 | seed = std::chrono::system_clock::now().time_since_epoch().count(); 58 | } 59 | std::default_random_engine e1(seed); 60 | unsigned long surrogate_range = 2048; 61 | std::uniform_int_distribution codepoint_range(include_null ? 
0 : 1, 0x10FFFF - surrogate_range); 62 | std::vector seq; 63 | for (size_t i = 0; i < num_sequences; i++) { 64 | unsigned long val = codepoint_range(e1); 65 | if (val >= 0xD800) { 66 | val += surrogate_range; 67 | } 68 | seq.push_back(val); 69 | } 70 | WiseIO::convert_utf32_to_utf8(seq.begin(), seq.end(), std::back_inserter(output)); 71 | ParaText::as_utf8 retval; 72 | retval.val = output; 73 | return retval; 74 | } 75 | 76 | bool are_strings_equal(const std::string &x, const std::string &y) { 77 | return x == y; 78 | } 79 | 80 | size_t get_string_length(const std::string &s) { 81 | return s.size(); 82 | } 83 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | 2 | import sys, os, os.path, string, subprocess 3 | import json 4 | 5 | # First, check for the presence of swig, which we will need to build 6 | # the Python bindings. 7 | p = subprocess.Popen(["which", "swig"]) 8 | p.communicate("") 9 | if p.returncode != 0: 10 | print("Error: you must install SWIG first.") 11 | sys.exit(1) 12 | 13 | # The multi-threaded reader will core dump unless -pthread is given. 
extra_link_args = []
extra_compile_args = ["-std=c++11", "-Wall", "-Wextra", "-pthread"]
extra_libraries = []

if sys.platform == 'darwin':
    extra_compile_args += ["-m64", "-D_REENTRANT"]
    extra_link_args += []
    extra_libraries += []
elif sys.platform.startswith("linux"):
    extra_compile_args += []
    extra_link_args += []
    extra_libraries += []

# Only pull in setuptools when a setuptools-specific command was requested;
# otherwise plain distutils semantics are kept.
if len(set(('develop', 'release', 'bdist_egg', 'bdist_rpm',
            'bdist_wininst', 'install_egg_info', 'build_sphinx',
            'egg_info', 'easy_install', 'upload',
            )).intersection(sys.argv)) > 0:
    import setuptools
    extra_setuptools_args = dict(
        zip_safe=False,  # the package can run out of an .egg file
    )
else:
    extra_setuptools_args = dict()

from numpy.distutils.core import setup, Extension

version = "0.2.1rc1"

# Generate paratext/__init__.py with the release version baked in.
# `with` guarantees the handle is closed (the original leaked it on error).
with open("paratext/__init__.py", "w") as init_py:
    init_py.write("""
__all__ = ['paratext']

from paratext.core import *

import paratext_internal
import warnings

__version__ = "%s"
""" % version)


print(version)

# Run SWIG to generate the C++ wrapper and the Python proxy module.
swig_cmd = ["swig", "-c++", "-python"]

if sys.version_info >= (3,):
    swig_cmd += ["-py3"]

swig_cmd += ["-I../src/", "-outdir", "./", "../src/paratext_internal.i"]

print("running swig: ", swig_cmd)
p = subprocess.Popen(swig_cmd)
# No stdin pipe was requested, so pass no input to communicate().
p.communicate()
if p.returncode != 0:
    print("Error generating SWIG wrappers.")
    sys.exit(1)

setup(name='paratext',
      version=version,
      description='Reads text files in parallel. The first release includes a parallel CSV reader.',
      long_description="""
See README
""",
      keywords=['csv', 'reading'],
      ext_modules=[Extension('_paratext_internal',
                             ['../src/paratext_internal_wrap.cxx', '../src/paratext_internal.cpp'],
                             extra_link_args=extra_link_args,
                             extra_compile_args=extra_compile_args,
                             include_dirs=['../src/'],
                             libraries=["stdc++"] + extra_libraries),
                   ],
      py_modules=["paratext_internal"],
      author="Damian Eads",
      author_email="damian@wise.io",
      license="Apache License",
      packages=['paratext'],
      url='http://wise.io',
      include_package_data=True,
      **extra_setuptools_args
      )

#!/usr/bin/env python


# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Copyright (C) Wise.io, Inc. 2016.
22 | 23 | # Usage: compile_log_files.py dir1 dir2 dir3 dir4 dir5 24 | 25 | import numpy as np 26 | import seaborn 27 | import pandas 28 | import sys 29 | import os 30 | import json 31 | 32 | # Compiles the log files into a single log. 33 | 34 | def get_dataset_key(fn): 35 | base_fn = os.path.basename(fn) 36 | base_fn = base_fn[:base_fn.find(".")] 37 | if base_fn == "car-pyspark": 38 | return "car" 39 | else: 40 | return base_fn 41 | 42 | # The log directory X where the result files are stored. 43 | # 44 | # 1. It searches directories: X/cmdname/*.log for log files. 45 | # 46 | # 2. It outputs CSV files: log-X.csv 47 | # 48 | 49 | if len(sys.argv) > 2: 50 | for bench_name in ["avgcols", "countnl", "cPickle", "disk-to-mem", "feather", "hdf5", "noop", "npy", "numpy", "pandas", "paratext", "pickle", "pyspark", "R-readcsv", "R-fread", "R-readr", "sframe"]: 51 | df = pandas.DataFrame() 52 | for bench_dir in sys.argv[1:]: 53 | bench_subdir = os.path.join(bench_dir, bench_name) 54 | print bench_subdir 55 | if not os.path.exists(bench_subdir): 56 | continue 57 | bench_files = os.listdir(bench_subdir) 58 | for filename in bench_files: 59 | fn = os.path.join(bench_subdir, filename) 60 | print "opening ", fn 61 | bench_json = json.load(open(fn)) 62 | log = bench_json["log"] 63 | mini_df = pandas.DataFrame() 64 | for i in xrange(0, len(log)): 65 | for key in log[i].keys(): 66 | if log[i][key] == '?': 67 | log[i][key] = None 68 | mini_df = mini_df.append(log[i], ignore_index = True) 69 | mini_df["log_key"] = filename.replace(".log","") 70 | df = df.append(mini_df) 71 | if bench_name in ["R-readcsv", "R-fread", "R-readr"]: 72 | df["mem"] = df["mem"] / 1000000 73 | if "filename" in df.keys(): 74 | df["ds"] = df["filename"].apply(get_dataset_key) 75 | else: 76 | df["ds"] = '?' 
77 | df.to_csv("log-" + bench_name + ".csv", index=False) 78 | else: 79 | print "usage: gen_plot_files.py [log_dir1] [log_dir2]" 80 | -------------------------------------------------------------------------------- /src/csv/rowbased_loader.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef PARATEXT_ROW_BASED_LOADER_HPP 29 | #define PARATEXT_ROW_BASED_LOADER_HPP 30 | 31 | #include "parse_params.hpp" 32 | #include "rowbased_worker.hpp" 33 | #include "chunker.hpp" 34 | #include "header_parser.hpp" 35 | 36 | namespace ParaText { 37 | 38 | namespace CSV { 39 | 40 | class RowBasedLoader { 41 | public: 42 | RowBasedLoader() : length_(0) {} 43 | 44 | void load(const std::string &filename, const ParseParams ¶ms) { 45 | header_parser_.open(filename, params.no_header); 46 | struct stat fs; 47 | if (stat(filename.c_str(), &fs) == -1) { 48 | throw std::logic_error("cannot stat file"); 49 | } 50 | length_ = fs.st_size; 51 | column_infos_.resize(header_parser_.get_num_columns()); 52 | for (size_t i = 0; i < column_infos_.size(); i++) { 53 | column_infos_[i].name = header_parser_.get_column_name(i); 54 | } 55 | if (header_parser_.has_header()) { 56 | chunker_.process(filename, header_parser_.get_end_of_header()+1, params.num_threads, params.allow_quoted_newlines); 57 | } 58 | else { 59 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 60 | } 61 | std::vector threads; 62 | std::vector > workers; 63 | for (size_t worker_id = 0; worker_id < params.num_threads; worker_id++) { 64 | long start_of_chunk, end_of_chunk = 0; 65 | std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 66 | 67 | /* If the chunk was eliminated because its entirety represents quoted 68 | text, do not spawn a worker thread for it. 
*/ 69 | if (start_of_chunk < 0 || end_of_chunk < 0) { 70 | continue; 71 | } 72 | workers.push_back(std::make_shared(start_of_chunk, end_of_chunk, length_, params.block_size, params.compression == Compression::SNAPPY)); 73 | threads.emplace_back(&RowBasedParseWorker::parse, 74 | workers.back(), 75 | filename); 76 | start_of_chunk = end_of_chunk; 77 | } 78 | for (size_t i = 0; i < threads.size(); i++) { 79 | threads[i].join(); 80 | } 81 | } 82 | 83 | /* 84 | Returns the number of columns parsed by this loader. 85 | */ 86 | size_t get_num_columns() const { 87 | return column_infos_.size(); 88 | } 89 | 90 | /* 91 | Returns the info about the column. 92 | */ 93 | ParaText::ColumnInfo get_column_info(size_t column_index) const { 94 | return column_infos_[column_index]; 95 | } 96 | 97 | /* 98 | Returns the categorical levels. 99 | */ 100 | const std::vector &get_levels(size_t column_index) const { 101 | std::cout << level_names_[column_index].size(); 102 | return level_names_[column_index]; 103 | } 104 | 105 | size_t size() const { 106 | return size_.back(); 107 | } 108 | 109 | private: 110 | size_t length_; 111 | mutable std::vector > level_ids_; 112 | mutable std::vector > level_names_; 113 | std::vector size_; 114 | std::vector column_infos_; 115 | TextChunker chunker_; 116 | HeaderParser header_parser_; 117 | }; 118 | } 119 | } 120 | #endif 121 | -------------------------------------------------------------------------------- /src/diagnostic/memcopy.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. 
The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef PARATEXT_DIAGNOSTIC_MEM_COPY_HPP 29 | #define PARATEXT_DIAGNOSTIC_MEM_COPY_HPP 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "generic/chunker.hpp" 41 | #include "csv/header_parser.hpp" 42 | 43 | namespace ParaText { 44 | 45 | namespace Diagnostic { 46 | 47 | class MemCopyWorker { 48 | public: 49 | MemCopyWorker(size_t chunk_start, size_t chunk_end, size_t block_size) 50 | : chunk_start_(chunk_start), 51 | chunk_end_(chunk_end), 52 | block_size_(block_size) {} 53 | 54 | virtual ~MemCopyWorker() {} 55 | 56 | void parse(const std::string &filename) { 57 | try { 58 | parse_impl(filename); 59 | } 60 | catch (...) 
{ 61 | thread_exception_ = std::current_exception(); 62 | } 63 | } 64 | 65 | std::exception_ptr get_exception() { 66 | return thread_exception_; 67 | } 68 | 69 | void parse_impl(const std::string &filename) { 70 | std::ifstream in; 71 | in.open(filename.c_str()); 72 | const size_t block_size = block_size_; 73 | char buf[block_size]; 74 | in.seekg(chunk_start_, std::ios_base::beg); 75 | size_t current = chunk_start_; 76 | while (current <= chunk_end_) { 77 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 78 | size_t nread = in.gcount(); 79 | if (nread == 0) { 80 | break; 81 | } 82 | data_.insert(data_.begin(), buf + 0, buf + nread); 83 | current += nread; 84 | } 85 | } 86 | 87 | private: 88 | size_t chunk_start_; 89 | size_t chunk_end_; 90 | size_t block_size_; 91 | std::vector data_; 92 | std::exception_ptr thread_exception_; 93 | }; 94 | 95 | class MemCopyBaseline { 96 | public: 97 | MemCopyBaseline() {} 98 | 99 | virtual ~MemCopyBaseline() {} 100 | 101 | void load(const std::string &filename, const ParseParams ¶ms) { 102 | std::vector threads; 103 | std::vector > workers; 104 | header_parser_.open(filename, params.no_header); 105 | std::exception_ptr thread_exception; 106 | if (header_parser_.has_header()) { 107 | chunker_.process(filename, header_parser_.get_end_of_header()+1, params.num_threads, params.allow_quoted_newlines); 108 | } 109 | else { 110 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 111 | } 112 | for (size_t worker_id = 0; worker_id < chunker_.num_chunks(); worker_id++) { 113 | size_t start_of_chunk = 0, end_of_chunk = 0; 114 | std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 115 | 116 | if (start_of_chunk == end_of_chunk) { 117 | continue; 118 | } 119 | workers.push_back(std::make_shared(start_of_chunk, end_of_chunk, params.block_size)); 120 | threads.emplace_back(&MemCopyWorker::parse, 121 | workers.back(), 122 | filename); 123 | } 124 | 125 | for (size_t i = 0; i < 
threads.size(); i++) { 126 | threads[i].join(); 127 | if (!thread_exception) { 128 | thread_exception = workers[i]->get_exception(); 129 | } 130 | } 131 | // We're now outside the parallel region. 132 | if (thread_exception) { 133 | std::rethrow_exception(thread_exception); 134 | } 135 | } 136 | 137 | private: 138 | CSV::HeaderParser header_parser_; 139 | TextChunker chunker_; 140 | }; 141 | } 142 | } 143 | #endif 144 | -------------------------------------------------------------------------------- /src/diagnostic/newline_counter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef PARATEXT_DIAGNOSTIC_NL_COUNTER_HPP 29 | #define PARATEXT_DIAGNOSTIC_NL_COUNTER_HPP 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "generic/chunker.hpp" 41 | #include "csv/header_parser.hpp" 42 | 43 | namespace ParaText { 44 | 45 | namespace Diagnostic { 46 | 47 | class NewlineCountWorker { 48 | public: 49 | NewlineCountWorker(size_t chunk_start, size_t chunk_end, size_t block_size) 50 | : chunk_start_(chunk_start), 51 | chunk_end_(chunk_end), 52 | block_size_(block_size) {} 53 | 54 | virtual ~NewlineCountWorker() {} 55 | 56 | void parse(const std::string &filename) { 57 | try { 58 | parse_impl(filename); 59 | } 60 | catch (...) { 61 | thread_exception_ = std::current_exception(); 62 | } 63 | } 64 | 65 | std::exception_ptr get_exception() { 66 | return thread_exception_; 67 | } 68 | 69 | void parse_impl(const std::string &filename) { 70 | std::ifstream in; 71 | in.open(filename.c_str()); 72 | const size_t block_size = block_size_; 73 | char buf[block_size]; 74 | in.seekg(chunk_start_, std::ios_base::beg); 75 | size_t current = chunk_start_; 76 | num_newlines_ = 0; 77 | while (current <= chunk_end_) { 78 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 79 | size_t nread = in.gcount(); 80 | if (nread == 0) { 81 | break; 82 | } 83 | for (size_t i = 0; i < nread; i++) { 84 | if (buf[i] == '\n') { 85 | num_newlines_++; 86 | } 87 | } 88 | current += nread; 89 | } 90 | } 91 | 92 | size_t get_num_newlines() const { 93 | return num_newlines_; 94 | } 95 | 96 | private: 97 | size_t chunk_start_; 98 | size_t chunk_end_; 99 | size_t block_size_; 100 | size_t num_newlines_; 101 | std::exception_ptr thread_exception_; 102 | }; 103 | 104 | class NewlineCounter { 105 | public: 106 | NewlineCounter() {} 107 | 108 | virtual ~NewlineCounter() {} 109 | 110 | size_t load(const std::string &filename, const ParseParams ¶ms) { 111 | std::vector threads; 112 | std::vector 
> workers; 113 | std::exception_ptr thread_exception; 114 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 115 | for (size_t worker_id = 0; worker_id < chunker_.num_chunks(); worker_id++) { 116 | long start_of_chunk = 0, end_of_chunk = 0; 117 | std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 118 | if (start_of_chunk < 0 || end_of_chunk < 0) { 119 | continue; 120 | } 121 | workers.push_back(std::make_shared(start_of_chunk, end_of_chunk, params.block_size)); 122 | threads.emplace_back(&NewlineCountWorker::parse, 123 | workers.back(), 124 | filename); 125 | } 126 | 127 | for (size_t i = 0; i < threads.size(); i++) { 128 | threads[i].join(); 129 | if (!thread_exception) { 130 | thread_exception = workers[i]->get_exception(); 131 | } 132 | } 133 | // We're now outside the parallel region. 134 | if (thread_exception) { 135 | std::rethrow_exception(thread_exception); 136 | } 137 | size_t newline_count = 0; 138 | for (size_t i = 0; i < workers.size(); i++) { 139 | newline_count += workers[i]->get_num_newlines(); 140 | } 141 | return newline_count; 142 | } 143 | 144 | private: 145 | TextChunker chunker_; 146 | }; 147 | } 148 | } 149 | #endif 150 | -------------------------------------------------------------------------------- /src/python/processor.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | File: processor.bhpp 3 | 4 | Author: Damian Eads, PhD 5 | 6 | Copyright (C) wise.io, Inc. 2015. 7 | */ 8 | 9 | #ifndef WISEIO_PROCESSOR_HPP 10 | #define WISEIO_PROCESSOR_HPP 11 | 12 | #include 13 | #include 14 | 15 | #ifdef PARATEXT_DATE_TIME 16 | #include 17 | #endif 18 | 19 | namespace ParaText { 20 | 21 | /* 22 | A generic call-back interface for processing a sequence of 23 | variably-typed objects coming from a different language. 24 | 25 | For example, if a C++ functor requires a sequence, it 26 | can implement the interface of this class. 
27 | 28 | WiseTransfer will iterate over the array, list, tuple, 29 | iterable object, or sequence object. When an element of a 30 | string is found, it calls process_string. If it is floating 31 | point, process_float is called. If it is a long integer, 32 | process_long is called. 33 | */ 34 | class CallbackProcessor { 35 | public: 36 | 37 | /* 38 | The base constructor. Does nothing in this part of sub-object 39 | construction. 40 | */ 41 | CallbackProcessor(); 42 | 43 | /* 44 | The destructor deletes this callback processor and deallocates 45 | any temporary resources needed. 46 | */ 47 | virtual ~CallbackProcessor(); 48 | 49 | /* 50 | Tells the functor to ingest the next element, which is a string. 51 | */ 52 | virtual void process_string(const char *begin, const char *end) = 0; 53 | 54 | /* 55 | Tells the functor to ingest the next element, which is a float. 56 | */ 57 | virtual void process_float(float fval) = 0; 58 | 59 | /* 60 | Tells the functor to ingest the next element, which is a long. 61 | */ 62 | virtual void process_long(long lval) = 0; 63 | 64 | /* 65 | Tells the functor to ingest the next element, which is a bool. 66 | */ 67 | virtual void process_bool(bool bval) = 0; 68 | 69 | /* 70 | Asks the functor to translate an exception thrown while calling 71 | one of the process_XXX methods into a string. 72 | */ 73 | virtual void process_exception(std::exception_ptr ptr, std::string &text) = 0; 74 | 75 | /* 76 | Process the next sparse value. 77 | */ 78 | virtual void process_sparse(size_t row_index, size_t col_index, float value) = 0; 79 | 80 | 81 | /* 82 | Process an empty sparse row. 83 | */ 84 | virtual void process_sparse(size_t row_index) = 0; 85 | }; 86 | 87 | /* 88 | An enumerated type for identifying the type of element in an 89 | IteratorProcessor. 
90 | */ 91 | enum class IteratorElementType {STRING, LONG, BOOL, FLOAT, DATETIME}; 92 | 93 | /* 94 | An IteratorProcessor (iterproc for short) generic interface for 95 | manipulating an iterator over primitive types in another language. 96 | 97 | An iterproc X can be queried if there are any more elements remaining 98 | as follows:: 99 | 100 | while (X.has_next()) { 101 | switch (X.get_type()) { 102 | case IteratorElementType::STRING: 103 | ... 104 | break; 105 | } 106 | X.advance(); 107 | } 108 | 109 | The get_type() function returns the type of the current element. 110 | The advance() function advances the iterator to the next element. 111 | The element that the iterator is currently pointing to can be 112 | retrieved with: 113 | 114 | X.get_string() 115 | X.get_float() 116 | X.get_long() 117 | X.get_bool() 118 | 119 | 120 | */ 121 | class IteratorProcessor { 122 | public: 123 | /* 124 | The base constructor for an IteratorProcessor. This part of 125 | the sub-object construction does nothing. 126 | */ 127 | IteratorProcessor() {} 128 | 129 | /* 130 | A virtual destructor for the iterator processor. 131 | */ 132 | virtual ~IteratorProcessor() {} 133 | 134 | /* 135 | The type of the element to which the iterator currently points. 136 | */ 137 | virtual IteratorElementType get_type() const = 0; 138 | 139 | /* 140 | Retrieves a string representation of the current element. 141 | */ 142 | virtual std::string get_string() const = 0; 143 | 144 | /* 145 | Retrieves a float at the current element. 146 | */ 147 | virtual double get_float() const = 0; 148 | 149 | /* 150 | Retrieves a long at the current element. 151 | */ 152 | virtual long get_long() const = 0; 153 | 154 | #ifdef PARATEXT_DATE_TIME 155 | /* 156 | Retrieves a date time at the current element. 157 | */ 158 | virtual boost::posix_time::ptime get_datetime() const = 0; 159 | #endif 160 | 161 | /* 162 | Retrieves a bool at the current element. 
163 | */ 164 | virtual bool get_bool() const = 0; 165 | 166 | /* 167 | Returns true if and only if this iterator has another element 168 | past the current element. 169 | */ 170 | virtual bool has_next() const = 0; 171 | 172 | /* 173 | Advances the iterator to the next element. 174 | */ 175 | virtual void advance() = 0; 176 | 177 | /* 178 | Returns the number of elements this iterator processes. If this 179 | number is not known, std::numeric_limits::max() is used 180 | instead. 181 | */ 182 | virtual size_t size() const = 0; 183 | }; 184 | } 185 | #endif 186 | -------------------------------------------------------------------------------- /src/python/python.i: -------------------------------------------------------------------------------- 1 | /* 2 | ParaText: parallel text reading 3 | Copyright (C) 2016. wise.io, Inc. 4 | 5 | Licensed to the Apache Software Foundation (ASF) under one 6 | or more contributor license agreements. See the NOTICE file 7 | distributed with this work for additional information 8 | regarding copyright ownership. The ASF licenses this file 9 | to you under the Apache License, Version 2.0 (the 10 | "License"); you may not use this file except in compliance 11 | with the License. You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, 16 | software distributed under the License is distributed on an 17 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | KIND, either express or implied. See the License for the 19 | specific language governing permissions and limitations 20 | under the License. 21 | */ 22 | 23 | /* 24 | Coder: Damian Eads. 
*/

%init %{
import_array();
%}

/*
  Exception-translation macros for use inside hand-written typemaps; they
  mirror the %exception handler below so typemap code and wrapped calls
  report errors to Python identically.
*/
#define PARATEXT_TYPEMAP_EXCEPTION_START try {

#define PARATEXT_TYPEMAP_EXCEPTION_END } catch (const std::string &e) {\
    std::string s = e;\
    SWIG_exception(SWIG_RuntimeError, s.c_str());\
    SWIG_fail;\
  } catch (const std::exception &e) {\
    SWIG_exception(SWIG_RuntimeError, e.what());\
    SWIG_fail;\
  } catch (const char *emsg) {\
    SWIG_exception(SWIG_RuntimeError, emsg);\
    SWIG_fail;\
  } catch (...) {\
    SWIG_exception(SWIG_RuntimeError, "unknown exception");\
    SWIG_fail;\
  }

/* Translate C++ exceptions thrown by any wrapped call into Python
   RuntimeError. Catches std::string, std::exception, C strings, and a
   catch-all. */
%exception {
  try {
    $action
  } catch (const std::string &e) {
    std::string s = e;
    SWIG_exception(SWIG_RuntimeError, s.c_str());
    SWIG_fail;
  } catch (const std::exception &e) {
    SWIG_exception(SWIG_RuntimeError, e.what());
    SWIG_fail;
  } catch (const char *emsg) {
    SWIG_exception(SWIG_RuntimeError, emsg);
    SWIG_fail;
  } catch (...) {
    SWIG_exception(SWIG_RuntimeError, "unknown exception");
    SWIG_fail;
  }
}

/* NOTE(review): the template arguments in the typemaps below appear to have
   been lost in extraction (e.g. "std::vector" with no element type,
   "build_array>" with an orphaned '>'); restore them from version control
   before building. The typemaps are reproduced as found. */

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

/* For const-reference returns, SWIG binds $1 to a pointer: iterate via ->. */
%typemap(out) const std::vector & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range(range);
  }
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) const std::vector & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range(range);
  }
}

%typemap(out) const std::pair, ParaText::TagEncoding > & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range>(range);
  }
}

%typemap(out) const std::pair, ParaText::TagEncoding > & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range>(range);
  }
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) ParaText::CSV::ColBasedPopulator {
  $result = (PyObject*)::build_populator($1);
}

%typemap(out) ParaText::CSV::StringVectorPopulator {
  $result = (PyObject*)::build_populator($1);
}

/*
%typemap(in) const std::string & {
  std::string result(ParaText::get_as_string($input, 0));
  $1 = &result;
}

%typemap(in) std::string & {
  std::string result(ParaText::get_as_string($input, 0));
  $1 = &result;
}
*/

/* Input string typemaps: allocate a std::string on the heap (released by the
   matching freearg typemap below); unique_ptr guards against leaks if
   get_as_string throws. */
%typemap(in) const std::string & {
  PARATEXT_TYPEMAP_EXCEPTION_START
  std::unique_ptr result(new std::string(ParaText::get_as_string($input, 0)));
  $1 = result.release();
  PARATEXT_TYPEMAP_EXCEPTION_END
}

%typemap(in) std::string & {
  PARATEXT_TYPEMAP_EXCEPTION_START
  std::unique_ptr result(new std::string(ParaText::get_as_string($input, 0)));
  $1 = result.release();
  PARATEXT_TYPEMAP_EXCEPTION_END
}

%typemap(freearg) const std::string & {
  delete $1;
}

%typemap(freearg) std::string & {
  delete $1;
}


%typemap(out) const std::string & {
  AsPythonString helper;
  $result = helper(*$1);
}

%typemap(out) std::string & {
  AsPythonString helper;
  /* BUGFIX: for reference typemaps $1 is a pointer; dereference it like the
     const-reference typemap above does (was: helper($1)). */
  $result = helper(*$1);
}

%typemap(out) std::string {
  AsPythonString helper;
  $result = helper($1);
}

%typemap(out) ParaText::as_raw_bytes {
  AsPythonString helper;
  $result = helper($1.val);
}

%typemap(out) ParaText::as_utf8 {
  AsPythonString helper;
  $result = helper($1.val);
}



%{
  #include "python/numpy_helper.hpp"
  #include "python/python_input.hpp"
%}

--------------------------------------------------------------------------------
/python/paratext/serial.py:
--------------------------------------------------------------------------------
"""
Single-threaded utilities
"""

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, 16 | # software distributed under the License is distributed on an 17 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | # KIND, either express or implied. See the License for the 19 | # specific language governing permissions and limitations 20 | # under the License. 21 | # 22 | # Copyright (C) Wise.io, Inc. 2016. 23 | 24 | 25 | # 26 | # Coder: Damian Eads 27 | # 28 | 29 | import paratext_internal as pti 30 | 31 | import os 32 | import six 33 | from six.moves import range 34 | from six.moves.urllib_parse import urlparse 35 | 36 | import random 37 | import numpy as np 38 | import string 39 | 40 | import numpy as np 41 | import unittest 42 | import collections 43 | import pandas 44 | import paratext_internal 45 | import os 46 | import random 47 | import sys 48 | 49 | if sys.version_info>=(3,0): 50 | def _repr_bytes(o): 51 | return bytes(repr(o), 'utf-8') 52 | else: 53 | def _repr_bytes(o): 54 | return repr(o) 55 | 56 | def as_quoted_string(s, do_not_escape_newlines=False): 57 | return paratext_internal.as_quoted_string(s, do_not_escape_newlines) 58 | 59 | 60 | def _docstring_parameter(*sub): 61 | def dec(obj): 62 | obj.__doc__ = obj.__doc__.format(*sub) 63 | return obj 64 | return dec 65 | 66 | _save_frame_params = """ 67 | frame : DataFrame, mapping, dict 68 | This object must be DataFrame-like (ie implement .keys() and __getattr__). 69 | 70 | allow_quoted_newlines : bool 71 | Whether to allow newlines to be unescaped in a quoted string. If False, all newline 72 | are encountered are escaped. 73 | 74 | out_encoding : bool 75 | The encoding to use. Valid options include: 76 | - `utf-8`: UTF-8 data 77 | - `arbitrary`: arbitrary bytes (values 0x00-0xFF) 78 | - `printable_ascii`: values 0x20-0xFF. 
0x0A is included if `allow_quoted_newlines`=True 79 | - `ascii`: values 0x00-0x7F 80 | If any values are outside of this range, they are backslash-escaped. 81 | 82 | dos : bool 83 | Whether to add a carriage return before a newline (Windows and DOS compatability). 84 | """ 85 | 86 | 87 | @_docstring_parameter(_save_frame_params) 88 | def save_frame(filename, frame, allow_quoted_newlines=True, out_encoding='arbitrary', dos=False): 89 | """ 90 | Saves a dictframe/DataFrame of sequences of the same size to a CSV file. 91 | 92 | Parameters 93 | ---------- 94 | filename : str, unicode 95 | The name of the filename to write. 96 | 97 | {0} 98 | """ 99 | f = open(filename, 'wb') 100 | write_frame(f, frame, allow_quoted_newlines=allow_quoted_newlines, out_encoding=out_encoding, dos=dos) 101 | f.close() 102 | 103 | @_docstring_parameter(_save_frame_params) 104 | def write_frame(stream, frame, allow_quoted_newlines=True, out_encoding='arbitrary', dos=False): 105 | """ 106 | Saves a dictframe/DataFrame of sequences of the same size to a byte stream (binary mode). 107 | 108 | Parameters 109 | ---------- 110 | filename : str, unicode 111 | The name of the filename to write. 
112 | 113 | {0} 114 | """ 115 | 116 | # In case .keys() is non-deterministic 117 | keys = list(frame.keys()) 118 | cols = [] 119 | 120 | psafe=paratext_internal.SafeStringOutput() 121 | psafe.escape_nonascii(True) 122 | psafe.escape_nonprintables(True) 123 | safe=paratext_internal.SafeStringOutput() 124 | safe.escape_special(True) 125 | if out_encoding == 'utf-8': 126 | safe.escape_nonutf8(True) 127 | elif out_encoding == 'ascii': 128 | safe.escape_nonascii(True) 129 | elif out_encoding == 'printable_ascii': 130 | safe.escape_nonascii(True) 131 | safe.escape_nonprintables(True) 132 | if not allow_quoted_newlines: 133 | safe.escape_newlines(True) 134 | psafe.escape_newlines(True) 135 | safe.double_quote_output(True) 136 | psafe.double_quote_output(True) 137 | for col in range(len(keys)): 138 | if col > 0: 139 | stream.write(b",") 140 | stream.flush() 141 | key = keys[col] 142 | if out_encoding == 'utf-8': 143 | stream.flush() 144 | if isinstance(key, bytes): 145 | skey = psafe.to_raw_string(key) 146 | else: 147 | skey = safe.to_raw_string(key) 148 | stream.write(skey) 149 | else: 150 | stream.flush() 151 | if isinstance(key, bytes): 152 | skey = psafe.to_raw_string(key) 153 | else: 154 | skey = safe.to_raw_string(key) 155 | stream.write(skey) 156 | if isinstance(frame[key], pandas.Series): 157 | cols.append(frame[key].values) 158 | else: 159 | cols.append(np.asarray(frame[key])) 160 | if dos: 161 | stream.write(b"\r\n") 162 | else: 163 | stream.write(b"\n") 164 | if hasattr(frame, "shape"): 165 | num_rows = frame.shape[0] 166 | elif len(keys) == 0: 167 | num_rows = 0 168 | else: 169 | num_rows = len(frame[keys[0]]) 170 | for row in range(num_rows): 171 | for col in range(len(cols)): 172 | if col > 0: 173 | stream.write(b',') 174 | val = cols[col][row] 175 | if np.issubdtype(type(val), np.string_) or np.issubdtype(type(val), np.unicode_) or isinstance(val, six.string_types): 176 | if out_encoding == 'utf-8': 177 | #sval = safe.to_utf8_string(val) 178 | if 
isinstance(val, bytes): 179 | sval = psafe.to_raw_string(val) 180 | else: 181 | sval = safe.to_raw_string(val) 182 | stream.write(sval) 183 | else: 184 | sval = safe.to_raw_string(val) 185 | stream.write(sval) 186 | else: 187 | stream.write(bytes(_repr_bytes(val))) 188 | if dos: 189 | stream.write(b"\r\n") 190 | else: 191 | stream.write(b"\n") 192 | -------------------------------------------------------------------------------- /src/csv/rowbased_worker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
*/

#ifndef PARATEXT_ROW_BASED_WORKER_HPP
#define PARATEXT_ROW_BASED_WORKER_HPP

/* NOTE(review): the header name and the container template arguments below
   were lost in extraction ("#include" with no header, "std::vector" with no
   element type); restore from version control before building. */
#include

namespace ParaText {

namespace CSV {

/*
  Parses one chunk [chunk_start, chunk_end) of a CSV file into a row-based
  binary encoding: each cell is tagged 0 (long), 1 (double), or 2 (string,
  length-prefixed), and each row is appended to rows_ (snappy-compressed
  when compression_ is set).
 */
class RowBasedParseWorker {
public:
  /*
    chunk_start/chunk_end: byte range of the file this worker parses.
    file_size: total file size (stored, not read in this view).
    block_size: read-buffer size in bytes.
    compression: snappy-compress each encoded row when true.
   */
  RowBasedParseWorker(size_t chunk_start, size_t chunk_end, size_t file_size, size_t block_size, bool compression)
    : chunk_start_(chunk_start),
      chunk_end_(chunk_end),
      file_size_(file_size),
      block_size_(block_size),
      compression_(compression) {}

  virtual ~RowBasedParseWorker() {}

  /*
    Reads the chunk block-by-block and runs a per-byte state machine to
    tokenize cells, encoding each completed cell into `input` and each
    completed row into rows_.
   */
  void parse(const std::string &filename) {
    std::ifstream in;
    in.open(filename.c_str());
    const size_t block_size = block_size_;
    /* NOTE(review): variable-length array is a compiler extension, not
       standard C++; consider std::vector for the buffer. */
    char buf[block_size];
    in.seekg(chunk_start_, std::ios_base::beg);
    size_t current = chunk_start_;
    uint8_t state = 0;
    //std::array staters;
    // 0: assumed negative-integer
    // 1: assumed integer, only digits encountered
    // 2: assumed float, '.' encountered
    // 3: assumed float, digits encountered before and after '.'
    // 4: assumed float, 'e' encountered
    // 5: closed-string
    // 6: open-string, '"' encountered
    // 7: unquoted delimiter
    // 8: unquoted newline
    std::vector token;       // bytes of the cell currently being tokenized
    state = 0;
    std::vector input;       // encoded bytes of the row currently being built
    //msgpack::sbuffer ss;
    std::string output;      // scratch buffer for snappy compression
    column_index_ = 0;
    while (in && current < chunk_end_) {
      in.read(buf, std::min(chunk_end_ - current, block_size));
      size_t nread = in.gcount();
      if (nread == 0) {
        break;
      }
      size_t i = 0;
      if (state == 6) { /* open quote: consume bytes until the closing '"'. */
        for (; i < nread; i++) {
          if (buf[i] == '\"') {
            i++;
            state = 5;
            break;
          }
          else {
            token.push_back(buf[i]);
          }
        }
      }
      if (state < 4) {
        /* NOTE(review): if the quote loop above exhausted the block, i may
           equal nread here and buf[i] reads one past the data — confirm. */
        if (buf[i] == 'E' || buf[i] == 'e') {
          token.push_back(buf[i]);
          i++;
          state = 4;
        }
      }
      /* NOTE(review): this loop re-declares i = 0, shadowing the outer i and
         apparently re-scanning any quoted prefix consumed above — confirm
         whether "for (; i < nread; i++)" was intended. */
      for (size_t i = 0; i < nread; i++) {
        if (buf[i] >= 0x3A) { /* past '9': letters etc. */
          if (state >= 4) {
            state = 5; /* demote to string */
            token.push_back(buf[i]);
          }
          else if (buf[i] == 'E' || buf[i] == 'e') {
            state = 4; /* scientific-notation exponent */
            token.push_back(buf[i]);
          }
        }
        else if (buf[i] >= 0x30) { /* digits '0'-'9' */
          token.push_back(buf[i]);
        }
        else {
          if (buf[i] == ',' || buf[i] == '\n') {
            /* End of cell: encode token according to its inferred state. */
            //std::cout << "[" << (int)state << "," << std::string(token.begin(), token.end()) << "]" << std::endl;
            if (state < 2) { /* integer cell: tag 0 + raw long bytes */
              input.push_back(0);
              long val = fast_atoi(token.begin(), token.end());
              unsigned char *bb = (unsigned char *)(void*)&val;
              //input.insert(0);
              input.insert(input.end(), bb, bb + sizeof(long));
#if 0
              msgpack::pack(ss, val);
              input.insert(input.end(), ss.data(), ss.data() + ss.size());
              ss.clear();
#endif
#if 0
              if (val >= 0 && val < 128) {
                unsigned char v = (unsigned char)val;
                unsigned char *bb = (unsigned char *)(void*)&v;
                input.insert(input.end(), bb, bb + 1);
              }
              else {
                input.push_back(128);
                unsigned char *bb = (unsigned char *)(void*)&val;
                input.insert(input.end(), bb, bb + sizeof(long));
              }
#endif
            }
            else if (state < 5) { /* float cell: tag 1 + raw double bytes */
              input.push_back(1);
              double val = bsd_strtod(token.begin(), token.end());
              unsigned char *bb = (unsigned char *)(void*)&val;
              input.insert(input.end(), bb, bb + sizeof(double));
            }
            else { /* string cell: tag 2 + length + bytes */
              input.push_back(2);
              long len = token.size();
              unsigned char *bl = (unsigned char *)(void*)&len;
              input.insert(input.end(), bl, bl + sizeof(long));
              input.insert(input.end(), token.begin(), token.end());
            }
            /* Remember per-column start states from the first row so later
               rows begin each cell in the same state. */
            if (rows_.size() == 0) {
              starting_state_.push_back(state);
            }
            column_index_++;
            if (column_index_ < starting_state_.size()) {
              state = starting_state_[column_index_];
            }
            else {
              state = 0;
            }
            token.clear();
          }
          else if (buf[i] == '.') {
            if (state < 2) {
              state = 3; /* integer so far -> float */
            }
            else {
              state = 5; /* second '.' -> string */
            }
            token.push_back('.');
          }
          else if (buf[i] == '"') {
            if (state == 6) {
              state = 5; /* closing quote */
            }
            else {
              state = 6; /* opening quote */
            }
          }
          else {
            token.push_back(buf[i]);
          }
          if (buf[i] == '\n') {
            /* End of row: flush the encoded row (compressed if requested). */
            //std::cout << input.size() << std::endl;
            if (compression_) {
              snappy::Compress((const char *)input.data(), input.size(), &output);
              input.clear();
              rows_.emplace_back(output.begin(), output.end());
            }
            else {
              rows_.emplace_back(input.begin(), input.end());
              input.clear();
            }
          }
        }
      }
      current += nread;
    }
    /* Flush a final row that did not end in a newline. */
    if (input.size() > 0) {
      if (compression_) {
        snappy::Compress((const char*)input.data(), input.size(), &output);
        input.clear();
        rows_.emplace_back(output.begin(), output.end());
      }
      else {
        rows_.emplace_back(input.begin(), input.end());
        input.clear();
      }
    }
  }

private:
  size_t chunk_start_;            // first byte of this worker's chunk
  size_t chunk_end_;              // one past the last byte of the chunk
  size_t file_size_;              // total file size
  size_t column_index_;           // column currently being parsed
  const size_t block_size_;       // read-buffer size
  bool compression_;              // snappy-compress rows when true
  std::vector maximum_values_;
  std::vector > rows_;            // encoded (optionally compressed) rows
  std::vector starting_state_;    // per-column start state from row 0
};
}
}
#endif
--------------------------------------------------------------------------------
/src/csv/header_parser.hpp:
--------------------------------------------------------------------------------
/*

   ParaText: parallel text reading
   Copyright (C) 2016.
wise.io, Inc.

   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements. See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership. The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied. See the License for the
   specific language governing permissions and limitations
   under the License.

*/

/*
   Coder: Damian Eads.
*/

#ifndef PARATEXT_HEADER_PARSER_HPP
#define PARATEXT_HEADER_PARSER_HPP

/* NOTE(review): header names and container template arguments were lost in
   extraction ("#include" with no header, "std::vector" with no element
   type); restore from version control before building. */
#include
#include
#include

#include "util/strings.hpp"

namespace ParaText {

namespace CSV {

/*
  Parses the first line of a CSV file to determine the header.
 */
class HeaderParser {
public:
  /*
    Constructs an uninitialized header parser.
   */
  HeaderParser() : length_(0), end_of_header_(0), has_header_(false) {}

  /*
    Destroys this parser.
   */
  virtual ~HeaderParser() {}

  /*
    Opens a file and parses its header. Throws std::logic_error if the file
    cannot be stat'ed or opened. If no_header is true, synthetic names
    ("col0", "col1", ...) are generated instead.
   */
  void open(const std::string &filename, bool no_header) {
    struct stat fs;
    if (stat(filename.c_str(), &fs) == -1) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    length_ = fs.st_size;
    in_.open(filename);
    if (!in_) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    parse_header(no_header);
  }

  /*
    Returns the number of columns detected in the header.
   */
  size_t get_num_columns() const {
    return column_names_.size();
  }

  /*
    Adds a column of a specified name, after unquoting it and converting
    embedded NULs to spaces.
   */
  void add_column_name(const std::string &name) {
    //std::cerr << "col " << column_names_.size() << ": " << name << std::endl;

    std::string transformed_name;
    parse_unquoted_string(name.begin(), name.end(), std::back_inserter(transformed_name));
    convert_null_to_space(transformed_name.begin(), transformed_name.end());
    column_names_.push_back(transformed_name);
  }

  /*
    Returns a specific name of a column.
   */
  const std::string &get_column_name(size_t index) const {
    return column_names_[index];
  }

  /*
    Parses a header: scans the first non-whitespace line, splitting on
    commas with backslash-escape and quote handling. If the resulting names
    are not unique (or no_header is set), they are replaced with synthetic
    "colN" names and end_of_header_ is reset to 0.
   */
  void parse_header(bool no_header=false) {
    std::string token;
    size_t current = 0;
    size_t block_size = 4096;
    size_t escape_jump = 0;
    /* NOTE(review): VLA is a compiler extension, not standard C++. */
    char buf[block_size];
    char quote_started = 0;      // the active quote char ('"' or '\''), 0 if none
    bool eoh_encountered = false;
    bool soh_encountered = false;
    in_.seekg(0, std::ios_base::beg);
    while (current < length_ && !eoh_encountered) {
      if (current % block_size == 0) { /* The block is aligned. */
        in_.read(buf, std::min(length_ - current, block_size));
      }
      else { /* Our first read should ensure our further reads are block-aligned. */
        in_.read(buf, std::min(length_ - current, std::min(block_size, current % block_size)));
      }
      size_t nread = in_.gcount();
      size_t i = 0;
      /* ignore leading whitespace in the file. */
      while (i < nread && !soh_encountered) {
        if (isspace(buf[i])) {
          i++; /* eat the whitespace. */
        } else {
          soh_encountered = true;
          /* do not do i++. we need to process it like non-whitespace */
        }
      }
      while (i < nread && !eoh_encountered) {
        if (quote_started) {
          /* NOTE(review): in this branch the backslash that starts an escape
             is itself pushed into token, and push_back also runs after
             escape_jump-- — confirm that quoted names keep backslashes
             intentionally (parse_unquoted_string may strip them later). */
          for (; i < nread; i++) {
            if (escape_jump > 0) {
              escape_jump--;
            }
            else if (buf[i] == '\\') {
              escape_jump = 1;
            }
            else if (buf[i] == quote_started) {
              i++;
              quote_started = 0;
              break;
            }
            token.push_back(buf[i]);
          }
        }
        else {
          for (; i < nread; i++) {
            if (escape_jump > 0) {
              token.push_back(buf[i]);
              escape_jump--;
            }
            else if (buf[i] == '\\') {
              token.push_back(buf[i]);
              escape_jump = 1;
            }
            else if (buf[i] == '\"' || buf[i] == '\'') {
              quote_started = buf[i];
              i++;
              break;
            }
            else if (buf[i] == ',') {
              add_column_name(token);
              token.clear();
            }
            else if (buf[i] == '\r') { /* do nothing: dos wastes a byte each line. */ }
            else if (buf[i] == '\n') {
              add_column_name(token);
              token.clear();
              end_of_header_ = current + i;
              eoh_encountered = true;
              i++;
              break;
            }
            else {
              token.push_back(buf[i]);
            }
          }
        }
      }
      current += nread;
    }
    if (!soh_encountered) { /* If this is just a file of whitespace, then the end of header is the last pos in the file. */
      end_of_header_ = current;
    }
    /* Uniqueness check: duplicate names mean the first line is data, not a
       header. */
    std::unordered_set cnset;
    for (auto &cname : column_names_) {
      cnset.insert(cname);
    }
    has_header_ = true;
    if (cnset.size() != column_names_.size() || no_header) {
      has_header_ = false;
#ifdef PARALOAD_DEBUG
      std::cout << "column names not unique: " << cnset.size() << " unique column names found." ;
#endif
      size_t num_columns = column_names_.size();
      column_names_.clear();
      for (size_t i = 0; i < num_columns; i++) {
        std::ostringstream ostr;
        ostr << "col" << i;
        std::string sstr(ostr.str());
        column_names_.push_back(sstr);
      }
      end_of_header_ = 0;
    }
#ifdef PARALOAD_DEBUG
    std::cout << "Total columns in header: " << column_names_.size() << std::endl;
#endif
    return;
  }

  /*
    Returns the end of the header (byte offset; 0 when no header).
   */
  size_t get_end_of_header() const {
    return end_of_header_;
  }

  /* True iff the first line was judged to be a real header. */
  bool has_header() const {
    return has_header_;
  }

private:
  std::ifstream in_;              // the open file stream
  std::vector column_names_;      // parsed (or synthesized) column names
  size_t length_;                 // total file length in bytes
  size_t end_of_header_;          // byte offset of the header's newline
  bool has_header_;               // result of the uniqueness heuristic
};
}
}
#endif
--------------------------------------------------------------------------------
/tests/test_paratext.py:
--------------------------------------------------------------------------------
import os
import unittest
import paratext.testing
import paratext.serial
from paratext.testing import assert_dictframe_almost_equal, generate_tempfile, generate_tempfilename
import pandas.util.testing
import numpy as np
import logging

class TestBasicFiles:

    def do_basic_nums(self, dtype, num_rows, num_columns, num_threads, number_only, no_header):
        # Builds a numeric CSV of num_rows x num_columns, loads it with the
        # given thread count, and compares against the expected dictframe.
        if no_header:
            filedata = ''
            keys = ["col%d" % k for k in range(num_columns)]
        else:
            keys = ["A", "B", "C", "D", "E", "F",
"G", "H", "I", "J"] 18 | keys = keys[0:num_columns] 19 | filedata = ','.join(keys[0:num_columns]) + "\n" 20 | expected = {} 21 | for key in keys: 22 | expected[key] = [] 23 | for row in range(num_rows): 24 | if np.issubdtype(dtype, np.integer): 25 | row_data = [row*i for i in range(num_columns)] 26 | else: 27 | row_data = np.random.random((num_columns,)) 28 | filedata += ",".join([str(v) for v in row_data]) + "\n" 29 | for k in range(len(keys)): 30 | expected[keys[k]].append(row_data[k]) 31 | with generate_tempfile(filedata.encode("utf-8")) as fn: 32 | logging.debug("filename: %s" % fn) 33 | actual = paratext.load_csv_to_pandas(fn, num_threads=num_threads, number_only=number_only, no_header=no_header) 34 | assert_dictframe_almost_equal(actual, expected) 35 | 36 | def do_basic_empty(self, file_body, num_threads): 37 | with generate_tempfile(file_body) as fn: 38 | logging.debug("filename: %s" % fn) 39 | actual = paratext.load_csv_to_pandas(fn, num_threads=num_threads) 40 | expected = pandas.DataFrame() 41 | assert_dictframe_almost_equal(actual, expected) 42 | 43 | def test_basic_empty(self): 44 | file_bodies = [b"", b"\n", b"\n\n", b" ", b" \n", b" \n \n \n", b"\n \n", b"\v\t \n", b"\n\n\n", b"\n\n\n\n"] 45 | file_bodies += [b"\r\n", b"\r\n\r\n", b" ", b" \r\n", b" \r\n \r\n \r\n", b"\r\n \r\n", b"\r\v\t \r\n", b"\r\n\r\n\r\n", b"\r\n\r\n\r\n\r\n"] 46 | for file_body in file_bodies: 47 | for num_threads in [1]: 48 | yield self.do_basic_empty, file_body, num_threads 49 | 50 | def test_basic_ints(self): 51 | for no_header in [False, True]: 52 | for number_only in [False, True]: 53 | for dtype in [np.float_, np.int64]: 54 | for num_rows in [0, 1, 2, 3, 4, 5, 6, 10, 100, 1000]: 55 | for num_cols in [1, 2, 3, 4, 5, 6, 10]: 56 | if num_rows * num_cols < 20: 57 | thread_set = range(0,30) 58 | else: 59 | thread_set = [0, 1, 2, 3, 4, 5, 6, 7, 8, 15, 20] 60 | for num_threads in thread_set: 61 | yield self.do_basic_nums, dtype, num_rows, num_cols, num_threads, number_only, 
no_header 62 | 63 | def test_basic_strange1(self): 64 | filedata = b"""A,B,C 65 | "\\\"","",7 66 | "\\\\","X",8 67 | "\n","\\\\\\"",9""" 68 | with generate_tempfile(filedata) as fn: 69 | expected = {"A": ["\"","\\","\n"], "B": ["","X","\\\""], "C": [7,8,9]} 70 | logging.debug("filename: %s" % fn) 71 | actual = paratext.load_csv_to_pandas(fn, allow_quoted_newlines=True, out_encoding="utf-8") 72 | assert_dictframe_almost_equal(actual, expected) 73 | 74 | def test_basic_3x2x(self): 75 | filedata = b"""A,B,C 76 | 1,4,7 77 | 2,5,8 78 | """ 79 | with generate_tempfile(filedata) as fn: 80 | expected = {"A": [1,2], "B": [4,5], "C": [7,8]} 81 | logging.debug("filename: %s" % fn) 82 | actual = paratext.load_csv_to_pandas(fn) 83 | assert_dictframe_almost_equal(actual, expected) 84 | 85 | def test_basic_3x1x(self): 86 | filedata = b"""A,B,C 87 | 1,4,7 88 | """ 89 | with generate_tempfile(filedata) as fn: 90 | expected = {"A": [1], "B": [4], "C": [7]} 91 | logging.debug("filename: %s" % fn) 92 | actual = paratext.load_csv_to_pandas(fn) 93 | assert_dictframe_almost_equal(actual, expected) 94 | 95 | 96 | def test_basic_3x0x(self): 97 | filedata = b"""A,B,C 98 | """ 99 | with generate_tempfile(filedata) as fn: 100 | expected = {"A": [], "B": [], "C": []} 101 | logging.debug("filename: %s" % fn) 102 | actual = paratext.load_csv_to_pandas(fn) 103 | assert_dictframe_almost_equal(actual, expected) 104 | 105 | def test_basic_empty_cells_num(self): 106 | filedata = b"""A,B,C,D,E,F 107 | #,1,#,#,2,# 108 | 3,#,#,4,5,# 109 | 6,#,#,#,#,# 110 | #,7,#,#,#,# 111 | #,#,8,#,#,# 112 | #,#,#,9,#,# 113 | #,#,#,#,10,# 114 | #,#,#,#,#,11 115 | #,#,12,#,#,13 116 | 14,#,#,15,16,17 117 | """ 118 | filedata = filedata.replace(b"#", b"") 119 | with generate_tempfile(filedata) as fn: 120 | expected = {"A": [0,3,6,0,0,0,0,0,0,14], "B": [1,0,0,7,0,0,0,0,0,0], "C": [0,0,0,0,8,0,0,0,12,0], "D": [0,4,0,0,0,9,0,0,0,15], "E": [2,5,0,0,0,0,10,0,0,16], "F": [0,0,0,0,0,0,0,11,13,17]} 121 | logging.debug("filename: 
%s" % fn) 122 | actual = paratext.load_csv_to_pandas(fn, number_only=True) 123 | assert_dictframe_almost_equal(actual, expected) 124 | 125 | class TestMixedFiles: 126 | 127 | def run_case(self, num_rows, num_cats, num_floats, num_ints, num_threads): 128 | expected, types_df = paratext.testing.generate_mixed_frame(num_rows, num_floats, num_cats, num_ints) 129 | with generate_tempfilename() as fn: 130 | logging.debug("filename: %s" % fn) 131 | paratext.serial.save_frame(fn, expected, allow_quoted_newlines=True, out_encoding='utf-8') 132 | actual = paratext.load_csv_to_pandas(fn, allow_quoted_newlines=True, out_encoding='utf-8', num_threads=num_threads) 133 | assert_dictframe_almost_equal(actual, expected) 134 | 135 | def test_mixed_frame(self): 136 | for num_rows in [0, 1, 2, 3, 5, 10, 100, 1000]: 137 | for num_cats in [1, 3, 5]: 138 | for num_floats in [1, 3, 5]: 139 | for num_ints in [0, 1, 5, 10, 50]: 140 | for num_threads in [1, 2, 3, 5, 10, 20]: 141 | yield self.run_case, num_rows, num_cats, num_floats, num_ints, num_threads 142 | 143 | class TestHellFiles: 144 | 145 | def do_hell_frame(self, dos, frame_encoding, out_encoding, include_null, allow_quoted_newlines, rows, cols, num_threads): 146 | expected = paratext.testing.generate_hell_frame(rows, cols, include_null=include_null, fmt=frame_encoding) 147 | with generate_tempfilename() as fn: 148 | logging.debug("filename: %s" % fn) 149 | paratext.serial.save_frame(fn, expected, allow_quoted_newlines, out_encoding=out_encoding, dos=dos) 150 | actual = paratext.load_csv_to_pandas(fn, allow_quoted_newlines=allow_quoted_newlines, out_encoding=out_encoding, num_threads=num_threads, convert_null_to_space=not include_null) 151 | assert_dictframe_almost_equal(actual, expected) 152 | 153 | def test_hell_frame(self): 154 | formatting = [("utf-8", "utf-8"), 155 | ("printable_ascii", "utf-8"), 156 | ("utf-8", "unknown"), 157 | ("arbitrary", "unknown"), 158 | ("arbitrary", "utf-8"), 159 | ("mixed", "unknown"), 160 | ("mixed", 
"utf-8")] 161 | for dos in [False, True]: 162 | for (frame_encoding, out_encoding) in formatting: 163 | for include_null in [False, True]: 164 | for allow_quoted_newlines in [False, True]: 165 | for num_rows in [0, 1,2,3,4,10,100,600]: 166 | for num_cols in [1,2,3,4,5,10]: 167 | for num_threads in [1,2,4,8,16]: 168 | yield self.do_hell_frame, dos, frame_encoding, out_encoding, include_null, allow_quoted_newlines, num_rows, num_cols, num_threads 169 | -------------------------------------------------------------------------------- /src/util/safe_string_output.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef SAFE_STRING_OUTPUT_HPP 29 | #define SAFE_STRING_OUTPUT_HPP 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | namespace WiseIO { 36 | 37 | typedef enum {NO_ESCAPE, ESCAPE, CONTINUATION, LEAD2, LEAD3, LEAD4, POTENTIAL_SURROGATE} SafeCharState; 38 | 39 | class SafeStringOutput { 40 | public: 41 | SafeStringOutput() : double_quote_output_(false) { 42 | should_escape_.fill(NO_ESCAPE); 43 | should_escape_['\\'] = SafeCharState::ESCAPE; 44 | } 45 | 46 | ParaText::as_utf8 to_utf8_string(const std::string &input) { 47 | ParaText::as_utf8 output; 48 | output.val = output_string(input.begin(), input.end()); 49 | return output; 50 | } 51 | 52 | ParaText::as_raw_bytes to_raw_string(const std::string &input) { 53 | ParaText::as_raw_bytes output; 54 | output.val = output_string(input.begin(), input.end()); 55 | return output; 56 | } 57 | 58 | template 59 | std::string output_string(Iterator begin, Iterator end) { 60 | /* FIXME: Support filtering of illegal surrogates in UTF8 sequences. */ 61 | std::ostringstream ostr; 62 | size_t bytes_in_sequence = 0; 63 | //bool surrogate = false; 64 | if (double_quote_output_) { 65 | ostr << '"'; 66 | } 67 | std::vector escaped(std::distance(begin, end), false); 68 | size_t k = 0; 69 | for (Iterator it = begin; it != end; it++, k++) { 70 | unsigned char c = (unsigned char)*it; 71 | bool escape_it = should_escape_[c] == ESCAPE; 72 | if (bytes_in_sequence > 0) { /* If a UTF8 sequence was started, only escape the byte if its not a continuation. */ 73 | escape_it = should_escape_[c] != CONTINUATION; 74 | bytes_in_sequence--; 75 | } 76 | else if (!escape_it && bytes_in_sequence == 0) { /* If a UTF8 sequence is not progress, check higher order bits. 
*/ 77 | switch (should_escape_[c]) { 78 | case LEAD4: 79 | bytes_in_sequence = 3; 80 | break; 81 | case POTENTIAL_SURROGATE: 82 | /*bytes_in_sequence = 2; 83 | surrogate = true;*/ 84 | break; 85 | case LEAD3: 86 | bytes_in_sequence = 2; 87 | break; 88 | case LEAD2: 89 | bytes_in_sequence = 1; 90 | break; 91 | case NO_ESCAPE: 92 | break; 93 | case ESCAPE: /* Explicit escape. */ 94 | case CONTINUATION: /* An invalid continuation byte, escape it. */ 95 | escape_it = true; 96 | break; 97 | } 98 | } 99 | escaped[k] = escape_it; 100 | } 101 | k = 0; 102 | for (Iterator it = begin; it != end; it++, k++) { 103 | unsigned char c = (unsigned char)*it; 104 | if (escaped[k]) { 105 | ostr << '\\'; 106 | switch (c) { 107 | case '\b': 108 | ostr << 'b'; 109 | break; 110 | case '\v': 111 | ostr << 'v'; 112 | break; 113 | case '\n': 114 | ostr << 'n'; 115 | break; 116 | case '\r': 117 | ostr << 'r'; 118 | break; 119 | case '\t': 120 | ostr << 't'; 121 | break; 122 | case '\\': 123 | ostr << '\\'; 124 | break; 125 | case '\"': 126 | ostr << '\"'; 127 | break; 128 | case '\'': 129 | ostr << '\''; 130 | break; 131 | default: 132 | ostr << 'x'; 133 | ostr << to_hex(c >> 4); 134 | ostr << to_hex(c & 0x0F); 135 | break; 136 | } 137 | } 138 | else { 139 | ostr.put(*it); 140 | } 141 | } 142 | if (double_quote_output_) { 143 | ostr << '"'; 144 | } 145 | return ostr.str(); 146 | } 147 | 148 | void escape_newlines(bool b) { 149 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 150 | should_escape_['\n'] = st; 151 | } 152 | 153 | void escape_whitespace(bool b) { 154 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 155 | should_escape_['\n'] = st; 156 | should_escape_['\r'] = st; 157 | should_escape_['\v'] = st; 158 | should_escape_['\f'] = st; 159 | should_escape_['\b'] = st; 160 | } 161 | 162 | void escape_special(bool b) { 163 | SafeCharState st = b ? 
SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 164 | should_escape_['\''] = st; 165 | should_escape_['\"'] = st; 166 | should_escape_['\\'] = st; 167 | } 168 | 169 | void escape_delim(bool b) { 170 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 171 | should_escape_[','] = st; 172 | escape_special(true); 173 | } 174 | 175 | void escape_comments(bool b) { 176 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 177 | should_escape_['%'] = st; 178 | escape_special(true); 179 | } 180 | 181 | void escape_nonprintables(bool b) { 182 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 183 | for (unsigned char c = 0; c < ' '; c++) { 184 | should_escape_[c] = st; 185 | } 186 | } 187 | 188 | void escape_nonascii(bool b) { 189 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 190 | for (size_t c = 0x7F; c <= 0xFF; c++) { 191 | should_escape_[c] = st; 192 | } 193 | } 194 | 195 | void escape_nonutf8(bool b) { 196 | const SafeCharState outside = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 197 | const SafeCharState cont = b ? SafeCharState::CONTINUATION : SafeCharState::NO_ESCAPE; 198 | const SafeCharState lead2 = b ? SafeCharState::LEAD2 : SafeCharState::NO_ESCAPE; 199 | const SafeCharState lead3 = b ? SafeCharState::LEAD3 : SafeCharState::NO_ESCAPE; 200 | const SafeCharState lead4 = b ? SafeCharState::LEAD4 : SafeCharState::NO_ESCAPE; 201 | //const SafeCharState surrogate = b ? 
SafeCharState::POTENTIAL_SURROGATE : SafeCharState::NO_ESCAPE; 202 | for (size_t c = 0x80; c <= 0xBF; c++) { 203 | should_escape_[c] = cont; 204 | } 205 | for (size_t c = 0xC0; c <= 0xDF; c++) { 206 | should_escape_[c] = lead2; 207 | } 208 | for (size_t c = 0xE0; c <= 0xEF; c++) { 209 | should_escape_[c] = lead3; 210 | } 211 | for (size_t c = 0xF0; c <= 0xF7; c++) { 212 | should_escape_[c] = lead4; 213 | } 214 | for (size_t c = 0xF8; c <= 0xFF; c++) { 215 | should_escape_[c] = outside; 216 | } 217 | //should_escape_[0xED] = surrogate; 218 | } 219 | 220 | void double_quote_output(bool b) { 221 | if (b) { 222 | should_escape_['\"'] = SafeCharState::ESCAPE; 223 | } 224 | double_quote_output_ = b; 225 | } 226 | 227 | private: 228 | inline char to_hex(int v) { 229 | if (v >= 0 && v < 10) { 230 | return '0' + v; 231 | } 232 | else if (v >= 10 && v < 16) { 233 | return 'a' + (v-10); 234 | } 235 | else { 236 | throw std::logic_error("invalid range for hex character"); 237 | } 238 | } 239 | 240 | private: 241 | std::array should_escape_; 242 | bool double_quote_output_; 243 | }; 244 | } 245 | #endif 246 | -------------------------------------------------------------------------------- /src/diagnostic/parse_and_sum.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. 
You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef PARATEXT_DIAGNOSTIC_PARSE_AND_SUM_HPP 29 | #define PARATEXT_DIAGNOSTIC_PARSE_AND_SUM_HPP 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "generic/chunker.hpp" 41 | #include "csv/header_parser.hpp" 42 | 43 | namespace ParaText { 44 | 45 | namespace Diagnostic { 46 | 47 | template 48 | class ParseAndSumWorker { 49 | public: 50 | ParseAndSumWorker(size_t chunk_start, size_t chunk_end, size_t block_size, size_t num_columns) 51 | : chunk_start_(chunk_start), 52 | chunk_end_(chunk_end), 53 | block_size_(block_size), 54 | num_columns_(num_columns) {} 55 | 56 | virtual ~ParseAndSumWorker() {} 57 | 58 | void parse(const std::string &filename) { 59 | try { 60 | parse_impl(filename); 61 | } 62 | catch (...) 
{ 63 | thread_exception_ = std::current_exception(); 64 | } 65 | } 66 | 67 | std::exception_ptr get_exception() { 68 | return thread_exception_; 69 | } 70 | 71 | void parse_impl(const std::string &filename) { 72 | std::ifstream in; 73 | in.open(filename.c_str()); 74 | const size_t block_size = block_size_; 75 | char buf[block_size]; 76 | in.seekg(chunk_start_, std::ios_base::beg); 77 | size_t current = chunk_start_; 78 | sums_.resize(num_columns_); 79 | std::fill(sums_.begin(), sums_.end(), 0.0); 80 | column_index_ = 0; 81 | num_lines_ = 0; 82 | char token[64]; 83 | size_t j = 0; 84 | while (current <= chunk_end_) { 85 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 86 | size_t nread = in.gcount(); 87 | if (nread == 0) { 88 | break; 89 | } 90 | for (size_t i = 0; i < nread; i++) { 91 | if (buf[i] == '\n') { 92 | sums_[column_index_] += parse_token(token, token + j); 93 | column_index_ = 0; 94 | num_lines_++; 95 | j = 0; 96 | } 97 | else if (buf[i] == ',') { 98 | sums_[column_index_] += parse_token(token, token + j); 99 | column_index_++; 100 | j = 0; 101 | } 102 | else { 103 | token[j++] = buf[i]; 104 | } 105 | } 106 | current += nread; 107 | } 108 | if (j > 0) { 109 | sums_[column_index_] += parse_token(token, token + j); 110 | j = 0; 111 | } 112 | if (column_index_ > 0) { 113 | num_lines_++; 114 | } 115 | } 116 | 117 | const std::vector &get_sums() const { 118 | return sums_; 119 | } 120 | 121 | size_t get_N() const { 122 | return num_lines_; 123 | } 124 | 125 | // No type checking 126 | template 127 | inline typename std::enable_if::type parse_token(Iterator begin, Iterator end) const { 128 | return bsd_strtod(begin, end); 129 | } 130 | 131 | // Type checking only for numbers. 132 | template 133 | inline typename std::enable_if::type parse_token(Iterator begin, Iterator end) const { 134 | Iterator it = begin; 135 | for (; it != end && isspace(*it); it++) {} 136 | if (it != end) { 137 | if (*it == '?' 
&& std::distance(it, end) == 1) { 138 | return std::numeric_limits::quiet_NaN(); 139 | } 140 | else if (std::distance(it, end) == 3 && 141 | ((*it == 'n' || *it == 'N')) 142 | && ((*(it+1) == 'a' || *(it+1) == 'A')) 143 | && ((*(it+2) == 'n' || *(it+2) == 'N'))) { 144 | return std::numeric_limits::quiet_NaN(); 145 | } 146 | else { 147 | if (*it == '-') { it++; } 148 | for (; it != end && isdigit(*it); it++) {} 149 | if (it != end && (*it == '.' || *it == 'E' || *it == 'e')) { 150 | return bsd_strtod(begin, end); 151 | } 152 | else { 153 | return (double)fast_atoi(begin, end); 154 | } 155 | } 156 | } 157 | return (double)std::distance(begin, end); 158 | } 159 | 160 | private: 161 | size_t chunk_start_; 162 | size_t chunk_end_; 163 | size_t block_size_; 164 | size_t num_columns_; 165 | size_t num_lines_; 166 | size_t column_index_; 167 | std::vector sums_; 168 | std::exception_ptr thread_exception_; 169 | }; 170 | 171 | class ParseAndSum { 172 | public: 173 | ParseAndSum() {} 174 | 175 | virtual ~ParseAndSum() {} 176 | 177 | size_t load(const std::string &filename, const ParseParams ¶ms, bool type_check) { 178 | size_t retval = 0; 179 | if (type_check) { 180 | retval = load_impl(filename, params); 181 | } 182 | else { 183 | retval = load_impl(filename, params); 184 | } 185 | return retval; 186 | } 187 | 188 | template 189 | size_t load_impl(const std::string &filename, const ParseParams ¶ms) { 190 | std::vector threads; 191 | std::vector > > workers; 192 | header_parser_.open(filename, params.no_header); 193 | std::exception_ptr thread_exception; 194 | if (header_parser_.has_header()) { 195 | chunker_.process(filename, header_parser_.get_end_of_header()+1, params.num_threads, params.allow_quoted_newlines); 196 | } 197 | else { 198 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 199 | } 200 | for (size_t worker_id = 0; worker_id < chunker_.num_chunks(); worker_id++) { 201 | long start_of_chunk = 0, end_of_chunk = 0; 202 | 
std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 203 | if (start_of_chunk < 0 || end_of_chunk < 0) { 204 | continue; 205 | } 206 | workers.push_back(std::make_shared >(start_of_chunk, end_of_chunk, params.block_size, header_parser_.get_num_columns())); 207 | threads.emplace_back(&ParseAndSumWorker::parse, 208 | workers.back(), 209 | filename); 210 | } 211 | 212 | for (size_t i = 0; i < threads.size(); i++) { 213 | threads[i].join(); 214 | if (!thread_exception) { 215 | thread_exception = workers[i]->get_exception(); 216 | } 217 | } 218 | // We're now outside the parallel region. 219 | if (thread_exception) { 220 | std::rethrow_exception(thread_exception); 221 | } 222 | N_ = 0.0; 223 | avgs_.resize(header_parser_.get_num_columns()); 224 | std::fill(avgs_.begin(), avgs_.end(), 0.0); 225 | for (size_t i = 0; i < workers.size(); i++) { 226 | auto worker_sums = workers[i]->get_sums(); 227 | N_ += workers[i]->get_N(); 228 | for (size_t j = 0; j < worker_sums.size(); j++) { 229 | avgs_[j] += worker_sums[j]; 230 | } 231 | } 232 | for (size_t j = 0; j < avgs_.size(); j++) { 233 | avgs_[j] /= N_; 234 | } 235 | return N_; 236 | } 237 | 238 | size_t get_num_columns() const { 239 | return header_parser_.get_num_columns(); 240 | } 241 | 242 | double get_avg(size_t column_index) const { 243 | return avgs_[column_index]; 244 | } 245 | 246 | const std::string &get_column_name(size_t column_index) const { 247 | return header_parser_.get_column_name(column_index); 248 | } 249 | 250 | size_t get_N() const { 251 | return N_; 252 | } 253 | 254 | private: 255 | CSV::HeaderParser header_parser_; 256 | TextChunker chunker_; 257 | std::vector avgs_; 258 | size_t N_; 259 | }; 260 | } 261 | } 262 | #endif 263 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 2 | 3 | 1. Definitions. 
4 | 5 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 6 | 7 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 8 | 9 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 10 | 11 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 12 | 13 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 14 | 15 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 16 | 17 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 18 | 19 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
20 | 21 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 22 | 23 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 24 | 25 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 26 | 27 | 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 28 | 29 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 30 | 31 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 32 | You must cause any modified files to carry prominent notices stating that You changed the files; and 33 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 34 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed 
as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 35 | 36 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 37 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 38 | 39 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 40 | 41 | 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 42 | 43 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 44 | 45 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
46 | 47 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /bench/generate_experiments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | # Copyright (C) Wise.io, Inc. 2016. 
21 | 22 | import sha 23 | import json 24 | import os 25 | 26 | all_params = [] 27 | 28 | datasets = {"mnist": 29 | {"csv": "mnist.csv", 30 | "hdf5": "mnist.hdf5", 31 | "npy": "mnist.npy", 32 | "feather": "mnist.feather", 33 | "pickle": "mnist.pkl", 34 | "cPickle": "mnist.pkl", 35 | "no_header": True, 36 | "number_only": True, 37 | "to_df": True}, 38 | "mnist8m": 39 | {"csv": "mnist8m.csv", 40 | "hdf5": "mnist8m.hdf5", 41 | "npy": "mnist8m.npy", 42 | "feather": "mnist8m.feather", 43 | "pickle": "mnist8m.pkl", 44 | "cPickle": "mnist8m.pkl", 45 | "no_header": True, 46 | "number_only": True, 47 | "to_df": False}, 48 | "messy": 49 | {"csv": "messy.csv", 50 | "feather": "messy.feather", 51 | "pickle": "messy.pkl", 52 | "qnl": True, 53 | "no_header": False, 54 | "run_pyspark": False, 55 | "max_level_name_length": 0, 56 | "contains_text": True, 57 | "to_df": True}, 58 | "messy2": 59 | {"csv": "messy2.csv", 60 | "feather": "messy2.feather", 61 | "pickle": "messy2.pkl", 62 | "qnl": True, 63 | "no_header": False, 64 | "run_pyspark": False, 65 | "max_level_name_length": 0, 66 | "contains_text": True, 67 | "to_df": True}, 68 | "car": 69 | {"csv": "car.csv", 70 | "feather": "car.feather", 71 | "pickle": "car.pkl", 72 | "qnl": False, 73 | "no_header": False, 74 | "contains_text": True, 75 | "to_df": True}, 76 | "floats": 77 | {"csv": "floats.csv", 78 | "feather": "floats.feather", 79 | "hdf5": "floats.hdf5", 80 | "npy": "floats.npy", 81 | "no_header": False, 82 | "pickle": "floats.pkl", 83 | "to_df": True}, 84 | "floats2": 85 | {"csv": "floats2.csv", 86 | "feather": "floats2.feather", 87 | "hdf5": "floats2.hdf5", 88 | "npy": "floats2.npy", 89 | "no_header": False, 90 | "pickle": "floats2.pkl", 91 | "to_df": True}, 92 | "floats3": 93 | {"csv": "floats3.csv", 94 | "feather": "floats3.feather", 95 | "hdf5": "floats3.hdf5", 96 | "npy": "floats3.npy", 97 | "no_header": False, 98 | "pickle": "floats3.pkl", 99 | "to_df": True}, 100 | "floats4": 101 | {"csv": "floats4.csv", 102 | 
"feather": "floats4.feather", 103 | "hdf5": "floats4.hdf5", 104 | "npy": "floats4.npy", 105 | "no_header": False, 106 | "pickle": "floats4.pkl", 107 | "to_df": True} 108 | } 109 | 110 | scaling_experiments = bool(raw_input("enter 'yes' to do scaling experiments, 'no' to do main benchmarks: ").lower() == 'yes') 111 | 112 | print "available datasets: ", datasets.keys() 113 | restrict_keys = raw_input("enter comma-delimited list of datasets to generate experiment json [enter for all]: ") 114 | 115 | if restrict_keys != "": 116 | restrict_keys = set(restrict_keys.split(",")) 117 | for key in datasets.keys(): 118 | if key not in restrict_keys: 119 | datasets.pop(key) 120 | 121 | for name, attr in datasets.iteritems(): 122 | if "csv" in attr: 123 | csv_filename = attr["csv"] 124 | for disk_state in ["cold", "warm"]: 125 | if scaling_experiments: 126 | num_threads_list = [1,4,8,12,16,20,24,28,32] 127 | else: 128 | num_threads_list = [0] 129 | for num_threads in num_threads_list: 130 | for block_size in [32768]: 131 | if not attr.get("contains_text", False): 132 | for type_check in [True, False]: 133 | params = {"cmd": "avgcols", 134 | "filename": attr["csv"], 135 | "no_header": attr.get("no_header", True), 136 | "allow_quoted_newlines": attr.get("qnl", False), 137 | "num_threads": num_threads, 138 | "disk_state": disk_state, 139 | "block_size": block_size, 140 | "to_df": True, 141 | "sum_after": True, 142 | "type_check": type_check, 143 | "log": str(len(all_params)) + ".log"} 144 | all_params.append(params) 145 | for cmd in ["disk-to-mem", "countnl", "paratext"]: 146 | params = {"cmd": cmd, 147 | "filename": attr["csv"], 148 | "no_header": attr.get("no_header", True), 149 | "allow_quoted_newlines": attr.get("qnl", False), 150 | "num_threads": num_threads, 151 | "disk_state": disk_state, 152 | "block_size": block_size, 153 | "to_df": True, 154 | "sum_after": True, 155 | "log": str(len(all_params)) + ".log"} 156 | if attr.get("number_only", False): 157 | 
params["number_only"] = True 158 | mlnl = attr.get("max_level_name_length", None) 159 | if mlnl: 160 | params["max_level_name_length"] = mlnl 161 | all_params.append(params) 162 | for disk_state in ["cold", "warm"]: 163 | if attr.get("run_pyspark", True): 164 | params = {"cmd": "pyspark", 165 | "filename": attr["csv"], 166 | "no_header": attr.get("no_header", True), 167 | "to_df": attr.get("to_df", False), 168 | "sum_after": True, 169 | "disk_state": disk_state} 170 | all_params.append(params) 171 | 172 | if params.get("number_only", True): 173 | params = {"cmd": "numpy", 174 | "filename": attr["csv"], 175 | "no_header": attr.get("no_header", True), 176 | "sum_after": True, 177 | "disk_state": disk_state} 178 | all_params.append(params) 179 | 180 | for cmd in ["sframe", "pandas", "R-readcsv", "R-readr", "R-fread"]: 181 | params = {"cmd": cmd, 182 | "filename": attr["csv"], 183 | "no_header": attr.get("no_header", True), 184 | "to_df": attr.get("to_df", False), 185 | "sum_after": True, 186 | "disk_state": disk_state} 187 | all_params.append(params) 188 | 189 | for cmd in ["feather", "hdf5", "pickle", "cPickle", "npy"]: 190 | if cmd in attr: 191 | params = {"cmd": cmd, 192 | "filename": attr[cmd], 193 | "sum_after": True, 194 | "disk_state": disk_state} 195 | if cmd == "hdf5": 196 | params["dataset"] = "mydataset" 197 | all_params.append(params) 198 | 199 | if "mnist8m" in datasets.keys(): 200 | for cmd in ["sframe", "paratext", "pyspark"]: 201 | params = {"cmd": cmd, 202 | "filename": "mnist8m.csv", 203 | "no_header": True, 204 | "to_df": True, 205 | "sum_after": True, 206 | "disk_state": disk_state} 207 | all_params.append(params) 208 | 209 | params = {"cmd": "noop"} 210 | all_params.append(params) 211 | 212 | for i, params in enumerate(all_params): 213 | hparams = sha.sha(json.dumps(params)).hexdigest() 214 | prefix = hparams[0:8] 215 | params["log"] = os.path.join(params["cmd"], "run-" + prefix + ".log") 216 | if not os.path.exists(params["cmd"]): 217 | 
os.makedirs(params["cmd"]) 218 | json.dump(params, open(os.path.join(params["cmd"], "run-" + hparams[0:8] + ".json"), "w"), indent=1) 219 | -------------------------------------------------------------------------------- /src/generic/quote_adjustment_worker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | ParaText: parallel text reading 3 | Copyright (C) 2016. wise.io, Inc. 4 | 5 | Licensed to the Apache Software Foundation (ASF) under one 6 | or more contributor license agreements. See the NOTICE file 7 | distributed with this work for additional information 8 | regarding copyright ownership. The ASF licenses this file 9 | to you under the Apache License, Version 2.0 (the 10 | "License"); you may not use this file except in compliance 11 | with the License. You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, 16 | software distributed under the License is distributed on an 17 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | KIND, either express or implied. See the License for the 19 | specific language governing permissions and limitations 20 | under the License. 21 | */ 22 | 23 | /* 24 | Coder: Damian Eads. 25 | */ 26 | 27 | #ifndef PARATEXT_QUOTE_NEWLINE_WORKER_HPP 28 | #define PARATEXT_QUOTE_NEWLINE_WORKER_HPP 29 | 30 | #include 31 | 32 | namespace ParaText { 33 | 34 | class QuoteNewlineAdjustmentWorker { 35 | public: 36 | QuoteNewlineAdjustmentWorker(size_t chunk_start, size_t chunk_end) 37 | : chunk_start_(chunk_start), 38 | chunk_end_(chunk_end), 39 | num_quotes_(0), 40 | first_unquoted_newline_(-1), 41 | first_quoted_newline_(-1) {} 42 | 43 | virtual ~QuoteNewlineAdjustmentWorker() {} 44 | 45 | void parse(const std::string &filename) { 46 | try { 47 | parse_impl(filename); 48 | } 49 | catch (...) 
{ 50 | thread_exception_ = std::current_exception(); 51 | } 52 | } 53 | 54 | std::exception_ptr get_exception() { 55 | return thread_exception_; 56 | } 57 | 58 | void parse_impl(const std::string &filename) { 59 | std::ifstream in; 60 | in.open(filename.c_str()); 61 | const size_t block_size = 32768; 62 | char buf[block_size]; 63 | in.seekg(chunk_start_, std::ios_base::beg); 64 | size_t current = chunk_start_; 65 | size_t escape_count = 0; 66 | bool in_quote = false; 67 | while (current <= chunk_end_) { 68 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 69 | size_t nread = in.gcount(); 70 | if (nread == 0) { 71 | break; 72 | } 73 | size_t i = 0; 74 | while (i < nread && first_unquoted_newline_ < 0 && first_quoted_newline_ < 0) { 75 | if (in_quote) { 76 | for (; i < nread; i++) { 77 | if (escape_count > 0) { 78 | escape_count--; 79 | } 80 | else if (buf[i] == '\\') { 81 | escape_count = 1; 82 | } 83 | else if (buf[i] == '\"') { 84 | num_quotes_++; 85 | #ifdef PARATEXT_DEBUG_QUOTE 86 | std::cerr << "[Q1:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 87 | #endif 88 | in_quote = false; 89 | i++; 90 | break; 91 | } 92 | else if (buf[i] == '\n') { 93 | first_quoted_newline_ = current + i; 94 | i++; 95 | break; 96 | } 97 | } 98 | } 99 | else { 100 | for (; i < nread; i++) { 101 | if (escape_count > 0) { 102 | escape_count--; 103 | } 104 | else if (buf[i] == '\\') { 105 | escape_count = 1; 106 | } 107 | else if (buf[i] == '\"') { 108 | num_quotes_++; 109 | #ifdef PARATEXT_DEBUG_QUOTE 110 | std::cerr << "[Q2:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 111 | #endif 112 | in_quote = true; 113 | i++; 114 | break; 115 | } 116 | else if (buf[i] == '\n') { 117 | first_unquoted_newline_ = current + i; 118 | i++; 119 | break; 120 | } 121 | } 122 | } 123 | } 124 | while (i < nread && first_unquoted_newline_ < 0) { 125 | if (in_quote) { 126 | for (; i < nread; i++) { 127 | if (escape_count > 0) { 128 | escape_count--; 129 | } 130 
| else if (buf[i] == '\\') { 131 | escape_count = 1; 132 | } 133 | else if (buf[i] == '\"') { 134 | num_quotes_++; 135 | #ifdef PARATEXT_DEBUG_QUOTE 136 | std::cerr << "[Q3:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 137 | #endif 138 | in_quote = false; 139 | i++; 140 | break; 141 | } 142 | } 143 | } 144 | else { 145 | for (; i < nread; i++) { 146 | if (escape_count > 0) { 147 | escape_count--; 148 | } 149 | else if (buf[i] == '\\') { 150 | escape_count = 1; 151 | } 152 | else if (buf[i] == '\"') { 153 | num_quotes_++; 154 | #ifdef PARATEXT_DEBUG_QUOTE 155 | std::cerr << "[Q4:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 156 | #endif 157 | in_quote = true; 158 | i++; 159 | break; 160 | } 161 | else if (buf[i] == '\n') { 162 | first_unquoted_newline_ = current + i; 163 | i++; 164 | break; 165 | } 166 | } 167 | } 168 | } 169 | while (i < nread && first_quoted_newline_ < 0) { 170 | if (in_quote) { 171 | for (; i < nread; i++) { 172 | if (escape_count > 0) { 173 | escape_count--; 174 | } 175 | else if (buf[i] == '\\') { 176 | escape_count = 1; 177 | } 178 | else if (buf[i] == '\"') { 179 | num_quotes_++; 180 | #ifdef PARATEXT_DEBUG_QUOTE 181 | std::cerr << "[Q5:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 182 | #endif 183 | in_quote = false; 184 | i++; 185 | break; 186 | } 187 | else if (buf[i] == '\n') { 188 | first_quoted_newline_ = current + i; 189 | i++; 190 | break; 191 | } 192 | } 193 | } 194 | else { 195 | for (; i < nread; i++) { 196 | if (escape_count > 0) { 197 | escape_count--; 198 | } 199 | else if (buf[i] == '\\') { 200 | escape_count = 1; 201 | } 202 | else if (buf[i] == '\"') { 203 | num_quotes_++; 204 | #ifdef PARATEXT_DEBUG_QUOTE 205 | std::cerr << "[Q6:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 206 | #endif 207 | in_quote = true; 208 | i++; 209 | break; 210 | } 211 | } 212 | } 213 | } 214 | /* 215 | If we got here, then either we've found both the first quoted newline 
and 216 | unquoted newline, or we've processed all the data in the buffer. 217 | */ 218 | while (i < nread) { 219 | if (in_quote) { 220 | for (; i < nread; i++) { 221 | if (escape_count > 0) { 222 | escape_count--; 223 | } 224 | else if (buf[i] == '\\') { 225 | escape_count = 1; 226 | } 227 | else if (buf[i] == '\"') { 228 | num_quotes_++; 229 | #ifdef PARATEXT_DEBUG_QUOTE 230 | std::cerr << "[Q7:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 231 | #endif 232 | in_quote = false; 233 | i++; 234 | break; 235 | } 236 | } 237 | } 238 | else { 239 | for (; i < nread; i++) { 240 | if (escape_count > 0) { 241 | escape_count--; 242 | } 243 | else if (buf[i] == '\\') { 244 | escape_count = 1; 245 | } 246 | else if (buf[i] == '\"') { 247 | num_quotes_++; 248 | #ifdef PARATEXT_DEBUG_QUOTE 249 | std::cerr << "[Q8:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 250 | #endif 251 | in_quote = true; 252 | i++; 253 | break; 254 | } 255 | } 256 | } 257 | } 258 | current += nread; 259 | } 260 | } 261 | 262 | size_t get_start() const { 263 | return chunk_start_; 264 | } 265 | 266 | size_t get_end() const { 267 | return chunk_end_; 268 | } 269 | 270 | size_t get_num_quotes() const { 271 | return num_quotes_; 272 | } 273 | 274 | long get_first_quoted_newline() const { 275 | return first_quoted_newline_; 276 | } 277 | 278 | long get_first_unquoted_newline() const { 279 | return first_unquoted_newline_; 280 | } 281 | 282 | void clear() { 283 | chunk_start_ = 0; 284 | chunk_end_ = 0; 285 | num_quotes_ = 0; 286 | first_unquoted_newline_ = 0; 287 | first_quoted_newline_ = 0; 288 | } 289 | 290 | void combine_adjacent(const QuoteNewlineAdjustmentWorker &other) { 291 | chunk_end_ = other.chunk_end_; 292 | num_quotes_ += other.num_quotes_; 293 | if (first_unquoted_newline_ < 0) { 294 | first_unquoted_newline_ = other.first_unquoted_newline_; 295 | } 296 | if (first_quoted_newline_ < 0) { 297 | first_quoted_newline_ = other.first_quoted_newline_; 298 | } 299 | } 
300 | 301 | private: 302 | size_t chunk_start_; 303 | size_t chunk_end_; 304 | size_t num_quotes_; 305 | long first_unquoted_newline_; 306 | long first_quoted_newline_; 307 | std::exception_ptr thread_exception_; 308 | }; 309 | } 310 | #endif 311 | -------------------------------------------------------------------------------- /python/paratext/testing.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | # Copyright (C) Wise.io, Inc. 2016. 19 | 20 | import numpy as np 21 | import pandas.util.testing 22 | import unittest 23 | import collections 24 | import pandas 25 | import paratext_internal 26 | import os 27 | import random 28 | import sys 29 | 30 | from tempfile import NamedTemporaryFile 31 | from contextlib import contextmanager 32 | from six.moves import range 33 | import six 34 | 35 | def generate_hell_frame(num_rows, num_columns, include_null=False, fmt='arbitrary'): 36 | """ 37 | Generate a DataFrame of columns containing randomly generated data. 
def generate_hell_frame(num_rows, num_columns, include_null=False, fmt='arbitrary'):
    """
    Generate a DataFrame of string columns filled with randomly generated,
    deliberately nasty data for stress-testing the parser.

    Parameters
    ----------
    num_rows : number of rows to generate.
    num_columns : number of columns to generate.
    include_null : whether cells may contain NUL (0x00) bytes.
    fmt : 'arbitrary' (raw bytes), 'ascii', 'printable_ascii', 'utf-8', or
          'mixed' (a random choice of the former per column).
    """
    if include_null:
        min_byte = 0
    else:
        min_byte = 1
    frame = collections.OrderedDict()
    # NOTE(review): the seed passed to the C++ generators is a constant;
    # variability comes from the random per-cell lengths. Confirm this is
    # intentional.
    seed = 0
    keys = []
    colfmts = {}
    for column in range(num_columns):
        key = "col%d" % (column,)
        keys.append(key)
        if fmt == 'mixed':
            colfmts[key] = random.choice(["ascii", "arbitrary", "printable_ascii", "utf-8"])
        else:
            colfmts[key] = fmt
    for key in keys:
        data = []
        colfmt = colfmts[key]
        for row in range(num_rows):
            length = np.random.randint(50, 1000)
            if colfmt == 'arbitrary':
                cell = paratext_internal.get_random_string(length, seed, min_byte, 255)
            elif colfmt == 'ascii':
                cell = paratext_internal.get_random_string(length, seed, min_byte, 127)
            elif colfmt == 'printable_ascii':
                cell = paratext_internal.get_random_string(length, seed, 32, 126)
            elif colfmt == 'utf-8':
                # (The redundant `or fmt == 'utf-8'` disjunct was dropped:
                # colfmt is always 'utf-8' whenever fmt is.)
                cell = paratext_internal.get_random_string_utf8(length, seed, include_null)
            else:
                raise ValueError("unknown format: " + fmt)
            data.append(cell)
        frame[key] = data
    return pandas.DataFrame(frame)

@contextmanager
def generate_tempfile(filedata):
    """
    A context manager that generates a temporary file object that will be deleted
    when the context goes out of scope. The mode of the file is "wb".

    Parameters
    ----------
    filedata : The data of the file to write as a bytes object.
    """
    f = NamedTemporaryFile(delete=False, mode="wb", prefix="paratext-tests")
    f.write(filedata)
    name = f.name
    f.close()
    yield f.name
    os.remove(name)

@contextmanager
def generate_tempfilename():
    """
    A context manager that generates a temporary filename that will be deleted
    when the context goes out of scope.
    """
    f = NamedTemporaryFile(delete=False, prefix="paratext-tests")
    name = f.name
    f.close()
    yield f.name
    os.remove(name)

def assert_seq_almost_equal(left, right):
    """
    Assert that two sequences are element-wise equal (integers/strings)
    or almost equal (floats).

    Raises
    ------
    AssertionError : if the sequences differ in size, dtype family, or content.
    """
    left = np.asarray(left)
    right = np.asarray(right)
    # dtype kinds: 'U' unicode, 'S' bytes, 'O' object (boxed Python strings).
    # (Replaces np.str_/np.unicode_ subtype checks; np.unicode_ was removed
    # in NumPy 2.)
    left_is_string = left.dtype.kind in ('U', 'S', 'O')
    right_is_string = right.dtype.kind in ('U', 'S', 'O')
    if np.issubdtype(left.dtype, np.integer) and np.issubdtype(right.dtype, np.integer):
        if not (left.shape == right.shape):
            raise AssertionError("integer sequences have different sizes: %s vs %s" % (str(left.shape), str(right.shape)))
        if not (left == right).all():
            m = (left != right).mean() * 100.
            raise AssertionError("integer sequences mismatch: %5.5f%% left=%s right=%s" % ((m, str(left[0:20]), str(right[0:20]))))
    elif np.issubdtype(left.dtype, np.floating) and np.issubdtype(right.dtype, np.floating):
        np.testing.assert_almost_equal(left, right)
    elif left_is_string and not right_is_string:
        # Empty sequences are dtype-agnostic, so only complain when both
        # sides actually contain data.
        if len(left) > 0 and len(right) > 0:
            raise AssertionError("sequences differ by dtype: left is string and right is %s" % (str(right.dtype)))
    elif not left_is_string and right_is_string:
        if len(left) > 0 and len(right) > 0:
            raise AssertionError("sequences differ by dtype: left is %s and right is string" % (str(left.dtype)))
    elif left_is_string and right_is_string:
        # Compare through the C++ helper so raw-byte and unicode
        # representations of the same data compare equal.
        q = np.zeros((len(left)))
        for i in range(len(q)):
            q[i] = not paratext_internal.are_strings_equal(left[i], right[i])
        m = q.mean() * 100.
        if q.any():
            raise AssertionError("object sequences mismatch: %5.5f%%, rows: %s" % (m, str(np.where(q)[0].tolist())))
    else:
        # Mixed numeric dtypes (e.g. int vs float): compare as float64.
        # (np.float_ was removed in NumPy 2; np.float64 is the same type.)
        if np.issubdtype(left.dtype, np.floating):
            left_float = left
        else:
            left_float = np.asarray(left, dtype=np.float64)
        if np.issubdtype(right.dtype, np.floating):
            right_float = right
        else:
            right_float = np.asarray(right, dtype=np.float64)
        np.testing.assert_almost_equal(left_float, right_float)

def assert_dictframe_almost_equal(left, right, err_msg=""):
    """
    Compares two dictframes for equivalence. A dict-frame is simply
    an object that obeys the Python mapping protocol. Each (key, value)
    represents a column keyed/indexed by `key` where `value` is
    a NumPy array, a Python sequence, or Python iterable.

    Raises AssertionError (prefixed with `err_msg`) listing every missing
    or mismatching column. (Previously a non-empty `err_msg` alone would
    trigger a spurious failure even when all columns matched.)
    """
    left_keys = set(left.keys())
    right_keys = set(right.keys())
    left_missing = right_keys - left_keys
    right_missing = left_keys - right_keys
    together = left_keys.intersection(right_keys)
    problems = []
    for key in left_missing:
        problems.append("%s: missing on left\n" % key)
    for key in right_missing:
        problems.append("%s: missing on right\n" % key)
    for key in together:
        try:
            assert_seq_almost_equal(left[key], right[key])
        except AssertionError as e:
            problems.append("\n Column %s: %s" % (key, e.args[0]))
    if problems:
        raise AssertionError(err_msg + "".join(problems))

def generate_mixed_frame(num_rows, num_floats, num_cats, num_ints):
    """
    Generate a dict-frame with a random mix of categorical columns (text
    riddled with embedded newlines and commas), float columns, and integer
    columns of assorted widths.

    Returns
    -------
    (d, dtypes) : the dict-frame and a dict mapping column name to dtype.
    """
    # Close the dictionary file deterministically (was previously leaked).
    with open("/usr/share/dict/words") as fid:
        words = [line.strip() for line in fid.readlines()]
    num_cols = num_floats + num_cats + num_ints
    perm = np.random.permutation(num_cols)
    num_catints = num_cats + num_ints
    float_ids = perm[num_catints:]
    int_ids = perm[num_cats:num_catints]
    cat_ids = perm[0:num_cats]
    cat_ids = ["col" + str(id) for id in cat_ids]
    int_ids = ["col" + str(id) for id in int_ids]
    float_ids = ["col" + str(id) for id in float_ids]
    d = collections.OrderedDict()
    dtypes = {}
    for col in cat_ids:
        # `object` replaces the removed NumPy alias np.object (same dtype).
        X = np.zeros((num_rows,), dtype=object)
        for row in range(0, num_rows):
            num_newlines = np.random.randint(3, 7)
            num_commas = np.random.randint(3, 7)
            X[row] = ""
            tricky_delims = np.asarray(["\n"] * num_newlines + [","] * num_commas)
            np.random.shuffle(tricky_delims)
            for delim in tricky_delims:
                X[row] += ' '.join(random.sample(words, 5))
                X[row] += delim
            X[row] += ' '.join(random.sample(words, 5))
        d[col] = X
        dtypes[col] = object
    for col in float_ids:
        d[col] = np.asarray(np.random.randn(num_rows), dtype=np.float32)
        dtypes[col] = np.float32
    min_int = [0, -2**7, 0, -2**15, 0, -2**31, 0, -2**62]
    max_int = [2**8, 2**7, 2**16, 2**15, 2**32, 2**31, 2**62, 2**62]
    dtypes_int = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64]
    for col in int_ids:
        j = np.random.randint(0, len(min_int))
        d[col] = np.asarray(np.random.randint(min_int[j], max_int[j], num_rows), dtype=dtypes_int[j])
        dtypes[col] = dtypes_int[j]
    return d, dtypes


def internal_compare(filename, *args, **kwargs):
    """
    Loads a Pandas DataFrame with pandas and paratext, and compares their contents.

    Returns a dict mapping each column name to a disagreement score: the
    mismatch rate for string columns, or the maximum absolute difference
    for numeric columns.
    """
    import pandas
    # NOTE(review): load_csv_to_pandas is not defined or imported in this
    # module as seen here; presumably paratext.load_csv_to_pandas — verify.
    dfY = load_csv_to_pandas(filename, *args, **kwargs)
    if kwargs.get("no_header"):
        dfX = pandas.read_csv(filename, header=None, na_values=['?'], names=dfY.keys())
    else:
        dfX = pandas.read_csv(filename, na_values=['?'])
    results = {}
    for key in dfX.columns:
        # Kind check covers object/bytes/unicode string columns. The old
        # check referenced the Python 2-only builtin `unicode`, which
        # raises NameError on Python 3.
        if dfX[key].dtype.kind in ('O', 'S', 'U'):
            nonnan_mask = (dfY[key] != 'nan') & (dfY[key] != '?')
            results[key] = (dfX[key][nonnan_mask] != dfY[key][nonnan_mask]).mean()
        else:
            nonnan_mask = ~np.isnan(dfX[key])
            results[key] = abs(dfX[key][nonnan_mask] - dfY[key][nonnan_mask]).max()
    return results
 */

#ifndef PARATEXT_COLBASED_CHUNK_HPP
#define PARATEXT_COLBASED_CHUNK_HPP

#include "generic/parse_params.hpp"
#include "util/widening_vector.hpp"
#include "util/strings.hpp"

/* NOTE(review): the two #include targets below, and every template
   argument list in this file (e.g. "template <class T>", the
   std::numeric_limits<size_t> type argument, and the element types of
   widening_vector_dynamic / std::unordered_map / std::vector members),
   were lost when this file was extracted. Restore them from the
   original source before compiling; they are left untouched here. */
#include
#include

namespace ParaText {

namespace CSV {

/*
  Represents a chunk of parsed column data for a col-based CSV parser.
  A chunk starts out numeric and is promoted to categorical (interned
  string ids) or free text as the data demands, or as forced by
  `forced_semantics_`.
*/
class ColBasedChunk {
public:
  /*
    Creates a new chunk with an empty name.
  */
  ColBasedChunk() : max_level_name_length_(std::numeric_limits::max()), max_levels_(std::numeric_limits::max()), forced_semantics_(Semantics::UNKNOWN) {}

  /*
    Creates a new chunk.

    \param column_name The name of the column for the chunk.
  */
  ColBasedChunk(const std::string &column_name)
    : column_name_(column_name), max_level_name_length_(std::numeric_limits::max()), max_levels_(std::numeric_limits::max()), forced_semantics_(Semantics::UNKNOWN) {}

  /*
    Creates a new chunk.

    \param column_name The name of the column for the chunk.
    \param max_level_name_length If this field length is exceeded, all string fields in a
    column are considered text rather than categorical levels.
    \param max_levels If this number of levels is exceeded, then all string fields
    in a column are considered categorical.
    \param forced_semantics_ If not UNKNOWN, pins the column's
    interpretation regardless of the data observed.
  */
  ColBasedChunk(const std::string &column_name, size_t max_level_name_length, size_t max_levels, Semantics forced_semantics_)
    : column_name_(column_name), max_level_name_length_(max_level_name_length), max_levels_(max_levels), forced_semantics_(forced_semantics_) {}


  /*
   * Destroys this chunk.
   */
  virtual ~ColBasedChunk() {}

  /*
   * Passes a floating point datum to the column handler. If categorical
   * data was previously passed to this handler (or string semantics are
   * forced), this datum will be converted to a string and treated as
   * categorical.
   */
  void process_float(float val) {
    if (cat_data_.size() > 0 || forced_semantics_ == Semantics::CATEGORICAL || forced_semantics_ == Semantics::TEXT) {
      std::string s(std::to_string(val));
      process_categorical(s.begin(), s.end());
    }
    else {
      number_data_.push_back(val);
    }
  }

  /*
   * Passes an integer datum to the column handler. If categorical
   * data was previously passed to this handler (or string semantics are
   * forced), this datum will be converted to a string and treated as
   * categorical.
   */
  void process_integer(long val) {
    if (cat_data_.size() > 0 || forced_semantics_ == Semantics::CATEGORICAL || forced_semantics_ == Semantics::TEXT) {
      std::string s(std::to_string(val));
      process_categorical(s.begin(), s.end());
    }
    else {
      number_data_.push_back(val);
    }
  }

  /*
   * Passes a categorical datum to the column handler. If numerical data
   * was previously passed to this handler, all previous data passed will
   * be converted to a string. An empty field arriving in a numeric column
   * is recorded as 0 rather than forcing a string conversion.
   */
  template
  void process_categorical(Iterator begin, Iterator end) {
    if (forced_semantics_ == Semantics::NUMERIC) {
      number_data_.push_back((float)bsd_strtod(begin, end));
    }
    else if (number_data_.size() > 0) {
      if (begin == end) {
        //std::cout << "{" << std::string(begin, end);
        number_data_.push_back((long)0);
      }
      else {
        //std::cout << "[" << std::string(begin, end);
        convert_to_cat_or_text();
        std::string key(begin, end);
        add_cat_data(key);
      }
    }
    else {
      std::string key(begin, end);
      add_cat_data(key);
    }
  }

  /*
    Returns the semantics of this column, inferred from whichever
    storage is currently populated (categorical wins over text, text
    over numeric).
  */
  Semantics get_semantics() const {
    if (cat_data_.size() > 0) {
      return Semantics::CATEGORICAL;
    }
    else if (text_data_.size() > 0) {
      return Semantics::TEXT;
    }
    else {
      return Semantics::NUMERIC;
    }
  }

  /*
    Returns the type index of the data in this column.
  */
  std::type_index get_type_index() const {
    if (cat_data_.size() > 0) {
      return cat_data_.get_type_index();
    } else if (text_data_.size() > 0) {
      return std::type_index(typeid(text_data_));
    }
    else {
      return number_data_.get_type_index();
    }
  }

  /*
    Returns the narrowest type that can represent both this chunk's data
    and a value of type `other` (std::string if either side is stringy).
  */
  std::type_index get_common_type_index(std::type_index &other) const {
    if (cat_data_.size() > 0 || other == std::type_index(typeid(std::string))) {
      return std::type_index(typeid(std::string));
    }
    else {
      return number_data_.get_common_type_index(other);
    }
  }

  /* Numeric accessor: returns element i of the numeric storage. */
  template
  inline typename std::enable_if::value && Numeric, T>::type get(size_t i) const {
    return number_data_.get(i);
  }

  /* Categorical accessor: returns the interned level id of element i. */
  template
  inline typename std::enable_if::value && !Numeric, T>::type get(size_t i) const {
    return cat_data_.get(i);
  }

  /* Returns the level strings, indexed by interned id. */
  const std::vector &get_cat_keys() const {
    return cat_keys_;
  }

  /* Returns the number of values stored, whichever storage is active. */
  size_t size() const {
    if (cat_data_.size() > 0) {
      return cat_data_.size();
    }
    else if (number_data_.size() > 0) {
      return number_data_.size();
    }
    else {
      return text_data_.size();
    }
  }

  /* Releases the chunk's storage.
     NOTE(review): text_data_ is not cleared here, unlike the numeric and
     categorical stores — confirm whether that is intentional or a leak of
     stale text between uses. */
  void clear() {
    number_data_.clear();
    number_data_.shrink_to_fit();
    cat_data_.clear();
    cat_data_.shrink_to_fit();
    cat_ids_.clear();
    cat_keys_.clear();
    cat_keys_.shrink_to_fit();
  }

  /* NOTE(review): despite the name, this returns the interned level id
     (a size_t) of row idx, not the string itself. */
  size_t get_string(size_t idx) {
    return cat_data_.get(idx);
  }

  /* Interns `key` as a categorical level, assigning a new id on first
     sight, and returns its id. */
  size_t get_string_id(const std::string &key) {
    auto it = cat_ids_.find(key);
    if (it == cat_ids_.end()) {
      std::tie(it, std::ignore) = cat_ids_.insert(std::make_pair(key, cat_ids_.size()));
      cat_keys_.push_back(key);
    }
    return it->second;
  }

  /*
   * Converts all floating point data collected by this handler into
   * categorical data.
   */
  void convert_to_cat_or_text() {
    if (number_data_.size() > 0) {
      for (size_t i = 0; i < number_data_.size(); i++) {
        add_cat_data(std::to_string(number_data_.get(i)));
      }
      number_data_.clear();
      number_data_.shrink_to_fit();
    }
  }

  /* Converts whichever of the numeric or categorical stores is populated
     into plain text storage.
     NOTE(review): when forced_semantics_ == TEXT the first branch is
     taken even with empty numeric data, skipping the categorical branch;
     benign as written because cat_data_ is only populated when TEXT is
     not forced — confirm. */
  void convert_to_text() {
    if (number_data_.size() > 0 || forced_semantics_ == Semantics::TEXT) {
      for (size_t i = 0; i < number_data_.size(); i++) {
        text_data_.push_back(std::to_string(number_data_.get(i)));
      }
      number_data_.clear();
      number_data_.shrink_to_fit();
    }
    else if (cat_data_.size() > 0) {
      for (size_t i = 0; i < cat_data_.size(); i++) {
        text_data_.push_back(cat_keys_[cat_data_.get(i)]);
      }
      cat_data_.clear();
      cat_data_.shrink_to_fit();
      cat_ids_.clear();
      cat_keys_.clear();
      cat_keys_.shrink_to_fit();
    }
  }

  /* Stores one string value, routing it to text or categorical storage
     according to forced semantics and the level-name/level-count caps. */
  void add_cat_data(const std::string &data) {
    if (forced_semantics_ == Semantics::TEXT || text_data_.size() > 0) {
      text_data_.push_back(data);
    }
    else if (forced_semantics_ == Semantics::CATEGORICAL) {
      cat_data_.push_back((long)get_string_id(data));
    }
    else if (data.size() > max_level_name_length_ || cat_keys_.size() > max_levels_) {
      // Caps exceeded: demote the whole column to text.
      convert_to_text();
      text_data_.push_back(data);
    }
    else {
      cat_data_.push_back((long)get_string_id(data));
    }
  }

  /* Returns the text value of row i (valid only for text columns). */
  const std::string &get_text(size_t i) const {
    return text_data_[i];
  }

  /* Copies the numeric storage into `out` (caller sizes the buffer). */
  template
  void copy_numeric_into(T *out) {
    number_data_.copy_into(out);
  }

  /* Copies the categorical id storage into `out` (caller sizes the buffer). */
  template
  void copy_cat_into(T *out) {
    cat_data_.copy_into(out);
  }

  /* Returns the total number of bytes across all text values. */
  size_t get_text_length_sum() const {
    size_t sum = 0;
    for (size_t i = 0; i < text_data_.size(); i++) {
      sum += text_data_[i].size();
    }
    return sum;
  }

  /* Returns the sum of the numeric storage as type T. */
  template
  T get_number_sum() const {
    return number_data_.get_sum();
  }

private:
  std::string column_name_;
  widening_vector_dynamic number_data_;
  widening_vector_dynamic cat_data_;
  std::unordered_map cat_ids_;
  std::vector cat_keys_;
  std::vector text_data_;
  size_t max_level_name_length_;
  size_t max_levels_;
  Semantics forced_semantics_;
};
}
}
#endif
You can install it as follows: 27 | 28 | ``` 29 | conda install swig 30 | ``` 31 | 32 | Building Python 33 | --------------- 34 | 35 | First, go into the `python` directory: 36 | 37 | ``` 38 | cd python/ 39 | ``` 40 | 41 | Then run `setup.py`: 42 | 43 | ``` 44 | python setup.py build install 45 | ``` 46 | 47 | Use the `--prefix` option if you prefer to install ParaText to a 48 | different location: 49 | 50 | ``` 51 | cd python/ 52 | python setup.py build install --prefix=/my/prefix/dir 53 | ``` 54 | 55 | 56 | Using ParaText in Python 57 | ======================== 58 | 59 | First, import the `paratext` Python package. 60 | 61 | ``` 62 | import paratext 63 | ``` 64 | 65 | Loading into Pandas 66 | ------------------- 67 | 68 | A CSV file can be loaded into Pandas in just one line of code using 69 | the `load_csv_to_pandas` function. 70 | 71 | ``` 72 | df = paratext.load_csv_to_pandas("hepatitis.csv") 73 | ``` 74 | 75 | The data frame looks something like this: 76 | 77 | ``` 78 | In [1]: print df.head() 79 | AGE SEX STEROID ANTIVIRALS FATIGUE MALAISE ANOREXIA LIVER_BIG \ 80 | 0 30 male no no no no no no 81 | 1 50 female no no yes no no no 82 | 2 78 female yes no yes no no yes 83 | 3 31 female nan yes no no no yes 84 | 4 34 female yes no no no no yes 85 | 86 | LIVER_FIRM SPLEEN_PALPABLE SPIDERS ASCITES VARICES BILIRUBIN \ 87 | 0 no no no no no 1.0 88 | 1 no no no no no 0.9 89 | 2 no no no no no 0.7 90 | 3 no no no no no 0.7 91 | 4 no no no no no 1.0 92 | 93 | ALK_PHOSPHATE SGOT ALBUMIN PROTIME HISTOLOGY Class 94 | 0 85 18 4.0 NaN no LIVE 95 | 1 135 42 3.5 NaN no LIVE 96 | 2 96 32 4.0 NaN no LIVE 97 | 3 46 52 4.0 80 no LIVE 98 | 4 NaN 200 4.0 NaN no LIVE 99 | ``` 100 | 101 | Loading into Dictionaries (more memory-efficient) 102 | ------------------------------------------------- 103 | 104 | A Python dictionary of arrays is preferable over a DataFrame 105 | if the memory budget is very tight. 
The `load_csv_to_dict` 106 | loads a CSV file, storing the columns as a dictionary of 107 | arrays. 108 | 109 | ``` 110 | dict_frame, levels = paratext.load_csv_to_dict(filename) 111 | ``` 112 | 113 | It returns a two element tuple. The first `dict_frame` is a Python 114 | dictionary that maps column names to column data. The second `levels` 115 | is also a Python dictionary keyed by column name. It contains a list 116 | of level strings for each categorical column. 117 | 118 | The following code visits the columns. For each column, it 119 | prints its name, the first 5 values of its data, and the categorical 120 | levels (`None` if not categorical). 121 | 122 | ``` 123 | for key in dict_frame.keys(): 124 | print key, repr(dict_frame[key][0:5]), levels.get(key, None) 125 | ``` 126 | 127 | This gives the following output: 128 | 129 | ``` 130 | PROTIME array([ nan, nan, nan, 80., nan], dtype=float32) None 131 | LIVER_BIG array([0, 0, 1, 1, 1], dtype=uint8) ['no' 'yes' 'nan'] 132 | ALBUMIN array([ 4. , 3.5, 4. , 4. , 4. ], dtype=float32) None 133 | ALK_PHOSPHATE array([ 85., 135., 96., 46., nan], dtype=float32) None 134 | ANTIVIRALS array([0, 0, 0, 1, 0], dtype=uint8) ['no' 'yes'] 135 | HISTOLOGY array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes'] 136 | BILIRUBIN array([ 1., 0.89999998, 0.69999999, 0.69999999, 1. 
], dtype=float32) None 137 | AGE array([30, 50, 78, 31, 34], dtype=uint8) None 138 | SEX array([0, 1, 1, 1, 1], dtype=uint8) ['male' 'female'] 139 | STEROID array([0, 0, 1, 2, 1], dtype=uint8) ['no' 'yes' 'nan'] 140 | SGOT array([ 18., 42., 32., 52., 200.], dtype=float32) None 141 | MALAISE array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 142 | FATIGUE array([0, 1, 1, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 143 | SPIDERS array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 144 | VARICES array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'nan' 'yes'] 145 | LIVER_FIRM array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 146 | SPLEEN_PALPABLE array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 147 | ASCITES array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 148 | Class array([0, 0, 0, 0, 0], dtype=uint8) ['LIVE' 'DIE'] 149 | ANOREXIA array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 150 | ``` 151 | 152 | All categorical columns in this data set have 3 or fewer levels so 153 | they are all `uint8`. A string representation uses at least 8 times 154 | as much space, but it can also be less computationally efficient. An 155 | integer representation is ideal for learning on categorical columns. 156 | Integer comparisons over contiguous integer buffers are pretty cheap 157 | compared to exhaustive string comparisons on (potentially) 158 | discontiguous string values. This makes a big difference for 159 | combinatorial learning algorithms. 160 | 161 | Handling Multi-Line Fields 162 | -------------------------- 163 | 164 | ParaText supports reading CSV files with multi-line fields in 165 | parallel. This feature must be explicitly activated as it requires 166 | extra overhead to adjust the boundaries of the chunks processed by 167 | the workers. 168 | 169 | ``` 170 | df = paratext.load_csv_to_pandas("messy.csv", allow_quoted_newlines=True) 171 | ``` 172 | 173 | Header Detection 174 | ---------------- 175 | 176 | ParaText detects the presence of a header. 
This can be turned off with 177 | `no_header=True`. 178 | 179 | Column Typing 180 | ------------- 181 | 182 | This library distinguishes between a column's data type and its semantics. 183 | The semantics defines how to interpret a column (e.g. numeric vs. categorical). 184 | and the data type (`uint8`, `int64`, `float`, etc.) is the type for encoding 185 | column values. 186 | 187 | Three semantic types are supported: 188 | 189 | * `num`: numeric data. 190 | 191 | * `cat`: categorical data. 192 | 193 | * `text`: large strings like e-mails and text documents. 194 | 195 | ParaText supports `(u)int(8|16|32|64)|float|double|string` data types. 196 | 197 | Parameters 198 | ---------- 199 | 200 | Most CSV loading functions in ParaText have the following parameters: 201 | 202 | * `cat_names`: A list of column names to force as categorical regardless 203 | of the inferred type. 204 | 205 | * `text_names`: A list of column names that should be treated as rich text 206 | regardless of its inferred type. 207 | 208 | * `num_names`: A list of column names that should be treated as 209 | numeric regardless of its inferred type. 210 | 211 | * `num_threads`: The number of parser threads to spawn. The default 212 | is the number of cores. 213 | 214 | * `allow_quoted_newlines`: Allows multi-line text fields. This 215 | is turned off by default. 216 | 217 | * `no_header`: Do not auto-detect the presence of a header. Assume 218 | the first line is data. This is turned off by default. 219 | 220 | * `max_level_name_length`: If a field's length exceeds this value, 221 | the entire column is treated as text rather than 222 | categorical. The default is unlimited. 223 | 224 | * `max_levels`: The maximum number of levels of a categorical column. 225 | The default is unlimited. 226 | 227 | * `number_only`: Whether it can be safely assumed the columns only 228 | contain numbers. The default is unlimited. 
* `block_size`: The number of bytes to read at a time in each worker
  thread. The default is unlimited.

Escape Characters
-----------------

ParaText supports backslash escape characters:

* `\t`: tab

* `\n`: newline

* `\r`: carriage return

* `\v`: vertical tab

* `\0`: null terminator (0x00)

* `\b`: backspace

* `\xnn`: an 8-bit character represented with a 2-digit hexadecimal number.

* `\unnnn`: a Unicode code point represented as a 4-digit hexadecimal number.

* `\Unnnnnnnn`: a Unicode code point represented as an 8-digit hexadecimal number.

Writing CSV
-----------

ParaText does not yet support parallel CSV writing. However, it bundles a CSV
writer that can be used to write DataFrames with arbitrary string and byte
buffer data in a lossless fashion.

If a character in a Python `string`, `unicode`, or `bytes`
object could be treated as non-data when parsed (e.g. a doublequote or
escape character), it is escaped. Moreover, any character that is outside
the desired encoding is also escaped. This enables, for example,
the lossless writing of non-UTF-8 data to a UTF-8 file.

For example, to restrict the encoding to 7-bit printable ASCII, pass
`out_encoding='printable_ascii'`:

```
import paratext.serial
df = pandas.DataFrame({"X": [b"\xff\\\n \" oh my!"]})
paratext.serial.save_frame("lossless.csv", df, allow_quoted_newlines=True, out_encoding='printable_ascii', dos=False)
```

This results in a file:

```
"X"
"\xff\\
\" oh my!"
```

To preserve UTF-8 characters instead, pass `out_encoding='utf-8'` to ``save_frame``.
287 | 288 | ``` 289 | import paratext.serial 290 | df = pandas.DataFrame({"X": [b"\xff\\\n \" oh my!"],"Y": ["\U0001F600"]}) 291 | paratext.serial.save_frame("lossless2.csv", df, allow_quoted_newlines=True, out_encoding='utf-8', dos=False) 292 | ``` 293 | 294 | Now, the file only escapes cells in the DataFrame with 295 | non-UTF8 data. All other UTF8 characters are preserved. 296 | ``` 297 | "X","Y" 298 | "\xff\\ 299 | \" oh my!","" 300 | ``` 301 | 302 | Other Notes 303 | ----------- 304 | 305 | ParaText is a work-in-progress. There are a few unimplemented features 306 | that may prevent it from working on all CSV files. We note them below. 307 | 308 | 1. There is no way to supply type hints (e.g. `uint64` or `float`) of a 309 | column. Only the interpretation of a column (numeric, categorical, or 310 | text) can be forced. 311 | 312 | 2. DateTime will be supported in a future release. 313 | -------------------------------------------------------------------------------- /src/csv/colbased_worker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. 
See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef WISEIO_PARSE_WORKER_COL_BASED_HPP 29 | #define WISEIO_PARSE_WORKER_COL_BASED_HPP 30 | 31 | #include "util/strings.hpp" 32 | #include "util/widening_vector.hpp" 33 | 34 | #include 35 | #include 36 | #include 37 | 38 | namespace ParaText { 39 | 40 | namespace CSV { 41 | 42 | template 43 | class ColBasedParseWorker { 44 | public: 45 | ColBasedParseWorker(std::vector > &handlers) : handlers_(handlers), lines_parsed_(0), quote_started_('\0'), column_index_(0), escape_jump_(0) {} 46 | 47 | virtual ~ColBasedParseWorker() {} 48 | 49 | void parse(const std::string &filename, 50 | size_t begin, 51 | size_t end, 52 | size_t data_begin, 53 | size_t file_end, 54 | const ParaText::ParseParams ¶ms) { 55 | try { 56 | if (params.number_only) { 57 | parse_impl(filename, begin, end, data_begin, file_end, params); 58 | } 59 | else { 60 | parse_impl(filename, begin, end, data_begin, file_end, params); 61 | } 62 | } 63 | catch (...) 
{ 64 | thread_exception_ = std::current_exception(); 65 | } 66 | } 67 | 68 | std::exception_ptr get_exception() { 69 | return thread_exception_; 70 | } 71 | 72 | template 73 | void parse_impl(const std::string &filename, 74 | size_t begin, 75 | size_t end, 76 | size_t data_begin, 77 | size_t file_end, 78 | const ParaText::ParseParams ¶ms) { 79 | (void)data_begin; 80 | (void)file_end; 81 | std::ifstream in; 82 | in.open(filename.c_str()); 83 | column_index_ = 0; 84 | quote_started_ = '\0'; 85 | escape_jump_ = 0; 86 | size_t current = begin; 87 | size_t spos_line = begin, epos_line = begin; 88 | const size_t block_size = params.block_size; 89 | convert_null_to_space_ = params.convert_null_to_space; 90 | char buf[block_size]; 91 | in.seekg(current, std::ios_base::beg); 92 | definitely_string_ = false; 93 | #ifdef PARALOAD_DEBUG 94 | size_t round = 0; 95 | #endif 96 | while (current <= end) { 97 | if (current % block_size == 0) { /* The block is aligned. */ 98 | in.read(buf, std::min(end - current + 1, block_size)); 99 | } 100 | else { /* Our first read should ensure our further reads are block-aligned. */ 101 | in.read(buf, std::min(end - current + 1, std::min(block_size, current % block_size))); 102 | } 103 | size_t nread = in.gcount(); 104 | #ifdef PARALOAD_DEBUG 105 | if (round == 0) { 106 | std::cout << "R{" << std::string((char *)buf, (char *)buf + nread) << std::endl; 107 | } 108 | round++; 109 | #endif 110 | if (nread == 0) { 111 | break; 112 | } 113 | if (NumberOnly) { 114 | size_t i = 0; 115 | for (; i < nread; i++) { 116 | if (buf[i] == ',') { 117 | process_token_number_only(); 118 | } 119 | else if (buf[i] == '\r') { /* do nothing. 
*/} 120 | else if (buf[i] == '\n') { 121 | epos_line = current + i; 122 | if (epos_line - spos_line > 0) { 123 | process_token_number_only(); 124 | process_newline(); 125 | } 126 | spos_line = epos_line + 1; 127 | epos_line = spos_line; 128 | } else { 129 | token_.push_back(buf[i]); 130 | } 131 | } 132 | } else { 133 | for (size_t i = 0; i < nread;) { 134 | if (quote_started_ != '\0') { 135 | for (; i < nread; i++) { 136 | if (escape_jump_ > 0) { 137 | escape_jump_--; 138 | } 139 | else if (buf[i] == '\\') { 140 | escape_jump_ = 1; 141 | } 142 | else if (buf[i] == quote_started_) { 143 | i++; 144 | quote_started_ = '\0'; 145 | break; 146 | } 147 | token_.push_back(buf[i]); 148 | } 149 | } 150 | else { 151 | for (; i < nread; i++) { 152 | if (escape_jump_ > 0) { 153 | escape_jump_--; 154 | if (buf[i] == 'x') { 155 | escape_jump_ += 2; 156 | } 157 | else if (buf[i] == 'u') { 158 | escape_jump_ += 4; 159 | } 160 | token_.push_back(buf[i]); 161 | } 162 | else if (buf[i] == '\\') { 163 | escape_jump_ = 1; 164 | token_.push_back(buf[i]); 165 | } 166 | else if (buf[i] == '"') { 167 | i++; 168 | quote_started_ = '\"'; 169 | definitely_string_ = true; 170 | break; 171 | } 172 | else if (buf[i] == ',') { 173 | process_token(); 174 | } 175 | else if (buf[i] == '\r') { /* do nothing: dos wastes a byte each line. */ } 176 | else if (buf[i] == '\n') { 177 | epos_line = current + i; 178 | if (epos_line - spos_line > 0) { 179 | process_token(); 180 | process_newline(); 181 | } 182 | spos_line = epos_line + 1; 183 | epos_line = spos_line; 184 | } 185 | else { 186 | token_.push_back(buf[i]); 187 | } 188 | } 189 | } 190 | } 191 | } 192 | current += nread; 193 | } 194 | epos_line = end + 1; 195 | //std::cout << "start line: " << spos_line << " end line: " << epos_line << std::endl; 196 | /* 197 | If we're in the last column position, process the token as some files 198 | do not end with a newline. 
199 | */ 200 | if (token_.size() > 0) { 201 | if (NumberOnly) { 202 | process_token_number_only(); 203 | } else { 204 | process_token(); 205 | } 206 | } 207 | /* 208 | If there was data on the last line, process it. 209 | */ 210 | if (column_index_ > 0) { 211 | process_newline(); 212 | } 213 | #ifdef PARALOAD_DEBUG 214 | std::cout << "lines parsed: " << lines_parsed_ << std::endl; 215 | #endif 216 | return; 217 | } 218 | 219 | void process_newline() { 220 | if (column_index_ != handlers_.size()) { 221 | std::ostringstream ostr; 222 | ostr << "improper number of columns on line number (unquoted in chunk): " << (lines_parsed_ + 1) << ". Expected: " << handlers_.size(); 223 | throw std::logic_error(ostr.str()); 224 | } 225 | column_index_ = 0; 226 | lines_parsed_++; 227 | } 228 | 229 | void process_token_number_only() { 230 | if (column_index_ >= handlers_.size()) { 231 | std::ostringstream ostr; 232 | ostr << "too many columns on line number (unquoted in chunk): " << (lines_parsed_ + 1) << ". Expected: " << handlers_.size(); 233 | throw std::logic_error(ostr.str()); 234 | } 235 | size_t i = 0; 236 | for (; i < token_.size() && isspace(token_[i]); i++) {} 237 | if (i < token_.size()) { 238 | if (token_[i] == '?' && token_.size() - i == 1) { 239 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 240 | } 241 | else if (token_.size() - i == 3 && 242 | ((token_[i] == 'n' || token_[i] == 'N')) 243 | && ((token_[i+1] == 'a' || token_[i+1] == 'A')) 244 | && (token_[i+2] == 'n' || token_[i+2] == 'N')) { 245 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 246 | } 247 | else { 248 | if (token_[i] == '-') { i++; } 249 | for (; i < token_.size() && isdigit(token_[i]); i++) {} 250 | if (i < token_.size() && (token_[i] == '.' 
|| token_[i] == 'E' || token_[i] == 'e')) { 251 | handlers_[column_index_]->process_float(bsd_strtod(token_.begin(), token_.end())); 252 | } 253 | else { 254 | handlers_[column_index_]->process_integer(fast_atoi(token_.begin(), token_.end())); 255 | } 256 | } 257 | } else { 258 | handlers_[column_index_]->process_integer(0); 259 | } 260 | column_index_++; 261 | token_.clear(); 262 | } 263 | 264 | void process_token() { 265 | if (column_index_ >= handlers_.size()) { 266 | std::ostringstream ostr; 267 | ostr << "too many columns on line number (unquoted in chunk): " << (lines_parsed_ + 1) << ". Expected: " << handlers_.size(); 268 | throw std::logic_error(ostr.str()); 269 | } 270 | if (definitely_string_) { 271 | parse_unquoted_string(token_.begin(), token_.end(), std::back_inserter(token_aux_)); 272 | if (convert_null_to_space_) { 273 | convert_null_to_space(token_aux_.begin(), token_aux_.end()); 274 | } 275 | handlers_[column_index_]->process_categorical(token_aux_.begin(), token_aux_.end()); 276 | token_aux_.clear(); 277 | definitely_string_ = false; 278 | } 279 | else { 280 | size_t i = 0; 281 | bool integer_possible = false, float_possible = false, exp_possible = false, handled = false; 282 | for (; i < token_.size() && isspace(token_[i]); i++) {} 283 | if (i < token_.size()) { 284 | if (token_[i] == '-') { 285 | i++; 286 | } 287 | else if (token_[i] == '?' 
&& token_.size() - i == 1) { 288 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 289 | handled = true; 290 | } 291 | else if ((token_[i] == 'n' || token_[i] == 'N') && token_.size() - i == 3) { 292 | if ((token_[i+1] == 'a' || token_[i+1] == 'A') && (token_[i+2] == 'n' || token_[i+2] == 'N')) { 293 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 294 | handled = true; 295 | } 296 | } 297 | } 298 | if (!handled) { 299 | if (i < token_.size()) { 300 | integer_possible = std::isdigit(token_[i]); 301 | i++; 302 | float_possible = integer_possible, exp_possible = integer_possible; 303 | while (i < token_.size() && integer_possible) { 304 | integer_possible = isdigit(token_[i]); 305 | i++; 306 | } 307 | if (i < token_.size()) { 308 | integer_possible = false; 309 | float_possible = token_[i] == '.'; 310 | i++; 311 | while (i < token_.size() && float_possible) { 312 | float_possible = isdigit(token_[i]); 313 | i++; 314 | } 315 | if (float_possible && i < token_.size()) { 316 | float_possible = false; 317 | exp_possible = token_[i] == 'E' || token_[i] == 'e'; 318 | i++; 319 | if (exp_possible && i < token_.size()) { 320 | //std::cout << "A"; 321 | if (token_[i] == '+' || token_[i] == '-') { 322 | //std::cout << "B"; 323 | i++; 324 | if (i < token_.size()) { 325 | //std::cout << "C"; 326 | exp_possible = isdigit(token_[i]); 327 | i++; 328 | while (i < token_.size() && exp_possible) { 329 | exp_possible = isdigit(token_[i]); 330 | i++; 331 | } 332 | } 333 | else { 334 | exp_possible = false; 335 | } 336 | } 337 | else if (isdigit(token_[i])) { 338 | //std::cout << "D"; 339 | while (i < token_.size() && exp_possible) { 340 | exp_possible = isdigit(token_[i]); 341 | i++; 342 | } 343 | //std::cout << "E" << exp_possible << (token_[i-1]); 344 | } 345 | else { 346 | exp_possible = false; 347 | } 348 | } 349 | else { 350 | exp_possible = false; 351 | } 352 | } 353 | } 354 | } 355 | if (integer_possible) { 356 | 
handlers_[column_index_]->process_integer(fast_atoi(token_.begin(), token_.end())); 357 | } 358 | else if (float_possible || exp_possible) { 359 | handlers_[column_index_]->process_float(bsd_strtod(token_.begin(), token_.end())); 360 | } 361 | else { 362 | parse_unquoted_string(token_.begin(), token_.end(), std::back_inserter(token_aux_)); 363 | if (convert_null_to_space_) { 364 | convert_null_to_space(token_aux_.begin(), token_aux_.end()); 365 | } 366 | handlers_[column_index_]->process_categorical(token_aux_.begin(), token_aux_.end()); 367 | token_aux_.clear(); 368 | } 369 | 370 | } 371 | } 372 | column_index_++; 373 | token_.clear(); 374 | } 375 | 376 | void convert_to_cat_or_text(size_t column_index) { 377 | handlers_[column_index]->convert_to_cat_or_text(); 378 | } 379 | 380 | void convert_to_text(size_t column_index) { 381 | handlers_[column_index]->convert_to_text(); 382 | } 383 | 384 | private: 385 | std::vector > handlers_; 386 | std::vector token_; 387 | std::vector token_aux_; 388 | std::vector > long_cache_; 389 | std::vector > double_cache_; 390 | std::vector str_cache_data_; 391 | std::vector str_cache_offsets_; 392 | std::vector str_cache_column_; 393 | bool definitely_string_; 394 | size_t lines_parsed_; 395 | char quote_started_; 396 | size_t column_index_; 397 | size_t escape_jump_; 398 | bool convert_null_to_space_; 399 | std::exception_ptr thread_exception_; 400 | }; 401 | } 402 | } 403 | 404 | #endif 405 | -------------------------------------------------------------------------------- /src/generic/chunker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. 
The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied. See the License for the
   specific language governing permissions and limitations
   under the License.
*/

/*
  Coder: Damian Eads.
*/

#ifndef PARATEXT_LINE_CHUNKER2_HPP
#define PARATEXT_LINE_CHUNKER2_HPP

/* NOTE(review): the targets of these #include directives (and the template
   arguments of std::pair/std::vector/std::make_shared below) appear to have
   been stripped by the extraction of this dump — restore them from the
   original header before compiling. */
#include
#include

#include
#include
#include
#include
#include
#include

#include "quote_adjustment_worker.hpp"

namespace ParaText {

/*
  Finds chunks in a text file that break on an unquoted
  newline. Text files are separated by newline separators. If
  quoted newlines are supported, they are ignored for the purposes
  of separating lines.
*/
class TextChunker {
public:
  /*
    Constructs a new chunker with no chunk boundaries initialized.
  */
  TextChunker() {}

  /*
    Destroys this text chunker.
  */
  virtual ~TextChunker() {}

  /*
    Computes the boundaries of the text chunks.

    \param filename The text filename to open to compute offsets.
    \param starting_offset The starting offset of the first chunk.
    \param maximum_chunks The maximum number of chunks. The number of chunks
                          will be as close to this number as possible.
    \param allow_quoted_newlines Whether newlines inside quotes are data
                          rather than record separators.

    \throws std::logic_error if the file cannot be stat'ed or opened.
  */
  void process(const std::string &filename, size_t starting_offset, size_t maximum_chunks, bool allow_quoted_newlines) {
    filename_ = filename;
    starting_offset_ = starting_offset;
    maximum_chunks_ = maximum_chunks;
    struct stat fs;
    if (stat(filename.c_str(), &fs) == -1) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    length_ = fs.st_size;
    /* lastpos_ is the index of the final byte (inclusive bounds are used
       throughout); an empty file degenerates to 0. */
    if (length_ > 0) {
      lastpos_ = length_ - 1;
    }
    else {
      lastpos_ = 0;
    }
    in_.open(filename.c_str());
    if (!in_) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    compute_offsets(allow_quoted_newlines);
  }

  /*
    Returns the number of chunks determined by this chunker.
  */
  size_t num_chunks() const {
    return start_of_chunk_.size();
  }

  /*
    Returns the (start, end) boundaries of a specific chunk. The ending
    index is always inclusive.
  */
  std::pair get_chunk(size_t index) const {
    return std::make_pair(start_of_chunk_[index], end_of_chunk_[index]);
  }

private:
  /*
    Counts the run of consecutive backslashes ending exactly at
    end_of_chunk, and fetches the byte that follows the chunk (0 if at
    EOF).  An odd count means the chunk boundary splits an escape
    sequence.  Returns (count, following byte).
  */
  std::pair get_num_trailing_escapes(long start_of_chunk, long end_of_chunk) {
    long num_trailing_escapes = 0;
    long k = end_of_chunk;
    char successor = 0;
    if (end_of_chunk < lastpos_) {
      in_.clear();
      in_.seekg(end_of_chunk + 1, std::ios_base::beg);
      in_.read(&successor, 1);
    }

    /* Walk backwards while the bytes are backslashes. */
    for (; k >= start_of_chunk; k--) {
      in_.clear();
      in_.seekg(k, std::ios_base::beg);
      char buf;
      in_.read(&buf, 1);
      size_t nread = in_.gcount();
      if (nread == 0 || buf != '\\') {
        break;
      }
      num_trailing_escapes++;
    }
    return std::make_pair(num_trailing_escapes, successor);
  }

  /*
    Splits [starting_offset_, lastpos_] into up to maximum_chunks_ spans of
    roughly equal size, nudging each boundary forward so it never lands in
    the middle of a backslash escape sequence, then adjusts the boundaries
    to land on (quoted or unquoted) newlines.
  */
  void compute_offsets(bool allow_quoted_newlines = true) {
    const size_t chunk_size = std::max(2L, (long)((length_ - starting_offset_) / maximum_chunks_));
    long start_of_chunk = starting_offset_;
#ifdef PARALOAD_DEBUG
    std::cerr << "number of threads: " << maximum_chunks_ << std::endl;
    std::cerr << "length: " << length_ << std::endl;
#endif
    for (size_t worker_id = 0; worker_id < maximum_chunks_; worker_id++) {
      long end_of_chunk = std::min(lastpos_, start_of_chunk + (long)chunk_size);
      if (end_of_chunk < start_of_chunk) {
        /* Ran off the end of the file: emit a final empty chunk and stop. */
        start_of_chunk = lastpos_ + 1;
        end_of_chunk = lastpos_ + 1;
        start_of_chunk_.push_back(start_of_chunk);
        end_of_chunk_.push_back(end_of_chunk);
        break;
      }
#ifdef PARALOAD_DEBUG
      std::cerr << "initial>>> start_of_chunk: " << start_of_chunk << " end_of_chunk: " << end_of_chunk << std::endl;
#endif
      if (worker_id == maximum_chunks_ - 1) {
        end_of_chunk = lastpos_;   /* last chunk always reaches EOF */
      }
      long trailing_escapes;
      char trailing_successor;
      std::tie(trailing_escapes, trailing_successor) = get_num_trailing_escapes(start_of_chunk, end_of_chunk);
      if (trailing_escapes % 2 == 1) {
        /* The boundary splits an escape sequence; `extra` is how many more
           bytes the sequence needs beyond the backslash. */
        long extra = 0;
        switch (trailing_successor) {
        case 'x': /* \xYY */
          extra = 3;
          break;
        case 'u': /* \uXXXX */
          extra = 5;
          break;
        case 'U': /* \UXXXXXXXX */
          extra = 9;
          break;
        case 'n':
        case '0':
        case 'r':
        case 'v':
        case 't':
        case 'b':
        case '\\':
        case '\"':
        case '\'':
        case '{':
        case '}':
        case ' ':
        case ',':
        case ')':
        case '(':
          extra = 1;
          break;
        default:
          {
            std::ostringstream ostr;
            ostr << "invalid escape character: \\" << trailing_successor;
            throw std::logic_error(ostr.str());
          }
        }
        if (end_of_chunk + extra > lastpos_) {
          std::ostringstream ostr;
          ostr << "file ends with a trailing escape sequence \\" << trailing_successor;
          throw std::logic_error(ostr.str());
        }
        else {
          /* Extend the chunk past the backslash so the escape sequence is
             not split across workers. */
          end_of_chunk++;
#ifdef PARALOAD_DEBUG
          std::cerr << "cover escape: " << end_of_chunk << std::endl;
#endif
        }
      }
      start_of_chunk_.push_back(start_of_chunk);
      end_of_chunk_.push_back(end_of_chunk);
      if (end_of_chunk >= lastpos_) {
        break;
      }
      start_of_chunk = end_of_chunk + 1;
    }
    if (allow_quoted_newlines) {
      adjust_offsets_according_to_quoted_newlines();
    }
    else {
      adjust_offsets_according_to_unquoted_newlines();
    }
    for (size_t chunk_id = 0; chunk_id < start_of_chunk_.size(); chunk_id++) {
#ifdef PARALOAD_DEBUG
      std::cerr << "final>>> start_of_chunk: " << start_of_chunk_[chunk_id] << " end_of_chunk: " << end_of_chunk_[chunk_id] << std::endl;
#endif
    }
  }

  /*
    Moves each chunk's end forward to the next newline (quoting ignored),
    then clips or invalidates the later chunks that the move swallowed.
    Invalidated chunks are marked with (-1, -1).
  */
  void adjust_offsets_according_to_unquoted_newlines() {
    const size_t block_size = 512;
    char buf[block_size];
    for (size_t worker_id = 0; worker_id < start_of_chunk_.size(); worker_id++) {
      if (start_of_chunk_[worker_id] < 0 || end_of_chunk_[worker_id] < 0) {
        continue;   /* already invalidated */
      }
      in_.clear();
      in_.seekg(end_of_chunk_[worker_id], std::ios_base::beg);
      long new_end = end_of_chunk_[worker_id];
      bool new_end_found = false;
      long current = new_end;
      while (in_ && !new_end_found) {
        in_.read(buf, block_size);
        size_t nread = in_.gcount();
        if (nread == 0) {
          break;
        }
        for (size_t i = 0; i < nread; i++) {
          if (buf[i] == '\n') {
            new_end = current + i;
            new_end_found = true;
            break;
          }
        }
        current += nread;
      }
      if (!new_end_found) {
        new_end = lastpos_;   /* no newline until EOF */
      }
      end_of_chunk_[worker_id] = new_end;
      /* Repair any later chunks overlapped by the extension. */
      for (size_t other_worker_id = worker_id + 1; other_worker_id < start_of_chunk_.size(); other_worker_id++) {
        if (end_of_chunk_[other_worker_id] <= new_end || new_end == lastpos_) {
          start_of_chunk_[other_worker_id] = -1;
          end_of_chunk_[other_worker_id] = -1;
        } else if (start_of_chunk_[other_worker_id] <= new_end) {
          start_of_chunk_[other_worker_id] = new_end + 1;
          end_of_chunk_[other_worker_id] = std::max(end_of_chunk_[other_worker_id], new_end + 1);
        }
      }
    }
  }

  /*
    Quoted-newline-aware adjustment.  Each chunk is scanned in parallel by a
    QuoteNewlineAdjustmentWorker that counts unescaped quotes and records the
    first quoted/unquoted newline.  The parity of the cumulative quote count
    at a boundary tells whether that boundary fell inside a quoted cell, and
    chunk ends are then moved to the appropriate newline.  Invalidated
    chunks are marked with (-1, -1).
  */
  void adjust_offsets_according_to_quoted_newlines() {
    std::vector threads;
    std::vector > workers;
    std::exception_ptr thread_exception;
    for (size_t worker_id = 0; worker_id < start_of_chunk_.size(); worker_id++) {
      workers.push_back(std::make_shared(start_of_chunk_[worker_id],
                                         end_of_chunk_[worker_id]));
      threads.emplace_back(&QuoteNewlineAdjustmentWorker::parse, workers.back(), filename_);
    }
    for (size_t thread_id = 0; thread_id < threads.size(); thread_id++) {
      threads[thread_id].join();
      if (!thread_exception) {
        thread_exception = workers[thread_id]->get_exception();
      }
    }
    for (size_t chunk_id = 0; chunk_id < workers.size(); chunk_id++) {
#ifdef PARALOAD_DEBUG
      std::cerr << "quotes>>> wid=" << chunk_id << " start_of_chunk: " << start_of_chunk_[chunk_id] << " end_of_chunk: " << end_of_chunk_[chunk_id] << " num_quotes: " << workers[chunk_id]->get_num_quotes() << std::endl;
#endif
    }
    // We're now outside the parallel region.
    if (thread_exception) {
      std::rethrow_exception(thread_exception);
    }
    /* cumulative_quote_sum[i] = total unescaped quotes in chunks 0..i. */
    std::vector cumulative_quote_sum(workers.size(), 0);
    if (workers.size() > 0) {
      cumulative_quote_sum[0] = workers[0]->get_num_quotes();
      for (size_t i = 1; i < workers.size(); i++) {
        cumulative_quote_sum[i] = cumulative_quote_sum[i - 1] + workers[i]->get_num_quotes();
      }
    }
#ifdef PARALOAD_DEBUG
    std::cerr << "total unescaped quotes: " << cumulative_quote_sum.back() << std::endl;
#endif
    size_t current = 0;
    size_t next = 1;
    while (current < workers.size()) {
      if (end_of_chunk_[current] < 0 || start_of_chunk_[current] < 0) {
        start_of_chunk_[current] = -1;
        end_of_chunk_[current] = -1;
#ifdef PARALOAD_DEBUG
        std::cerr << "negative chunk current=" << current << std::endl;
#endif
        current++;
        /* if (next_wid < workers.size()) {
             quotes_so_far += workers[next_wid]->get_num_quotes();
             next_wid++;
           }
           continue;*/
      }
      else if (cumulative_quote_sum[next-1] % 2 == 0) { /* even number of quotes so far. */
        if (next < workers.size()) {
#ifdef PARALOAD_DEBUG
          std::cerr << "[A] current=" << current << " next=" << next << " quotes_so_far=" << cumulative_quote_sum[current] << std::endl;
#endif
          long pos = workers[next]->get_first_unquoted_newline();
          if (pos >= 0) { /* resolved */
            end_of_chunk_[current] = pos;
            if (end_of_chunk_[next] == pos) { /* take all of next chunk. */
              start_of_chunk_[next] = -1;
              end_of_chunk_[next] = -1;
              current = next + 1;
              next += 2;
            }
            else { /* take part of next chunk. */
              start_of_chunk_[next] = pos + 1;
              current = next;
              next++;
            }
          }
          else { /* no resolution. do not increment current */
            end_of_chunk_[current] = end_of_chunk_[next];
            start_of_chunk_[next] = -1;
            end_of_chunk_[next] = -1;
            next++;
          }
        }
        else { /* EOF resolution. */
          end_of_chunk_[current] = lastpos_;
          break;
        }
      }
      else { /* odd number of quotes so far. */
        if (next < workers.size()) {
#ifdef PARALOAD_DEBUG
          std::cerr << "[B] current=" << current << " next=" << next << " quotes_so_far=" << cumulative_quote_sum[next] << std::endl;
#endif
          long pos = workers[next]->get_first_quoted_newline();
          if (pos >= 0) { /* resolution*/
            end_of_chunk_[current] = pos;
            if (end_of_chunk_[next] == pos) { /*take all of next chunk. */
              start_of_chunk_[next] = -1;
              end_of_chunk_[next] = -1;
              current = next + 1;
              next += 2;
            }
            else { /* take part of next chunk. */
              start_of_chunk_[next] = pos + 1;
              current = next;
              next++;
            }
          }
          else { /*no resolution. take all of chunk. */
            end_of_chunk_[current] = end_of_chunk_[next];
            start_of_chunk_[next] = -1;
            end_of_chunk_[next] = -1;
            next++;
          }
        }
        else { /* no resolution and EOF. */
          std::ostringstream ostr;
          ostr << "The file ends with an open quote; a total of " << cumulative_quote_sum[current] << ")";
          throw std::logic_error(ostr.str());
        }
      }
    }
  }

private:
  std::ifstream in_;
  std::string filename_;
  size_t maximum_chunks_;
  size_t length_;           /* file size in bytes */
  long lastpos_;            /* index of the last byte (inclusive) */
  long starting_offset_;
  std::vector start_of_chunk_;   /* -1 marks an invalidated chunk */
  std::vector end_of_chunk_;     /* inclusive; -1 marks an invalidated chunk */
};
}
#endif
--------------------------------------------------------------------------------
/src/python/numpy_helper.hpp:
--------------------------------------------------------------------------------
/*
   ParaText: parallel text reading
   Copyright (C) 2016. wise.io, Inc.

   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements. See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership. The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied. See the License for the
   specific language governing permissions and limitations
   under the License.

*/

/*
  Coder: Damian Eads.
26 | */ 27 | 28 | #ifndef WISEIO_NUMPY_HELPER_HPP 29 | #define WISEIO_NUMPY_HELPER_HPP 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "../generic/encoding.hpp" 40 | 41 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 42 | #include 43 | #include 44 | 45 | 46 | template 47 | struct numpy_type {}; 48 | 49 | template <> struct numpy_type { static const long id = NPY_UINT8; }; 50 | template <> struct numpy_type { static const long id = NPY_INT8; }; 51 | template <> struct numpy_type { static const long id = NPY_UINT16; }; 52 | template <> struct numpy_type { static const long id = NPY_INT16; }; 53 | template <> struct numpy_type { static const long id = NPY_UINT32; }; 54 | template <> struct numpy_type { static const long id = NPY_INT32; }; 55 | template <> struct numpy_type { static const long id = NPY_UINT64; }; 56 | template <> struct numpy_type { static const long id = NPY_INT64; }; 57 | template <> struct numpy_type { static const long id = NPY_FLOAT; }; 58 | template <> struct numpy_type { static const long id = NPY_DOUBLE; }; 59 | template <> struct numpy_type { static const long id = NPY_OBJECT; }; 60 | 61 | #if defined(__APPLE__) 62 | template <> struct numpy_type { static const long id = NPY_ULONG; }; 63 | #endif 64 | 65 | template 66 | struct AsPythonString {}; 67 | 68 | template <> 69 | struct AsPythonString { 71 | PyObject *operator()(const std::string &in) const { 72 | PyObject *attempt = PyUnicode_FromStringAndSize(in.c_str(), in.size()); 73 | if (attempt == NULL) { 74 | PyErr_Clear(); 75 | attempt = PyBytes_FromStringAndSize(in.c_str(), in.size()); 76 | } 77 | return attempt; 78 | } 79 | }; 80 | 81 | template <> 82 | struct AsPythonString { 84 | PyObject *operator()(const std::string &in) const { 85 | #if PY_MAJOR_VERSION >= 3 86 | return PyBytes_FromStringAndSize(in.c_str(), in.size()); 87 | #else 88 | return PyString_FromStringAndSize(in.c_str(), in.size()); 89 | #endif 90 | } 91 | }; 
92 | 93 | template <> 94 | struct AsPythonString { 96 | PyObject *operator()(const std::string &in) const { 97 | #if PY_MAJOR_VERSION >= 3 98 | return PyBytes_FromStringAndSize(in.c_str(), in.size()); 99 | #else 100 | return PyString_FromStringAndSize(in.c_str(), in.size()); 101 | #endif 102 | } 103 | }; 104 | 105 | template <> 106 | struct AsPythonString { 108 | PyObject *operator()(const std::string &in) const { 109 | PyObject *attempt = PyUnicode_FromStringAndSize(in.c_str(), in.size()); 110 | if (attempt == NULL) { 111 | PyErr_Clear(); 112 | attempt = PyBytes_FromStringAndSize(in.c_str(), in.size()); 113 | } 114 | return attempt; 115 | } 116 | }; 117 | 118 | 119 | template 120 | inline PyObject *as_python_string(const std::string &in) { 121 | AsPythonString encoder; 122 | return encoder(in); 123 | } 124 | 125 | template 126 | struct build_array_impl {}; 127 | 128 | template 129 | struct build_array_impl::value>::type> { 130 | typedef typename Container::value_type value_type; 131 | 132 | static PyObject *build_array(const Container &container) { 133 | npy_intp fdims[] = {(npy_intp)container.size()}; 134 | PyObject *array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 135 | try { 136 | value_type *data = (value_type*)PyArray_DATA((PyArrayObject*)array); 137 | for (size_t i = 0; i < container.size(); i++) { 138 | data[i] = container[i]; 139 | } 140 | } 141 | catch (...) 
{ 142 | Py_XDECREF(array); 143 | array = NULL; 144 | std::rethrow_exception(std::current_exception()); 145 | } 146 | return array; 147 | } 148 | 149 | }; 150 | 151 | template 152 | struct build_array_impl::value>::type> { 153 | 154 | typedef typename Container::value_type value_type; 155 | 156 | static PyObject *build_array(const Container &container) { 157 | size_t sz = (size_t)container.size(); 158 | npy_intp fdims[] = {(npy_intp)sz}; 159 | PyObject *array = (PyObject*)PyArray_SimpleNew(1, fdims, NPY_OBJECT); 160 | try { 161 | for (size_t i = 0; i < container.size(); i++) { 162 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 163 | PyObject *newobj = as_python_string(container[i]); 164 | Py_XDECREF(*ref); 165 | *ref = newobj; 166 | } 167 | } 168 | catch (...) { 169 | for (size_t i = 0; i < sz; i++) { 170 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 171 | Py_XDECREF(*ref); 172 | *ref = Py_None; 173 | Py_XINCREF(*ref); 174 | } 175 | Py_XDECREF(array); 176 | std::rethrow_exception(std::current_exception()); 177 | } 178 | return array; 179 | } 180 | 181 | }; 182 | 183 | 184 | template 185 | struct build_array_from_range_impl {}; 186 | 187 | template 188 | struct build_array_from_range_impl::value_type>::value>::type> { 189 | typedef typename Iterator::value_type value_type; 190 | 191 | static PyObject *build_array(const std::pair &range) { 192 | npy_intp fdims[] = {(npy_intp)std::distance(range.first, range.second)}; 193 | PyObject *array = NULL; 194 | try { 195 | array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 196 | value_type *data = (value_type*)PyArray_DATA((PyArrayObject*)array); 197 | size_t i = 0; 198 | for (Iterator it = range.first; it != range.second; it++, i++) { 199 | data[i] = *it; 200 | } 201 | } 202 | catch (...) 
{ 203 | Py_XDECREF(array); 204 | array = NULL; 205 | std::rethrow_exception(std::current_exception()); 206 | } 207 | return array; 208 | } 209 | }; 210 | 211 | template 212 | struct build_array_from_range_impl::value_type, std::string>::value>::type> { 213 | 214 | typedef typename Iterator::value_type value_type; 215 | 216 | static PyObject *build_array(const std::pair &range) { 217 | size_t sz = (npy_intp)std::distance(range.first, range.second); 218 | npy_intp fdims[] = {(npy_intp)sz}; 219 | PyObject *array = (PyObject*)PyArray_SimpleNew(1, fdims, NPY_OBJECT); 220 | try { 221 | size_t i = 0; 222 | for (Iterator it = range.first; it != range.second; it++, i++) { 223 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 224 | PyObject *newobj = as_python_string(*it); 225 | Py_XDECREF(*ref); 226 | *ref = newobj; 227 | } 228 | } 229 | catch (...) { 230 | size_t i = 0; 231 | for (i = 0; i < sz; i++) { 232 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 233 | Py_XDECREF(*ref); 234 | *ref = Py_None; 235 | Py_XINCREF(*ref); 236 | } 237 | Py_XDECREF(array); 238 | array = NULL; 239 | std::rethrow_exception(std::current_exception()); 240 | } 241 | return array; 242 | } 243 | }; 244 | 245 | template 246 | struct base_insert_populator_impl { 247 | base_insert_populator_impl() {} 248 | virtual ~base_insert_populator_impl() {} 249 | 250 | virtual PyObject *populate(const Populator &populator) = 0; 251 | }; 252 | 253 | template 254 | struct derived_insert_populator_impl : public base_insert_populator_impl { 255 | typedef T value_type; 256 | 257 | derived_insert_populator_impl() {} 258 | virtual ~derived_insert_populator_impl() {} 259 | 260 | virtual PyObject *populate(const Populator &populator) { 261 | npy_intp fdims[] = {(npy_intp)populator.size()}; 262 | PyObject *array = NULL; 263 | try { 264 | array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 265 | value_type *data = 
(value_type*)PyArray_DATA((PyArrayObject*)array); 266 | populator.insert_into_buffer(data); 267 | } 268 | catch (...) { 269 | Py_XDECREF(array); 270 | array = NULL; 271 | std::rethrow_exception(std::current_exception()); 272 | } 273 | return array; 274 | } 275 | }; 276 | 277 | 278 | template 279 | struct string_array_output_iterator : public std::iterator { 280 | string_array_output_iterator(PyArrayObject *array) : i(0), array(array) {} 281 | 282 | inline string_array_output_iterator &operator++() { 283 | PyObject *s = as_python_string(output); 284 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 285 | Py_XDECREF(*ref); 286 | *ref = s; 287 | i++; 288 | return *this; 289 | } 290 | 291 | inline string_array_output_iterator &operator++(int) { 292 | return operator++(); 293 | } 294 | 295 | inline std::string &operator*() { 296 | return output; 297 | } 298 | 299 | long i; 300 | std::string output; 301 | PyArrayObject *array; 302 | }; 303 | 304 | template 305 | struct derived_insert_populator_impl : public base_insert_populator_impl { 306 | typedef std::string value_type; 307 | 308 | derived_insert_populator_impl() {} 309 | virtual ~derived_insert_populator_impl() {} 310 | 311 | virtual PyObject *populate(const Populator &populator) { 312 | using ParaText::Encoding; 313 | npy_intp fdims[] = {(npy_intp)populator.size()}; 314 | PyObject *array = NULL; 315 | array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 316 | try { 317 | ParaText::Encoding in = populator.get_in_encoding(); 318 | ParaText::Encoding out = populator.get_out_encoding(); 319 | if (in == Encoding::UNKNOWN_BYTES && out == Encoding::UNKNOWN_BYTES) { 320 | string_array_output_iterator oit((PyArrayObject*)array); 321 | populator.insert_and_forget(oit); 322 | } 323 | else if (in == Encoding::UNICODE_UTF8 && out == Encoding::UNKNOWN_BYTES) { 324 | string_array_output_iterator oit((PyArrayObject*)array); 325 | populator.insert_and_forget(oit); 326 | } 327 | else if (in == 
Encoding::UNICODE_UTF8 && out == Encoding::UNICODE_UTF8) { 328 | string_array_output_iterator oit((PyArrayObject*)array); 329 | populator.insert_and_forget(oit); 330 | } 331 | else if (in == Encoding::UNKNOWN_BYTES && out == Encoding::UNICODE_UTF8) { 332 | string_array_output_iterator oit((PyArrayObject*)array); 333 | populator.insert_and_forget(oit); 334 | } 335 | else { 336 | throw std::logic_error("unknown encoding"); 337 | } 338 | } 339 | catch (...) { 340 | Py_XDECREF(array); 341 | array = NULL; 342 | std::rethrow_exception(std::current_exception()); 343 | } 344 | return array; 345 | } 346 | }; 347 | 348 | template 349 | PyObject *build_populator(const Populator &populator) { 350 | static std::unordered_map>> 351 | populators({std::make_pair(std::type_index(typeid(uint8_t)), 352 | std::make_shared>()), 353 | std::make_pair(std::type_index(typeid(int8_t)), 354 | std::make_shared>()), 355 | std::make_pair(std::type_index(typeid(uint16_t)), 356 | std::make_shared>()), 357 | std::make_pair(std::type_index(typeid(int16_t)), 358 | std::make_shared>()), 359 | std::make_pair(std::type_index(typeid(uint32_t)), 360 | std::make_shared>()), 361 | std::make_pair(std::type_index(typeid(int32_t)), 362 | std::make_shared>()), 363 | std::make_pair(std::type_index(typeid(uint64_t)), 364 | std::make_shared>()), 365 | std::make_pair(std::type_index(typeid(int64_t)), 366 | std::make_shared>()), 367 | std::make_pair(std::type_index(typeid(float)), 368 | std::make_shared>()), 369 | std::make_pair(std::type_index(typeid(double)), 370 | std::make_shared>()), 371 | std::make_pair(std::type_index(typeid(std::string)), 372 | std::make_shared>()) 373 | }); 374 | auto it = populators.find(populator.get_type_index()); 375 | if (it == populators.end()) { 376 | throw std::logic_error(std::string("cannot process type")); 377 | } 378 | return it->second->populate(populator); 379 | } 380 | 381 | template 382 | PyObject *build_array(const Container &container) { 383 | return 
(PyObject*)build_array_impl::build_array(container); 384 | } 385 | 386 | template 387 | PyObject *build_array_from_range(const std::pair &range) { 388 | return (PyObject*)build_array_from_range_impl::build_array(range); 389 | } 390 | 391 | #endif 392 | --------------------------------------------------------------------------------