├── bench ├── clear_cache.sh ├── compare_files.py ├── convert_files.sh ├── test_disk.sh ├── convert_benchmark_files.sh ├── test_readcsv.R ├── run_experiment.sh ├── run_pyspark_experiment.sh ├── convert.py ├── test_readr.R ├── test_fread.R ├── convert_to_binary.py ├── compile_log_files.py └── generate_experiments.py ├── ci ├── osx_install.sh └── linux_install.sh ├── src ├── generic │ ├── encoding.hpp │ ├── parse_params.hpp │ ├── quote_adjustment_worker.hpp │ └── chunker.hpp ├── paratext_internal.hpp ├── util │ ├── unicode.hpp │ └── safe_string_output.hpp ├── csv │ ├── parallel.hpp │ ├── rowbased_loader.hpp │ ├── rowbased_worker.hpp │ ├── header_parser.hpp │ ├── colbased_chunk.hpp │ └── colbased_worker.hpp ├── paratext_internal.i ├── paratext_internal.cpp ├── diagnostic │ ├── memcopy.hpp │ ├── newline_counter.hpp │ └── parse_and_sum.hpp └── python │ ├── processor.hpp │ ├── python.i │ └── numpy_helper.hpp ├── .travis.yml ├── python ├── paratext │ ├── helpers.py │ ├── serial.py │ └── testing.py └── setup.py ├── tests └── test_paratext.py ├── LICENSE └── README.md /bench/clear_cache.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Ensure all writes are synced to disk. 
4 | sudo bash -c "sync || sync || sync || sync" 5 | # Clear the caches 6 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 7 | -------------------------------------------------------------------------------- /ci/osx_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | brew update 4 | 5 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 6 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh; 7 | else 8 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 9 | fi 10 | 11 | echo "clang++" > .cxx.choice 12 | -------------------------------------------------------------------------------- /bench/compare_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import paratext 4 | import json 5 | 6 | for fn in ["mnist8m", "mnist", "messy", "messy2", "car", "float1", "float2", "float3", "float4"]: 7 | if os.path.exists(fn + ".csv"): 8 | result = paratext.internal_compare(fn) 9 | fid = open(fn + "-compare.json", "w") 10 | json.dumps(fid) 11 | fid.close() 12 | -------------------------------------------------------------------------------- /ci/linux_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #apt-get update -qq 4 | #apt-get install -qq g++-4.8 5 | #add-apt-repository -y ppa:ubuntu-toolchain-r/test 6 | 7 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 8 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 9 | else 10 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 11 | fi 12 | 13 | echo "g++-4.8" > .cxx.choice 14 | 15 | -------------------------------------------------------------------------------- /bench/convert_files.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | 3 | # Convert original CSV files to different formats. 4 | 5 | echo mnist 6 | python convert.py mnist.csv mnist.feather mnist.hdf5 mnist.pkl mnist.npy 7 | echo messy 8 | python convert.py messy.csv messy.feather messy.pkl messy.npy 9 | echo messy2 10 | python convert.py messy2.csv messy2.feather messy2.pkl messy2.npy 11 | echo mnist8m 12 | python convert.py mnist8m.csv mnist8m.feather mnist8m.hdf5 mnist8m.pkl mnist8m.npy 13 | echo car 14 | python convert.py car.csv car.feather car.pkl car.npy 15 | echo floats 16 | python convert.py floats.csv floats.feather floats.pkl floats.npy floats.hdf5 17 | 18 | -------------------------------------------------------------------------------- /bench/test_disk.sh: -------------------------------------------------------------------------------- 1 | # Tests a filesystem's warm and cold performance over time. Run the directory where the device is mounted. 2 | # Throttled disks should result in variable results. Otherwise, the results should be closer to constant. 3 | 4 | device="$1" 5 | log="$2" 6 | 7 | for i in $(seq 1 1000); 8 | do 9 | disk_results="$(sudo bash -c "hdparm -Tt ${device} | grep Timing | sed -e 's/.*=//g' | sed -e 's/ MB\/sec//g'")" 10 | disk_A="$(echo $disk_results | cut -d' ' -f1)" 11 | disk_B="$(echo $disk_results | cut -d' ' -f2)" 12 | sudo bash -c "free && sync && echo 3 > /proc/sys/vm/drop_caches && free" 13 | run_experiment.py - cmd=countnl disk_state="cold" filename="floats.csv" diskA="$disk_A" diskB="$disk_B" log="$log" num_threads=32 14 | run_experiment.py - cmd=countnl disk_state="warm" filename="floats.csv" log="$log" num_threads=32 15 | done 16 | 17 | -------------------------------------------------------------------------------- /bench/convert_benchmark_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Convert original CSV files to different formats. 
4 | 5 | echo mnist 6 | python convert_to_binary.py mnist.csv 1 mnist.feather mnist.hdf5 mnist.pkl mnist.npy 7 | echo messy 8 | python convert_to_binary.py messy.csv 1 messy.feather messy.pkl messy.npy 9 | echo messy2 10 | python convert_to_binary.py messy2.csv 1 messy2.feather messy2.pkl messy2.npy 11 | echo mnist8m 12 | python convert_to_binary.py mnist8m.csv 1 mnist8m.feather mnist8m.hdf5 mnist8m.pkl mnist8m.npy 13 | echo car 14 | python convert_to_binary.py car.csv 1 car.feather car.pkl car.npy 15 | echo floats 16 | python convert_to_binary.py floats.csv 0 floats.feather floats.pkl floats.npy floats.hdf5 17 | python convert_to_binary.py floats2.csv 0 floats2.feather floats2.pkl floats2.npy floats2.hdf5 18 | python convert_to_binary.py floats3.csv 0 floats3.feather floats3.pkl floats3.npy floats3.hdf5 19 | python convert_to_binary.py floats4.csv 0 floats4.feather floats4.pkl floats4.npy floats4.hdf5 20 | 21 | -------------------------------------------------------------------------------- /bench/test_readcsv.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # test_readcsv.R in.csv out.json 4 | # 5 | # Loads the file in.csv into an R data frame. Sums its numeric 6 | # columns. Outputs the run times and memory usage to the JSON file 7 | # out.json. 
8 | 9 | require("rjson", quietly=TRUE) 10 | 11 | memory_usage <- function(){ 12 | return(strtoi(system(paste("ps -o rss ", Sys.getpid(), "| tail -1"), intern=TRUE))*1024) 13 | } 14 | 15 | args <- commandArgs(TRUE) 16 | filename <- args[1] 17 | result_filename <- args[2] 18 | load_tic <- Sys.time() 19 | df = read.csv(filename) 20 | load_toc <- Sys.time() 21 | load_time <- as.double(difftime(load_toc, load_tic, units="secs")) 22 | 23 | mem <- memory_usage() 24 | 25 | sum_tic <- Sys.time() 26 | s <- colSums(Filter(is.numeric, df)) 27 | s2 <- apply(Filter(function(x){!is.numeric(x)}, df), 2, function(x){sum(nchar(x))}) 28 | sum_toc <- Sys.time() 29 | sum_time <- as.double(difftime(sum_toc, sum_tic, units="secs")) 30 | 31 | results = list(cmd = "R-readcsv", load_time = load_time, mem = mem, sum_time = sum_time) 32 | json = rjson::toJSON(results) 33 | write(json, result_filename) 34 | -------------------------------------------------------------------------------- /bench/run_experiment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | json_file="${1:-}" 4 | num_trials="${2:-1}" 5 | did="${3:-normal}" 6 | log_path="${4:-normal}" 7 | 8 | if [ "$json_file" == "" ]; 9 | then 10 | echo "usage: run_experiment.sh json_filename [num_trials:1]" 11 | exit 1 12 | fi 13 | 14 | echo "Starting ${num_trials} trials on ${json_file}" 15 | 16 | if [ "$(grep warm $json_file | wc -l)" == "1" ] 17 | then 18 | echo warm: $json_file 19 | cat $json_file 20 | # First do a cold run and throw away the log. 
21 | sudo bash -c "sync || sync || sync || sync" 22 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 23 | run_experiment.py "$json_file" log="/dev/null" did="$did" 24 | 25 | # Now do x trials 26 | for trials in $(seq 1 $num_trials); do 27 | run_experiment.py "$json_file" did="$did" log_path="$log_path" 28 | done 29 | else 30 | echo cold: $json_file 31 | cat $json_file 32 | for trials in $(seq 1 $num_trials); do 33 | free 34 | sudo bash -c "sync || sync || sync || sync" 35 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 36 | free 37 | sleep 1 38 | run_experiment.py "$json_file" did="$did" log_path="$log_path" 39 | done 40 | fi 41 | -------------------------------------------------------------------------------- /src/generic/encoding.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef PARATEXT_ENCODING_HPP 29 | #define PARATEXT_ENCODING_HPP 30 | 31 | namespace ParaText { 32 | typedef enum {UNKNOWN_BYTES, UNICODE_UTF8, ASCII} Encoding; 33 | 34 | struct as_raw_bytes { 35 | std::string val; 36 | }; 37 | 38 | struct as_utf8 { 39 | std::string val; 40 | }; 41 | 42 | } 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /bench/run_pyspark_experiment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | json_file="${1:-}" 4 | num_trials="${2:-1}" 5 | did="${3:-normal}" 6 | log_path="${4:-normal}" 7 | 8 | if [ "$json_file" == "" ]; 9 | then 10 | echo "usage: run_experiment.sh json_filename [num_trials:1]" 11 | exit 1 12 | fi 13 | 14 | echo "Starting ${num_trials} trials on ${json_file}" 15 | 16 | SPARK_OPTIONS="--driver-memory 300G --executor-memory 300G --num-executors 32 --conf spark.driver.maxResultSize=10g --packages com.databricks:spark-csv_2.11:1.4.0" 17 | 18 | if [ "$(grep warm $json_file | wc -l)" == "1" ] 19 | then 20 | echo warm: $json_file 21 | cat $json_file 22 | # First do a cold run and throw away the log. 
23 | sudo bash -c "sync || sync || sync || sync" 24 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 25 | spark-submit $SPARK_OPTIONS $(which run_experiment.py) "$json_file" log="/dev/null" did="$did" 26 | 27 | # Now do x trials 28 | for trials in $(seq 1 $num_trials); do 29 | spark-submit $SPARK_OPTIONS $(which run_experiment.py) "$json_file" did="$did" log_path="$log_path" 30 | done 31 | else 32 | echo cold: $json_file 33 | cat $json_file 34 | for trials in $(seq 1 $num_trials); do 35 | free 36 | sudo bash -c "sync || sync || sync || sync" 37 | sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" 38 | free 39 | sleep 1 40 | spark-submit $SPARK_OPTIONS $(which run_experiment.py) "$json_file" did="$did" log_path="$log_path" 41 | done 42 | fi 43 | 44 | -------------------------------------------------------------------------------- /src/paratext_internal.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Licensed to the Apache Software Foundation (ASF) under one 3 | or more contributor license agreements. See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. The ASF licenses this file 6 | to you under the Apache License, Version 2.0 (the 7 | "License"); you may not use this file except in compliance 8 | with the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, 13 | software distributed under the License is distributed on an 14 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | KIND, either express or implied. See the License for the 16 | specific language governing permissions and limitations 17 | under the License. 18 | 19 | Copyright (C) wise.io, Inc. 2016. 
20 | */ 21 | #ifndef PARATEXT_PARACSV_HPP 22 | #define PARATEXT_PARACSV_HPP 23 | 24 | #include "generic/parse_params.hpp" 25 | 26 | size_t get_num_cores(); 27 | 28 | std::string as_quoted_string(const std::string &s, bool do_not_escape_newlines = false); 29 | 30 | ParaText::as_raw_bytes get_random_string(size_t length, long seed, long min = 0, long max = 255); 31 | ParaText::as_utf8 get_random_string_utf8(size_t num_sequences, long seed, bool include_null = true); 32 | 33 | size_t get_string_length(const std::string &s); 34 | 35 | bool are_strings_equal(const std::string &x, const std::string &y); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c++ 2 | 3 | env: 4 | - PY_VERSION=2.7 5 | - PY_VERSION=3.5 6 | 7 | os: 8 | - linux 9 | - osx 10 | 11 | addons: 12 | apt: 13 | sources: 14 | - sourceline: 'ppa:ubuntu-toolchain-r/test' 15 | packages: 16 | - g++-4.8 17 | 18 | before_install: 19 | - echo "before_install" 20 | - echo $VIRTUAL_ENV 21 | - export PATH="$HOME/miniconda/bin:$PATH" 22 | - df -h 23 | - date 24 | - pwd 25 | - uname -a 26 | - python -V 27 | - which g++ 28 | - g++ --version 29 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo /bin/bash ci/osx_install.sh; fi 30 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo /bin/bash ci/linux_install.sh; fi 31 | 32 | # command to install dependencies 33 | install: 34 | - export CXX="$(cat .cxx.choice)" 35 | - ls $(dirname $(which g++))/g++* 36 | - echo "install" 37 | - bash miniconda.sh -b -p $HOME/miniconda 38 | - export PATH="$HOME/miniconda/bin:$PATH" 39 | - hash -r 40 | - conda config --set always_yes yes --set changeps1 no 41 | - conda update -q conda 42 | # Useful for debugging any issues with conda 43 | - conda info -a 44 | - conda install python=$PY_VERSION nose 45 | # Replace dep1 dep2 ... 
with your dependencies 46 | - conda create -q -n test-environment python=$PY_VERSION swig=3.0.8 pandas numpy 47 | - source activate test-environment 48 | - cd python 49 | - python setup.py build install 50 | - cd .. 51 | - pwd 52 | 53 | before_script: 54 | - export PY_PREFIX=$(python -c "import sys; print(sys.prefix)") 55 | - echo $PY_PREFIX 56 | - export PYTHONPATH=$PY_PREFIX/lib/python$PY_VERSION/site-packages:$PYTHONPATH 57 | - echo $PYTHONPATH 58 | 59 | # command to run tests 60 | script: nosetests -s --failure-detail --with-xunit tests/test_paratext.py 61 | -------------------------------------------------------------------------------- /bench/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas 4 | import pickle 5 | import feather 6 | import h5py 7 | import numpy as np 8 | import scipy.io as sio 9 | import os 10 | import sys 11 | 12 | def convert_feather(df, output_filename): 13 | feather.write_dataframe(df, output_filename) 14 | 15 | def convert_hdf5(df, output_filename): 16 | X = df.values 17 | f = h5py.File(output_filename, "w") 18 | ds=f.create_dataset("mydataset", X.shape, dtype=X.dtype) 19 | ds[...] 
= X 20 | 21 | def convert_npy(df, output_filename): 22 | X = df.values 23 | np.save(output_filename, X) 24 | 25 | def convert_pkl(df, output_filename): 26 | fid = open(output_filename, "wb") 27 | pickle.dump(df, fid) 28 | fid.close() 29 | 30 | def convert_mat(df, output_filename): 31 | dd = {key: df[key].values.flatten() for key in df.keys()} 32 | sio.savemat(output_filename, dd) 33 | 34 | input_filename = sys.argv[1] 35 | output_filenames = sys.argv[2:] 36 | 37 | if not input_filename.endswith(".csv"): 38 | print "input must be a CSV file (by extension)" 39 | sys.exit(1) 40 | 41 | df = paratext.load_csv_to_pandas(input_filename, allow_quoted_newlines=True) 42 | 43 | for output_filename in output_filenames: 44 | _, extension = os.path.splitext(output_filename) 45 | if extension == ".hdf5": 46 | convert_hdf5(df, output_filename) 47 | elif extension == ".feather": 48 | convert_feather(df, output_filename) 49 | elif extension == ".pkl": 50 | convert_pkl(df, output_filename) 51 | elif extension == ".npy": 52 | convert_npy(df, output_filename) 53 | elif extension == ".mat": 54 | convert_mat(df, output_filename) 55 | else: 56 | print "skipping '%s'; invalid output format '%s'" % (output_filename, extension) 57 | -------------------------------------------------------------------------------- /bench/test_readr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # test_datatable.R in.csv out.json 4 | # 5 | # Loads the file in.csv into an R data frame with fread, sums its numeric 6 | # columns, and outputs the run times and memory usage to the JSON file 7 | # out.json. 
8 | 9 | require("readr", quietly=TRUE) 10 | 11 | "OlsonNames" = function () 12 | { 13 | if (.Platform$OS.type == "windows") 14 | tzdir <- Sys.getenv("TZDIR", file.path(R.home("share"), 15 | "zoneinfo")) 16 | else { 17 | tzdirs <- c(Sys.getenv("TZDIR"), file.path(R.home("share"), 18 | "zoneinfo"), "/usr/share/zoneinfo", "/usr/share/lib/zoneinfo", 19 | "/usr/lib/zoneinfo", "/usr/local/etc/zoneinfo", "/etc/zoneinfo", 20 | "/usr/etc/zoneinfo") 21 | tzdirs <- tzdirs[file.exists(tzdirs)] 22 | if (!length(tzdirs)) { 23 | warning("no Olson database found") 24 | return(character()) 25 | } 26 | else tzdir <- tzdirs[1] 27 | } 28 | x <- list.files(tzdir, recursive = TRUE) 29 | grep("^[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", x, value = TRUE) 30 | } 31 | 32 | memory_usage <- function(){ 33 | return(strtoi(system(paste("ps -o rss ", Sys.getpid(), "| tail -1"), intern=TRUE))*1024) 34 | } 35 | 36 | args <- commandArgs(TRUE) 37 | filename <- args[1] 38 | result_filename <- args[2] 39 | load_tic <- Sys.time() 40 | df = read_csv(filename) 41 | load_toc <- Sys.time() 42 | load_time <- as.double(difftime(load_toc, load_tic, units="secs")) 43 | 44 | mem <- memory_usage() 45 | 46 | sum_tic <- Sys.time() 47 | s <- colSums(Filter(is.numeric, df)) 48 | s2 <- apply(Filter(function(x){!is.numeric(x)}, df), 2, function(x){sum(nchar(x))}) 49 | sum_toc <- Sys.time() 50 | sum_time <- as.double(difftime(sum_toc, sum_tic, units="secs")) 51 | 52 | results = list(cmd = "R-readr", load_time = load_time, mem = mem, sum_time = sum_time) 53 | json = rjson::toJSON(results) 54 | write(json, result_filename) 55 | -------------------------------------------------------------------------------- /bench/test_fread.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # test_fread.R in.csv out.json 4 | # 5 | # Loads the file in.csv into an R data frame with fread, sums its numeric 6 | # columns, and outputs the run times and memory usage to the JSON file 7 
| # out.json. 8 | 9 | require("data.table", quietly=TRUE) 10 | require("rjson", quietly=TRUE) 11 | 12 | "OlsonNames" = function () 13 | { 14 | if (.Platform$OS.type == "windows") 15 | tzdir <- Sys.getenv("TZDIR", file.path(R.home("share"), 16 | "zoneinfo")) 17 | else { 18 | tzdirs <- c(Sys.getenv("TZDIR"), file.path(R.home("share"), 19 | "zoneinfo"), "/usr/share/zoneinfo", "/usr/share/lib/zoneinfo", 20 | "/usr/lib/zoneinfo", "/usr/local/etc/zoneinfo", "/etc/zoneinfo", 21 | "/usr/etc/zoneinfo") 22 | tzdirs <- tzdirs[file.exists(tzdirs)] 23 | if (!length(tzdirs)) { 24 | warning("no Olson database found") 25 | return(character()) 26 | } 27 | else tzdir <- tzdirs[1] 28 | } 29 | x <- list.files(tzdir, recursive = TRUE) 30 | grep("^[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", x, value = TRUE) 31 | } 32 | 33 | memory_usage <- function(){ 34 | return(strtoi(system(paste("ps -o rss ", Sys.getpid(), "| tail -1"), intern=TRUE))*1024) 35 | } 36 | 37 | args <- commandArgs(TRUE) 38 | filename <- args[1] 39 | result_filename <- args[2] 40 | load_tic <- Sys.time() 41 | df = fread(filename) 42 | load_toc <- Sys.time() 43 | load_time <- as.double(difftime(load_toc, load_tic, units="secs")) 44 | 45 | mem <- memory_usage() 46 | 47 | sum_tic <- Sys.time() 48 | s <- colSums(Filter(is.numeric, df)) 49 | s <- s + apply(Filter(function(x){!is.numeric(x)}, df), 2, function(x){sum(nchar(x))}) 50 | sum_toc <- Sys.time() 51 | sum_time <- as.double(difftime(sum_toc, sum_tic, units="secs")) 52 | 53 | results = list(cmd = "R-fread", load_time = load_time, mem = mem, sum_time = sum_time) 54 | json = rjson::toJSON(results) 55 | write(json, result_filename) 56 | -------------------------------------------------------------------------------- /src/generic/parse_params.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 
5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef PARATEXT_PARSE_PARAMS_HPP 29 | #define PARATEXT_PARSE_PARAMS_HPP 30 | 31 | #include 32 | #include 33 | #include "generic/encoding.hpp" 34 | 35 | namespace ParaText { 36 | 37 | typedef enum {ROW_BASED, COL_BASED} ParserType; 38 | typedef enum {NONE, SNAPPY, MSGPACK} Compression; 39 | 40 | typedef enum {CATEGORICAL, NUMERIC, TEXT, UNKNOWN} Semantics; 41 | 42 | template 43 | struct TagEncoding {}; 44 | 45 | struct ColumnInfo { 46 | std::string name; 47 | Semantics semantics; 48 | }; 49 | 50 | struct ParseParams { 51 | ParseParams() : no_header(false), number_only(false), convert_null_to_space(true), block_size(32768), num_threads(16), allow_quoted_newlines(false), max_level_name_length(std::numeric_limits::max()), max_levels(std::numeric_limits::max()), compression(Compression::NONE), parser_type(ParserType::COL_BASED) {} 52 | bool no_header; 53 | bool number_only; 54 | bool compute_sum; 55 | bool convert_null_to_space; 56 | size_t block_size; 57 | size_t num_threads; 58 | bool allow_quoted_newlines; 59 | size_t max_level_name_length; 60 | size_t max_levels; 61 | Compression 
compression; 62 | ParserType parser_type; 63 | }; 64 | 65 | } 66 | #endif 67 | -------------------------------------------------------------------------------- /src/util/unicode.hpp: -------------------------------------------------------------------------------- 1 | #ifndef WISEIO_UNICODE_HPP 2 | #define WISEIO_UNICODE_HPP 3 | 4 | #define UNI_REPLACEMENT_CHAR (WUTF32)0x0000FFFD 5 | #define UNI_MAX_BMP (WUTF32)0x0000FFFF 6 | #define UNI_MAX_UTF16 (WUTF32)0x0010FFFF 7 | #define UNI_MAX_UTF32 (WUTF32)0x7FFFFFFF 8 | #define UNI_MAX_LEGAL_UTF32 (WUTF32)0x0010FFFF 9 | 10 | #define UNI_SUR_HIGH_START (WUTF32)0xD800 11 | #define UNI_SUR_HIGH_END (WUTF32)0xDBFF 12 | #define UNI_SUR_LOW_START (WUTF32)0xDC00 13 | #define UNI_SUR_LOW_END (WUTF32)0xDFFF 14 | 15 | namespace WiseIO { 16 | 17 | template 18 | int convert_utf32_to_utf8(InputIterator start, 19 | InputIterator end, 20 | OutputIterator out, 21 | bool strict = false) { 22 | 23 | typedef unsigned long WUTF32; 24 | typedef unsigned char WUTF8; 25 | static const WUTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 26 | int result = 0; 27 | unsigned char buf[4] = {0,0,0,0}; 28 | for (InputIterator it = start; it != end; it++) { 29 | WUTF32 ch = *it; 30 | unsigned short bytesToWrite = 0; 31 | const WUTF32 byteMask = 0xBF; 32 | const WUTF32 byteMark = 0x80; 33 | if (strict) { 34 | /* UTF-16 surrogate values are illegal in UTF-32 */ 35 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 36 | result = 1; 37 | break; 38 | } 39 | } 40 | /* 41 | * Figure out how many bytes the result will require. Turn any 42 | * illegally large UTF32 things (> Plane 17) into replacement chars. 
43 | */ 44 | if (ch < (WUTF32)0x80) { bytesToWrite = 1; 45 | } else if (ch < (WUTF32)0x800) { bytesToWrite = 2; 46 | } else if (ch < (WUTF32)0x10000) { bytesToWrite = 3; 47 | } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 48 | } else { bytesToWrite = 3; 49 | ch = UNI_REPLACEMENT_CHAR; 50 | result = 1; 51 | break; 52 | } 53 | switch (bytesToWrite) { /* note: everything falls through. */ 54 | case 4: buf[3] = (WUTF8)((ch | byteMark) & byteMask); ch >>= 6; 55 | case 3: buf[2] = (WUTF8)((ch | byteMark) & byteMask); ch >>= 6; 56 | case 2: buf[1] = (WUTF8)((ch | byteMark) & byteMask); ch >>= 6; 57 | case 1: buf[0] = (WUTF8) (ch | firstByteMark[bytesToWrite]); 58 | } 59 | for(int i = 0; i < bytesToWrite; i++) { 60 | *(out++) = buf[i]; 61 | } 62 | } 63 | return result; 64 | } 65 | } 66 | #endif 67 | -------------------------------------------------------------------------------- /python/paratext/helpers.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | # Copyright (C) Wise.io, Inc. 2016. 
19 | 20 | def make_messy_frame(num_rows, num_cols, num_cats, num_ints): 21 | fid = open("/etc/dictionaries-common/words") 22 | words=[line.strip() for line in fid.readlines()] 23 | perm = np.random.permutation(num_cols) 24 | num_catints = num_cats + num_ints 25 | float_ids = perm[num_catints:] 26 | int_ids = perm[num_cats:num_catints] 27 | cat_ids = perm[0:num_cats] 28 | d = {} 29 | dtypes = {} 30 | for col in cat_ids: 31 | X = np.zeros((num_rows,), dtype=np.object); 32 | for row in xrange(0, num_rows): 33 | num_newlines = np.random.randint(3,7) 34 | num_commas = np.random.randint(3,7) 35 | X[row] = "" 36 | tricky_delims = np.asarray(["\n"] * num_newlines + [","] * num_commas) 37 | np.random.shuffle(tricky_delims) 38 | for delim in tricky_delims: 39 | X[row] += string.join(random.sample(words, 5), ' ') 40 | X[row] += delim 41 | X[row] += string.join(random.sample(words, 5), ' ') 42 | d[col] = X 43 | dtypes[col] = 'string' 44 | for col in float_ids: 45 | d[col] = np.random.randn(num_rows) 46 | dtypes[col] = 'float' 47 | min_int = [0, -2**7, 0 , -2**15, 0, -2**31, 0, -2**62] 48 | max_int = [2**8, 2**7, 2**16, 2**15, 2**32, 2**31, 2**62, 2**62] 49 | dtypes_int = ["uint8", "int8", "uint16", "int16", "uint32", "int32", "uint64", "int64"] 50 | for col in int_ids: 51 | j = np.random.randint(0, len(min_int)) 52 | d[col] = np.random.randint(min_int[j], max_int[j], num_rows) 53 | dtypes[col] = dtypes_int[j] 54 | return d, dtypes 55 | -------------------------------------------------------------------------------- /src/csv/parallel.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | ParaText: parallel text reading 3 | Copyright (C) 2016. wise.io, Inc. 4 | 5 | Licensed to the Apache Software Foundation (ASF) under one 6 | or more contributor license agreements. See the NOTICE file 7 | distributed with this work for additional information 8 | regarding copyright ownership. 
The ASF licenses this file 9 | to you under the Apache License, Version 2.0 (the 10 | "License"); you may not use this file except in compliance 11 | with the License. You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, 16 | software distributed under the License is distributed on an 17 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | KIND, either express or implied. See the License for the 19 | specific language governing permissions and limitations 20 | under the License. 21 | */ 22 | 23 | /* 24 | These functions by Guillem Blanco are taken from WiseML. 25 | */ 26 | 27 | #ifndef PARATEXT_PARALLEL_HPP 28 | #define PARATEXT_PARALLEL_HPP 29 | 30 | #include 31 | 32 | /* 33 | * Same as std::for_each but fun must have signature: void f(Iterator). 34 | */ 35 | template 36 | inline F for_each_it(Iterator first, Iterator last, F &&fun) { 37 | for (; first != last; ++first) 38 | fun(first); 39 | return std::move(fun); 40 | } 41 | 42 | /* 43 | * Distributes the application of F in [first, last) among different threads. 44 | */ 45 | template 46 | F parallel_for_each(Iterator first, Iterator last, size_t suggested_num_threads, F &&f) { 47 | using namespace std::placeholders; 48 | 49 | const std::size_t num_elements = std::distance(first, last); 50 | if (num_elements == 0) { 51 | return std::move(f); 52 | } 53 | const size_t num_threads = 54 | std::min(std::max(1UL, suggested_num_threads), num_elements); 55 | const std::size_t elements_thread = num_elements / num_threads; 56 | const std::size_t excess = num_elements % num_threads; 57 | 58 | /* Thread pool */ 59 | std::vector thread_pool; 60 | thread_pool.reserve(num_threads); 61 | 62 | /* Spawn threads */ 63 | Iterator it = first; 64 | for (std::size_t thread_id = 0; thread_id < num_threads; ++thread_id) { 65 | const std::size_t step = elements_thread + (thread_id < excess ? 
1 : 0); 66 | thread_pool 67 | .emplace_back([ it, step, thread_id, f = std::forward(f) ]() { 68 | for_each_it(it, it + step, std::bind(f, _1, thread_id)); 69 | }); 70 | it += step; 71 | } 72 | 73 | /* Join threads */ 74 | for (auto &&thread : thread_pool) { 75 | thread.join(); 76 | } 77 | return std::move(f); 78 | } 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /bench/convert_to_binary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | # Copyright (C) Wise.io, Inc. 2016. 21 | 22 | import pandas 23 | import pickle 24 | import feather 25 | import h5py 26 | import numpy as np 27 | import scipy.io as sio 28 | import os 29 | import sys 30 | 31 | def convert_feather(df, output_filename): 32 | feather.write_dataframe(df, output_filename) 33 | 34 | def convert_hdf5(df, output_filename): 35 | X = df.values 36 | f = h5py.File(output_filename, "w") 37 | ds=f.create_dataset("mydataset", X.shape, dtype=X.dtype) 38 | ds[...] 
import pandas
import pickle
import numpy as np
import os
import sys


def convert_feather(df, output_filename):
    """Write `df` to `output_filename` in Feather format."""
    # Imported lazily so the script is usable without every optional backend.
    import feather
    feather.write_dataframe(df, output_filename)


def convert_hdf5(df, output_filename):
    """Write the values of `df` to an HDF5 file as dataset "mydataset"."""
    import h5py
    X = df.values
    f = h5py.File(output_filename, "w")
    try:
        ds = f.create_dataset("mydataset", X.shape, dtype=X.dtype)
        ds[...] = X
    finally:
        # The original never closed the handle, so data could stay unflushed.
        f.close()


def convert_npy(df, output_filename):
    """Write the values of `df` as a NumPy .npy array."""
    np.save(output_filename, df.values)


def convert_pkl(df, output_filename):
    """Pickle the DataFrame itself (not just its values)."""
    # `with` guarantees the file is closed even if pickling fails.
    with open(output_filename, "wb") as fid:
        pickle.dump(df, fid)


def convert_mat(df, output_filename):
    """Write each column of `df` as a flattened variable in a MATLAB file."""
    import scipy.io as sio
    dd = {key: df[key].values.flatten() for key in df.keys()}
    sio.savemat(output_filename, dd)


# Maps output-file extension -> converter function; used by main().
_CONVERTERS = {
    ".hdf5": convert_hdf5,
    ".feather": convert_feather,
    ".pkl": convert_pkl,
    ".npy": convert_npy,
    ".mat": convert_mat,
}


def main():
    """CLI: convert_to_binary.py input.csv has_header out1 [out2 ...]"""
    input_filename = sys.argv[1]
    has_header = int(sys.argv[2])
    output_filenames = sys.argv[3:]

    if not input_filename.endswith(".csv"):
        # print() call syntax (not the Python 2 statement) keeps this 2/3
        # compatible; the original used `print "..."` which is a syntax
        # error under Python 3.
        print("input must be a CSV file (by extension)")
        sys.exit(1)

    if has_header:
        df = pandas.read_csv(input_filename)
    else:
        df = pandas.read_csv(input_filename, header=None)

    for output_filename in output_filenames:
        _, extension = os.path.splitext(output_filename)
        converter = _CONVERTERS.get(extension)
        if converter is None:
            print("skipping '%s'; invalid output format '%s'"
                  % (output_filename, extension))
        else:
            converter(df, output_filename)


if __name__ == "__main__":
    main()
You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | %module paratext_internal 29 | 30 | #if defined(SWIGPYTHON) 31 | %include "python/python.i" 32 | #else 33 | #warning "no SWIG typemaps defined for the target language" 34 | #endif 35 | 36 | //%include "std_string.i" 37 | %include "std_vector.i" 38 | %include "std_pair.i" 39 | 40 | %ignore ParaText::CSV::ColBasedPopulator::get_type_index() const; 41 | %ignore ParaText::CSV::StringVectorPopulator::get_type_index() const; 42 | %ignore ParaText::CSV::ColBasedLoader::get_type_index(size_t) const; 43 | %ignore ParaText::CSV::ColBasedIterator::operator++(); 44 | %ignore ParaText::CSV::ColBasedIterator::operator++(int); 45 | 46 | namespace std { 47 | %template(vectori) std::vector; 48 | } 49 | 50 | ///////// Generic Header 51 | %include "paratext_internal.hpp" 52 | %{ 53 | #include "paratext_internal.hpp" 54 | %} 55 | 56 | ///////// Parsing Parameters 57 | %include "generic/parse_params.hpp" 58 | %{ 59 | #include "generic/parse_params.hpp" 60 | %} 61 | 62 | %include "generic/encoding.hpp" 63 | %{ 64 | #include "generic/encoding.hpp" 65 | %} 66 | 67 | //////// CSV-loading Stuff 68 | 69 | %include "csv/colbased_loader.hpp" 70 | %{ 71 | #include "csv/colbased_loader.hpp" 72 | %} 73 | 74 | %include "diagnostic/memcopy.hpp" 75 | %{ 76 | #include "diagnostic/memcopy.hpp" 77 | %} 78 | 79 | %include "diagnostic/newline_counter.hpp" 80 | %{ 81 | #include "diagnostic/newline_counter.hpp" 82 | %} 83 | 84 | %include "diagnostic/parse_and_sum.hpp" 85 | %{ 86 | #include 
"diagnostic/parse_and_sum.hpp" 87 | %} 88 | 89 | %include "util/safe_string_output.hpp" 90 | %{ 91 | #include "util/safe_string_output.hpp" 92 | %} 93 | 94 | #if defined(PARATEXT_ROWBASED_CSV) 95 | %include "csv/rowbased_loader.hpp" 96 | %{ 97 | #include "csv/rowbased_loader.hpp" 98 | %} 99 | #endif 100 | -------------------------------------------------------------------------------- /src/paratext_internal.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Licensed to the Apache Software Foundation (ASF) under one 3 | or more contributor license agreements. See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. The ASF licenses this file 6 | to you under the Apache License, Version 2.0 (the 7 | "License"); you may not use this file except in compliance 8 | with the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, 13 | software distributed under the License is distributed on an 14 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | KIND, either express or implied. See the License for the 16 | specific language governing permissions and limitations 17 | under the License. 
18 | */ 19 | 20 | #include "paratext_internal.hpp" 21 | #include "util/strings.hpp" 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | //#include 29 | //#include 30 | 31 | size_t get_num_cores() { 32 | return std::thread::hardware_concurrency(); 33 | } 34 | 35 | std::string as_quoted_string(const std::string &s, bool do_not_escape_newlines) { 36 | return get_quoted_string(s.begin(), s.end(), true, do_not_escape_newlines); 37 | } 38 | 39 | ParaText::as_raw_bytes get_random_string(size_t length, long seed, long min, long max) { 40 | std::string output; 41 | if (seed == 0) { 42 | seed = std::chrono::system_clock::now().time_since_epoch().count(); 43 | } 44 | std::default_random_engine e1(seed); 45 | std::uniform_int_distribution byte_range(min, max); 46 | for (size_t i = 0; i < length; i++) { 47 | output.push_back(byte_range(e1)); 48 | } 49 | ParaText::as_raw_bytes retval; 50 | retval.val = output; 51 | return retval; 52 | } 53 | 54 | ParaText::as_utf8 get_random_string_utf8(size_t num_sequences, long seed, bool include_null) { 55 | std::string output; 56 | if (seed == 0) { 57 | seed = std::chrono::system_clock::now().time_since_epoch().count(); 58 | } 59 | std::default_random_engine e1(seed); 60 | unsigned long surrogate_range = 2048; 61 | std::uniform_int_distribution codepoint_range(include_null ? 
0 : 1, 0x10FFFF - surrogate_range); 62 | std::vector seq; 63 | for (size_t i = 0; i < num_sequences; i++) { 64 | unsigned long val = codepoint_range(e1); 65 | if (val >= 0xD800) { 66 | val += surrogate_range; 67 | } 68 | seq.push_back(val); 69 | } 70 | WiseIO::convert_utf32_to_utf8(seq.begin(), seq.end(), std::back_inserter(output)); 71 | ParaText::as_utf8 retval; 72 | retval.val = output; 73 | return retval; 74 | } 75 | 76 | bool are_strings_equal(const std::string &x, const std::string &y) { 77 | return x == y; 78 | } 79 | 80 | size_t get_string_length(const std::string &s) { 81 | return s.size(); 82 | } 83 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | 2 | import sys, os, os.path, string, subprocess 3 | import json 4 | 5 | # First, check for the presence of swig, which we will need to build 6 | # the Python bindings. 7 | p = subprocess.Popen(["which", "swig"]) 8 | p.communicate("") 9 | if p.returncode != 0: 10 | print("Error: you must install SWIG first.") 11 | sys.exit(1) 12 | 13 | # The multi-threaded reader will core dump unless -pthread is given. 
extra_link_args = []
extra_compile_args = ["-std=c++11", "-Wall", "-Wextra", "-pthread"]
extra_libraries = []

if sys.platform == 'darwin':
    extra_compile_args += ["-m64", "-D_REENTRANT"]
    extra_link_args += []
    extra_libraries += []
elif sys.platform.startswith("linux"):
    extra_compile_args += []
    extra_link_args += []
    extra_libraries += []

# Only pull in setuptools when a setuptools-specific command was requested;
# otherwise plain distutils semantics are kept.
if len(set(('develop', 'release', 'bdist_egg', 'bdist_rpm',
            'bdist_wininst', 'install_egg_info', 'build_sphinx',
            'egg_info', 'easy_install', 'upload',
            )).intersection(sys.argv)) > 0:
    import setuptools
    extra_setuptools_args = dict(
        zip_safe=False,  # the package can run out of an .egg file
    )
else:
    extra_setuptools_args = dict()

from numpy.distutils.core import setup, Extension

version = "0.2.1rc1"

# Generate paratext/__init__.py with the release version baked in.
# `with` guarantees the handle is closed (the original leaked it on error).
with open("paratext/__init__.py", "w") as init_py:
    init_py.write("""
__all__ = ['paratext']

from paratext.core import *

import paratext_internal
import warnings

__version__ = "%s"
""" % version)


print(version)

# Run SWIG to generate the C++ wrapper and the Python proxy module.
swig_cmd = ["swig", "-c++", "-python"]

if sys.version_info >= (3,):
    swig_cmd += ["-py3"]

swig_cmd += ["-I../src/", "-outdir", "./", "../src/paratext_internal.i"]

print("running swig: ", swig_cmd)
p = subprocess.Popen(swig_cmd)
# No stdin pipe was requested, so pass no input to communicate().
p.communicate()
if p.returncode != 0:
    print("Error generating SWIG wrappers.")
    sys.exit(1)

setup(name='paratext',
      version=version,
      description='Reads text files in parallel. The first release includes a parallel CSV reader.',
      long_description="""
See README
""",
      keywords=['csv', 'reading'],
      ext_modules=[Extension('_paratext_internal',
                             ['../src/paratext_internal_wrap.cxx', '../src/paratext_internal.cpp'],
                             extra_link_args=extra_link_args,
                             extra_compile_args=extra_compile_args,
                             include_dirs=['../src/'],
                             libraries=["stdc++"] + extra_libraries),
                   ],
      py_modules=["paratext_internal"],
      author="Damian Eads",
      author_email="damian@wise.io",
      license="Apache License",
      packages=['paratext'],
      url='http://wise.io',
      include_package_data=True,
      **extra_setuptools_args
      )

#!/usr/bin/env python


# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Copyright (C) Wise.io, Inc. 2016.
22 | 23 | # Usage: compile_log_files.py dir1 dir2 dir3 dir4 dir5 24 | 25 | import numpy as np 26 | import seaborn 27 | import pandas 28 | import sys 29 | import os 30 | import json 31 | 32 | # Compiles the log files into a single log. 33 | 34 | def get_dataset_key(fn): 35 | base_fn = os.path.basename(fn) 36 | base_fn = base_fn[:base_fn.find(".")] 37 | if base_fn == "car-pyspark": 38 | return "car" 39 | else: 40 | return base_fn 41 | 42 | # The log directory X where the result files are stored. 43 | # 44 | # 1. It searches directories: X/cmdname/*.log for log files. 45 | # 46 | # 2. It outputs CSV files: log-X.csv 47 | # 48 | 49 | if len(sys.argv) > 2: 50 | for bench_name in ["avgcols", "countnl", "cPickle", "disk-to-mem", "feather", "hdf5", "noop", "npy", "numpy", "pandas", "paratext", "pickle", "pyspark", "R-readcsv", "R-fread", "R-readr", "sframe"]: 51 | df = pandas.DataFrame() 52 | for bench_dir in sys.argv[1:]: 53 | bench_subdir = os.path.join(bench_dir, bench_name) 54 | print bench_subdir 55 | if not os.path.exists(bench_subdir): 56 | continue 57 | bench_files = os.listdir(bench_subdir) 58 | for filename in bench_files: 59 | fn = os.path.join(bench_subdir, filename) 60 | print "opening ", fn 61 | bench_json = json.load(open(fn)) 62 | log = bench_json["log"] 63 | mini_df = pandas.DataFrame() 64 | for i in xrange(0, len(log)): 65 | for key in log[i].keys(): 66 | if log[i][key] == '?': 67 | log[i][key] = None 68 | mini_df = mini_df.append(log[i], ignore_index = True) 69 | mini_df["log_key"] = filename.replace(".log","") 70 | df = df.append(mini_df) 71 | if bench_name in ["R-readcsv", "R-fread", "R-readr"]: 72 | df["mem"] = df["mem"] / 1000000 73 | if "filename" in df.keys(): 74 | df["ds"] = df["filename"].apply(get_dataset_key) 75 | else: 76 | df["ds"] = '?' 
77 | df.to_csv("log-" + bench_name + ".csv", index=False) 78 | else: 79 | print "usage: gen_plot_files.py [log_dir1] [log_dir2]" 80 | -------------------------------------------------------------------------------- /src/csv/rowbased_loader.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef PARATEXT_ROW_BASED_LOADER_HPP 29 | #define PARATEXT_ROW_BASED_LOADER_HPP 30 | 31 | #include "parse_params.hpp" 32 | #include "rowbased_worker.hpp" 33 | #include "chunker.hpp" 34 | #include "header_parser.hpp" 35 | 36 | namespace ParaText { 37 | 38 | namespace CSV { 39 | 40 | class RowBasedLoader { 41 | public: 42 | RowBasedLoader() : length_(0) {} 43 | 44 | void load(const std::string &filename, const ParseParams ¶ms) { 45 | header_parser_.open(filename, params.no_header); 46 | struct stat fs; 47 | if (stat(filename.c_str(), &fs) == -1) { 48 | throw std::logic_error("cannot stat file"); 49 | } 50 | length_ = fs.st_size; 51 | column_infos_.resize(header_parser_.get_num_columns()); 52 | for (size_t i = 0; i < column_infos_.size(); i++) { 53 | column_infos_[i].name = header_parser_.get_column_name(i); 54 | } 55 | if (header_parser_.has_header()) { 56 | chunker_.process(filename, header_parser_.get_end_of_header()+1, params.num_threads, params.allow_quoted_newlines); 57 | } 58 | else { 59 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 60 | } 61 | std::vector threads; 62 | std::vector > workers; 63 | for (size_t worker_id = 0; worker_id < params.num_threads; worker_id++) { 64 | long start_of_chunk, end_of_chunk = 0; 65 | std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 66 | 67 | /* If the chunk was eliminated because its entirety represents quoted 68 | text, do not spawn a worker thread for it. 
*/ 69 | if (start_of_chunk < 0 || end_of_chunk < 0) { 70 | continue; 71 | } 72 | workers.push_back(std::make_shared(start_of_chunk, end_of_chunk, length_, params.block_size, params.compression == Compression::SNAPPY)); 73 | threads.emplace_back(&RowBasedParseWorker::parse, 74 | workers.back(), 75 | filename); 76 | start_of_chunk = end_of_chunk; 77 | } 78 | for (size_t i = 0; i < threads.size(); i++) { 79 | threads[i].join(); 80 | } 81 | } 82 | 83 | /* 84 | Returns the number of columns parsed by this loader. 85 | */ 86 | size_t get_num_columns() const { 87 | return column_infos_.size(); 88 | } 89 | 90 | /* 91 | Returns the info about the column. 92 | */ 93 | ParaText::ColumnInfo get_column_info(size_t column_index) const { 94 | return column_infos_[column_index]; 95 | } 96 | 97 | /* 98 | Returns the categorical levels. 99 | */ 100 | const std::vector &get_levels(size_t column_index) const { 101 | std::cout << level_names_[column_index].size(); 102 | return level_names_[column_index]; 103 | } 104 | 105 | size_t size() const { 106 | return size_.back(); 107 | } 108 | 109 | private: 110 | size_t length_; 111 | mutable std::vector > level_ids_; 112 | mutable std::vector > level_names_; 113 | std::vector size_; 114 | std::vector column_infos_; 115 | TextChunker chunker_; 116 | HeaderParser header_parser_; 117 | }; 118 | } 119 | } 120 | #endif 121 | -------------------------------------------------------------------------------- /src/diagnostic/memcopy.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. 
The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef PARATEXT_DIAGNOSTIC_MEM_COPY_HPP 29 | #define PARATEXT_DIAGNOSTIC_MEM_COPY_HPP 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "generic/chunker.hpp" 41 | #include "csv/header_parser.hpp" 42 | 43 | namespace ParaText { 44 | 45 | namespace Diagnostic { 46 | 47 | class MemCopyWorker { 48 | public: 49 | MemCopyWorker(size_t chunk_start, size_t chunk_end, size_t block_size) 50 | : chunk_start_(chunk_start), 51 | chunk_end_(chunk_end), 52 | block_size_(block_size) {} 53 | 54 | virtual ~MemCopyWorker() {} 55 | 56 | void parse(const std::string &filename) { 57 | try { 58 | parse_impl(filename); 59 | } 60 | catch (...) 
{ 61 | thread_exception_ = std::current_exception(); 62 | } 63 | } 64 | 65 | std::exception_ptr get_exception() { 66 | return thread_exception_; 67 | } 68 | 69 | void parse_impl(const std::string &filename) { 70 | std::ifstream in; 71 | in.open(filename.c_str()); 72 | const size_t block_size = block_size_; 73 | char buf[block_size]; 74 | in.seekg(chunk_start_, std::ios_base::beg); 75 | size_t current = chunk_start_; 76 | while (current <= chunk_end_) { 77 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 78 | size_t nread = in.gcount(); 79 | if (nread == 0) { 80 | break; 81 | } 82 | data_.insert(data_.begin(), buf + 0, buf + nread); 83 | current += nread; 84 | } 85 | } 86 | 87 | private: 88 | size_t chunk_start_; 89 | size_t chunk_end_; 90 | size_t block_size_; 91 | std::vector data_; 92 | std::exception_ptr thread_exception_; 93 | }; 94 | 95 | class MemCopyBaseline { 96 | public: 97 | MemCopyBaseline() {} 98 | 99 | virtual ~MemCopyBaseline() {} 100 | 101 | void load(const std::string &filename, const ParseParams ¶ms) { 102 | std::vector threads; 103 | std::vector > workers; 104 | header_parser_.open(filename, params.no_header); 105 | std::exception_ptr thread_exception; 106 | if (header_parser_.has_header()) { 107 | chunker_.process(filename, header_parser_.get_end_of_header()+1, params.num_threads, params.allow_quoted_newlines); 108 | } 109 | else { 110 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 111 | } 112 | for (size_t worker_id = 0; worker_id < chunker_.num_chunks(); worker_id++) { 113 | size_t start_of_chunk = 0, end_of_chunk = 0; 114 | std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 115 | 116 | if (start_of_chunk == end_of_chunk) { 117 | continue; 118 | } 119 | workers.push_back(std::make_shared(start_of_chunk, end_of_chunk, params.block_size)); 120 | threads.emplace_back(&MemCopyWorker::parse, 121 | workers.back(), 122 | filename); 123 | } 124 | 125 | for (size_t i = 0; i < 
threads.size(); i++) { 126 | threads[i].join(); 127 | if (!thread_exception) { 128 | thread_exception = workers[i]->get_exception(); 129 | } 130 | } 131 | // We're now outside the parallel region. 132 | if (thread_exception) { 133 | std::rethrow_exception(thread_exception); 134 | } 135 | } 136 | 137 | private: 138 | CSV::HeaderParser header_parser_; 139 | TextChunker chunker_; 140 | }; 141 | } 142 | } 143 | #endif 144 | -------------------------------------------------------------------------------- /src/diagnostic/newline_counter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef PARATEXT_DIAGNOSTIC_NL_COUNTER_HPP 29 | #define PARATEXT_DIAGNOSTIC_NL_COUNTER_HPP 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "generic/chunker.hpp" 41 | #include "csv/header_parser.hpp" 42 | 43 | namespace ParaText { 44 | 45 | namespace Diagnostic { 46 | 47 | class NewlineCountWorker { 48 | public: 49 | NewlineCountWorker(size_t chunk_start, size_t chunk_end, size_t block_size) 50 | : chunk_start_(chunk_start), 51 | chunk_end_(chunk_end), 52 | block_size_(block_size) {} 53 | 54 | virtual ~NewlineCountWorker() {} 55 | 56 | void parse(const std::string &filename) { 57 | try { 58 | parse_impl(filename); 59 | } 60 | catch (...) { 61 | thread_exception_ = std::current_exception(); 62 | } 63 | } 64 | 65 | std::exception_ptr get_exception() { 66 | return thread_exception_; 67 | } 68 | 69 | void parse_impl(const std::string &filename) { 70 | std::ifstream in; 71 | in.open(filename.c_str()); 72 | const size_t block_size = block_size_; 73 | char buf[block_size]; 74 | in.seekg(chunk_start_, std::ios_base::beg); 75 | size_t current = chunk_start_; 76 | num_newlines_ = 0; 77 | while (current <= chunk_end_) { 78 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 79 | size_t nread = in.gcount(); 80 | if (nread == 0) { 81 | break; 82 | } 83 | for (size_t i = 0; i < nread; i++) { 84 | if (buf[i] == '\n') { 85 | num_newlines_++; 86 | } 87 | } 88 | current += nread; 89 | } 90 | } 91 | 92 | size_t get_num_newlines() const { 93 | return num_newlines_; 94 | } 95 | 96 | private: 97 | size_t chunk_start_; 98 | size_t chunk_end_; 99 | size_t block_size_; 100 | size_t num_newlines_; 101 | std::exception_ptr thread_exception_; 102 | }; 103 | 104 | class NewlineCounter { 105 | public: 106 | NewlineCounter() {} 107 | 108 | virtual ~NewlineCounter() {} 109 | 110 | size_t load(const std::string &filename, const ParseParams ¶ms) { 111 | std::vector threads; 112 | std::vector 
> workers; 113 | std::exception_ptr thread_exception; 114 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 115 | for (size_t worker_id = 0; worker_id < chunker_.num_chunks(); worker_id++) { 116 | long start_of_chunk = 0, end_of_chunk = 0; 117 | std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 118 | if (start_of_chunk < 0 || end_of_chunk < 0) { 119 | continue; 120 | } 121 | workers.push_back(std::make_shared(start_of_chunk, end_of_chunk, params.block_size)); 122 | threads.emplace_back(&NewlineCountWorker::parse, 123 | workers.back(), 124 | filename); 125 | } 126 | 127 | for (size_t i = 0; i < threads.size(); i++) { 128 | threads[i].join(); 129 | if (!thread_exception) { 130 | thread_exception = workers[i]->get_exception(); 131 | } 132 | } 133 | // We're now outside the parallel region. 134 | if (thread_exception) { 135 | std::rethrow_exception(thread_exception); 136 | } 137 | size_t newline_count = 0; 138 | for (size_t i = 0; i < workers.size(); i++) { 139 | newline_count += workers[i]->get_num_newlines(); 140 | } 141 | return newline_count; 142 | } 143 | 144 | private: 145 | TextChunker chunker_; 146 | }; 147 | } 148 | } 149 | #endif 150 | -------------------------------------------------------------------------------- /src/python/processor.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | File: processor.bhpp 3 | 4 | Author: Damian Eads, PhD 5 | 6 | Copyright (C) wise.io, Inc. 2015. 7 | */ 8 | 9 | #ifndef WISEIO_PROCESSOR_HPP 10 | #define WISEIO_PROCESSOR_HPP 11 | 12 | #include 13 | #include 14 | 15 | #ifdef PARATEXT_DATE_TIME 16 | #include 17 | #endif 18 | 19 | namespace ParaText { 20 | 21 | /* 22 | A generic call-back interface for processing a sequence of 23 | variably-typed objects coming from a different language. 24 | 25 | For example, if a C++ functor requires a sequence, it 26 | can implement the interface of this class. 
27 | 28 | WiseTransfer will iterate over the array, list, tuple, 29 | iterable object, or sequence object. When an element of a 30 | string is found, it calls process_string. If it is floating 31 | point, process_float is called. If it is a long integer, 32 | process_long is called. 33 | */ 34 | class CallbackProcessor { 35 | public: 36 | 37 | /* 38 | The base constructor. Does nothing in this part of sub-object 39 | construction. 40 | */ 41 | CallbackProcessor(); 42 | 43 | /* 44 | The destructor deletes this callback processor and deallocates 45 | any temporary resources needed. 46 | */ 47 | virtual ~CallbackProcessor(); 48 | 49 | /* 50 | Tells the functor to ingest the next element, which is a string. 51 | */ 52 | virtual void process_string(const char *begin, const char *end) = 0; 53 | 54 | /* 55 | Tells the functor to ingest the next element, which is a float. 56 | */ 57 | virtual void process_float(float fval) = 0; 58 | 59 | /* 60 | Tells the functor to ingest the next element, which is a long. 61 | */ 62 | virtual void process_long(long lval) = 0; 63 | 64 | /* 65 | Tells the functor to ingest the next element, which is a bool. 66 | */ 67 | virtual void process_bool(bool bval) = 0; 68 | 69 | /* 70 | Asks the functor to translate an exception thrown while calling 71 | one of the process_XXX methods into a string. 72 | */ 73 | virtual void process_exception(std::exception_ptr ptr, std::string &text) = 0; 74 | 75 | /* 76 | Process the next sparse value. 77 | */ 78 | virtual void process_sparse(size_t row_index, size_t col_index, float value) = 0; 79 | 80 | 81 | /* 82 | Process an empty sparse row. 83 | */ 84 | virtual void process_sparse(size_t row_index) = 0; 85 | }; 86 | 87 | /* 88 | An enumerated type for identifying the type of element in an 89 | IteratorProcessor. 
90 | */ 91 | enum class IteratorElementType {STRING, LONG, BOOL, FLOAT, DATETIME}; 92 | 93 | /* 94 | An IteratorProcessor (iterproc for short) generic interface for 95 | manipulating an iterator over primitive types in another language. 96 | 97 | An iterproc X can be queried if there are any more elements remaining 98 | as follows:: 99 | 100 | while (X.has_next()) { 101 | switch (X.get_type()) { 102 | case IteratorElementType::STRING: 103 | ... 104 | break; 105 | } 106 | X.advance(); 107 | } 108 | 109 | The get_type() function returns the type of the current element. 110 | The advance() function advances the iterator to the next element. 111 | The element that the iterator is currently pointing to can be 112 | retrieved with: 113 | 114 | X.get_string() 115 | X.get_float() 116 | X.get_long() 117 | X.get_bool() 118 | 119 | 120 | */ 121 | class IteratorProcessor { 122 | public: 123 | /* 124 | The base constructor for an IteratorProcessor. This part of 125 | the sub-object construction does nothing. 126 | */ 127 | IteratorProcessor() {} 128 | 129 | /* 130 | A virtual destructor for the iterator processor. 131 | */ 132 | virtual ~IteratorProcessor() {} 133 | 134 | /* 135 | The type of the element to which the iterator currently points. 136 | */ 137 | virtual IteratorElementType get_type() const = 0; 138 | 139 | /* 140 | Retrieves a string representation of the current element. 141 | */ 142 | virtual std::string get_string() const = 0; 143 | 144 | /* 145 | Retrieves a float at the current element. 146 | */ 147 | virtual double get_float() const = 0; 148 | 149 | /* 150 | Retrieves a long at the current element. 151 | */ 152 | virtual long get_long() const = 0; 153 | 154 | #ifdef PARATEXT_DATE_TIME 155 | /* 156 | Retrieves a date time at the current element. 157 | */ 158 | virtual boost::posix_time::ptime get_datetime() const = 0; 159 | #endif 160 | 161 | /* 162 | Retrieves a bool at the current element. 
163 | */ 164 | virtual bool get_bool() const = 0; 165 | 166 | /* 167 | Returns true if and only if this iterator has another element 168 | past the current element. 169 | */ 170 | virtual bool has_next() const = 0; 171 | 172 | /* 173 | Advances the iterator to the next element. 174 | */ 175 | virtual void advance() = 0; 176 | 177 | /* 178 | Returns the number of elements this iterator processes. If this 179 | number is not known, std::numeric_limits::max() is used 180 | instead. 181 | */ 182 | virtual size_t size() const = 0; 183 | }; 184 | } 185 | #endif 186 | -------------------------------------------------------------------------------- /src/python/python.i: -------------------------------------------------------------------------------- 1 | /* 2 | ParaText: parallel text reading 3 | Copyright (C) 2016. wise.io, Inc. 4 | 5 | Licensed to the Apache Software Foundation (ASF) under one 6 | or more contributor license agreements. See the NOTICE file 7 | distributed with this work for additional information 8 | regarding copyright ownership. The ASF licenses this file 9 | to you under the Apache License, Version 2.0 (the 10 | "License"); you may not use this file except in compliance 11 | with the License. You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, 16 | software distributed under the License is distributed on an 17 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | KIND, either express or implied. See the License for the 19 | specific language governing permissions and limitations 20 | under the License. 21 | */ 22 | 23 | /* 24 | Coder: Damian Eads. 
*/

%init %{
import_array();
%}

/*
  Exception-translation macros for use inside hand-written typemaps; they
  mirror the %exception handler below so typemap code and wrapped calls
  report errors to Python identically.
*/
#define PARATEXT_TYPEMAP_EXCEPTION_START try {

#define PARATEXT_TYPEMAP_EXCEPTION_END } catch (const std::string &e) {\
    std::string s = e;\
    SWIG_exception(SWIG_RuntimeError, s.c_str());\
    SWIG_fail;\
  } catch (const std::exception &e) {\
    SWIG_exception(SWIG_RuntimeError, e.what());\
    SWIG_fail;\
  } catch (const char *emsg) {\
    SWIG_exception(SWIG_RuntimeError, emsg);\
    SWIG_fail;\
  } catch (...) {\
    SWIG_exception(SWIG_RuntimeError, "unknown exception");\
    SWIG_fail;\
  }

/* Translate C++ exceptions thrown by any wrapped call into Python
   RuntimeError. Catches std::string, std::exception, C strings, and a
   catch-all. */
%exception {
  try {
    $action
  } catch (const std::string &e) {
    std::string s = e;
    SWIG_exception(SWIG_RuntimeError, s.c_str());
    SWIG_fail;
  } catch (const std::exception &e) {
    SWIG_exception(SWIG_RuntimeError, e.what());
    SWIG_fail;
  } catch (const char *emsg) {
    SWIG_exception(SWIG_RuntimeError, emsg);
    SWIG_fail;
  } catch (...) {
    SWIG_exception(SWIG_RuntimeError, "unknown exception");
    SWIG_fail;
  }
}

/* NOTE(review): the template arguments in the typemaps below appear to have
   been lost in extraction (e.g. "std::vector" with no element type,
   "build_array>" with an orphaned '>'); restore them from version control
   before building. The typemaps are reproduced as found. */

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

/* For const-reference returns, SWIG binds $1 to a pointer: iterate via ->. */
%typemap(out) const std::vector & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range(range);
  }
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) const std::vector & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range(range);
  }
}

%typemap(out) const std::pair, ParaText::TagEncoding > & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range>(range);
  }
}

%typemap(out) const std::pair, ParaText::TagEncoding > & {
  { auto range = std::make_pair($1->begin(), $1->end());
    $result = (PyObject*)::build_array_from_range>(range);
  }
}

%typemap(out) std::vector {
  $result = (PyObject*)::build_array>($1);
}

%typemap(out) ParaText::CSV::ColBasedPopulator {
  $result = (PyObject*)::build_populator($1);
}

%typemap(out) ParaText::CSV::StringVectorPopulator {
  $result = (PyObject*)::build_populator($1);
}

/*
%typemap(in) const std::string & {
  std::string result(ParaText::get_as_string($input, 0));
  $1 = &result;
}

%typemap(in) std::string & {
  std::string result(ParaText::get_as_string($input, 0));
  $1 = &result;
}
*/

/* Input string typemaps: allocate a std::string on the heap (released by the
   matching freearg typemap below); unique_ptr guards against leaks if
   get_as_string throws. */
%typemap(in) const std::string & {
  PARATEXT_TYPEMAP_EXCEPTION_START
  std::unique_ptr result(new std::string(ParaText::get_as_string($input, 0)));
  $1 = result.release();
  PARATEXT_TYPEMAP_EXCEPTION_END
}

%typemap(in) std::string & {
  PARATEXT_TYPEMAP_EXCEPTION_START
  std::unique_ptr result(new std::string(ParaText::get_as_string($input, 0)));
  $1 = result.release();
  PARATEXT_TYPEMAP_EXCEPTION_END
}

%typemap(freearg) const std::string & {
  delete $1;
}

%typemap(freearg) std::string & {
  delete $1;
}


%typemap(out) const std::string & {
  AsPythonString helper;
  $result = helper(*$1);
}

%typemap(out) std::string & {
  AsPythonString helper;
  /* BUGFIX: for reference typemaps $1 is a pointer; dereference it like the
     const-reference typemap above does (was: helper($1)). */
  $result = helper(*$1);
}

%typemap(out) std::string {
  AsPythonString helper;
  $result = helper($1);
}

%typemap(out) ParaText::as_raw_bytes {
  AsPythonString helper;
  $result = helper($1.val);
}

%typemap(out) ParaText::as_utf8 {
  AsPythonString helper;
  $result = helper($1.val);
}



%{
  #include "python/numpy_helper.hpp"
  #include "python/python_input.hpp"
%}

--------------------------------------------------------------------------------
/python/paratext/serial.py:
--------------------------------------------------------------------------------
"""
Single-threaded utilities
"""

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, 16 | # software distributed under the License is distributed on an 17 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | # KIND, either express or implied. See the License for the 19 | # specific language governing permissions and limitations 20 | # under the License. 21 | # 22 | # Copyright (C) Wise.io, Inc. 2016. 23 | 24 | 25 | # 26 | # Coder: Damian Eads 27 | # 28 | 29 | import paratext_internal as pti 30 | 31 | import os 32 | import six 33 | from six.moves import range 34 | from six.moves.urllib_parse import urlparse 35 | 36 | import random 37 | import numpy as np 38 | import string 39 | 40 | import numpy as np 41 | import unittest 42 | import collections 43 | import pandas 44 | import paratext_internal 45 | import os 46 | import random 47 | import sys 48 | 49 | if sys.version_info>=(3,0): 50 | def _repr_bytes(o): 51 | return bytes(repr(o), 'utf-8') 52 | else: 53 | def _repr_bytes(o): 54 | return repr(o) 55 | 56 | def as_quoted_string(s, do_not_escape_newlines=False): 57 | return paratext_internal.as_quoted_string(s, do_not_escape_newlines) 58 | 59 | 60 | def _docstring_parameter(*sub): 61 | def dec(obj): 62 | obj.__doc__ = obj.__doc__.format(*sub) 63 | return obj 64 | return dec 65 | 66 | _save_frame_params = """ 67 | frame : DataFrame, mapping, dict 68 | This object must be DataFrame-like (ie implement .keys() and __getattr__). 69 | 70 | allow_quoted_newlines : bool 71 | Whether to allow newlines to be unescaped in a quoted string. If False, all newline 72 | are encountered are escaped. 73 | 74 | out_encoding : bool 75 | The encoding to use. Valid options include: 76 | - `utf-8`: UTF-8 data 77 | - `arbitrary`: arbitrary bytes (values 0x00-0xFF) 78 | - `printable_ascii`: values 0x20-0xFF. 
0x0A is included if `allow_quoted_newlines`=True 79 | - `ascii`: values 0x00-0x7F 80 | If any values are outside of this range, they are backslash-escaped. 81 | 82 | dos : bool 83 | Whether to add a carriage return before a newline (Windows and DOS compatability). 84 | """ 85 | 86 | 87 | @_docstring_parameter(_save_frame_params) 88 | def save_frame(filename, frame, allow_quoted_newlines=True, out_encoding='arbitrary', dos=False): 89 | """ 90 | Saves a dictframe/DataFrame of sequences of the same size to a CSV file. 91 | 92 | Parameters 93 | ---------- 94 | filename : str, unicode 95 | The name of the filename to write. 96 | 97 | {0} 98 | """ 99 | f = open(filename, 'wb') 100 | write_frame(f, frame, allow_quoted_newlines=allow_quoted_newlines, out_encoding=out_encoding, dos=dos) 101 | f.close() 102 | 103 | @_docstring_parameter(_save_frame_params) 104 | def write_frame(stream, frame, allow_quoted_newlines=True, out_encoding='arbitrary', dos=False): 105 | """ 106 | Saves a dictframe/DataFrame of sequences of the same size to a byte stream (binary mode). 107 | 108 | Parameters 109 | ---------- 110 | filename : str, unicode 111 | The name of the filename to write. 
112 | 113 | {0} 114 | """ 115 | 116 | # In case .keys() is non-deterministic 117 | keys = list(frame.keys()) 118 | cols = [] 119 | 120 | psafe=paratext_internal.SafeStringOutput() 121 | psafe.escape_nonascii(True) 122 | psafe.escape_nonprintables(True) 123 | safe=paratext_internal.SafeStringOutput() 124 | safe.escape_special(True) 125 | if out_encoding == 'utf-8': 126 | safe.escape_nonutf8(True) 127 | elif out_encoding == 'ascii': 128 | safe.escape_nonascii(True) 129 | elif out_encoding == 'printable_ascii': 130 | safe.escape_nonascii(True) 131 | safe.escape_nonprintables(True) 132 | if not allow_quoted_newlines: 133 | safe.escape_newlines(True) 134 | psafe.escape_newlines(True) 135 | safe.double_quote_output(True) 136 | psafe.double_quote_output(True) 137 | for col in range(len(keys)): 138 | if col > 0: 139 | stream.write(b",") 140 | stream.flush() 141 | key = keys[col] 142 | if out_encoding == 'utf-8': 143 | stream.flush() 144 | if isinstance(key, bytes): 145 | skey = psafe.to_raw_string(key) 146 | else: 147 | skey = safe.to_raw_string(key) 148 | stream.write(skey) 149 | else: 150 | stream.flush() 151 | if isinstance(key, bytes): 152 | skey = psafe.to_raw_string(key) 153 | else: 154 | skey = safe.to_raw_string(key) 155 | stream.write(skey) 156 | if isinstance(frame[key], pandas.Series): 157 | cols.append(frame[key].values) 158 | else: 159 | cols.append(np.asarray(frame[key])) 160 | if dos: 161 | stream.write(b"\r\n") 162 | else: 163 | stream.write(b"\n") 164 | if hasattr(frame, "shape"): 165 | num_rows = frame.shape[0] 166 | elif len(keys) == 0: 167 | num_rows = 0 168 | else: 169 | num_rows = len(frame[keys[0]]) 170 | for row in range(num_rows): 171 | for col in range(len(cols)): 172 | if col > 0: 173 | stream.write(b',') 174 | val = cols[col][row] 175 | if np.issubdtype(type(val), np.string_) or np.issubdtype(type(val), np.unicode_) or isinstance(val, six.string_types): 176 | if out_encoding == 'utf-8': 177 | #sval = safe.to_utf8_string(val) 178 | if 
isinstance(val, bytes): 179 | sval = psafe.to_raw_string(val) 180 | else: 181 | sval = safe.to_raw_string(val) 182 | stream.write(sval) 183 | else: 184 | sval = safe.to_raw_string(val) 185 | stream.write(sval) 186 | else: 187 | stream.write(bytes(_repr_bytes(val))) 188 | if dos: 189 | stream.write(b"\r\n") 190 | else: 191 | stream.write(b"\n") 192 | -------------------------------------------------------------------------------- /src/csv/rowbased_worker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
*/

#ifndef PARATEXT_ROW_BASED_WORKER_HPP
#define PARATEXT_ROW_BASED_WORKER_HPP

/* NOTE(review): the header name and the container template arguments below
   were lost in extraction ("#include" with no header, "std::vector" with no
   element type); restore from version control before building. */
#include

namespace ParaText {

namespace CSV {

/*
  Parses one chunk [chunk_start, chunk_end) of a CSV file into a row-based
  binary encoding: each cell is tagged 0 (long), 1 (double), or 2 (string,
  length-prefixed), and each row is appended to rows_ (snappy-compressed
  when compression_ is set).
 */
class RowBasedParseWorker {
public:
  /*
    chunk_start/chunk_end: byte range of the file this worker parses.
    file_size: total file size (stored, not read in this view).
    block_size: read-buffer size in bytes.
    compression: snappy-compress each encoded row when true.
   */
  RowBasedParseWorker(size_t chunk_start, size_t chunk_end, size_t file_size, size_t block_size, bool compression)
    : chunk_start_(chunk_start),
      chunk_end_(chunk_end),
      file_size_(file_size),
      block_size_(block_size),
      compression_(compression) {}

  virtual ~RowBasedParseWorker() {}

  /*
    Reads the chunk block-by-block and runs a per-byte state machine to
    tokenize cells, encoding each completed cell into `input` and each
    completed row into rows_.
   */
  void parse(const std::string &filename) {
    std::ifstream in;
    in.open(filename.c_str());
    const size_t block_size = block_size_;
    /* NOTE(review): variable-length array is a compiler extension, not
       standard C++; consider std::vector for the buffer. */
    char buf[block_size];
    in.seekg(chunk_start_, std::ios_base::beg);
    size_t current = chunk_start_;
    uint8_t state = 0;
    //std::array staters;
    // 0: assumed negative-integer
    // 1: assumed integer, only digits encountered
    // 2: assumed float, '.' encountered
    // 3: assumed float, digits encountered before and after '.'
    // 4: assumed float, 'e' encountered
    // 5: closed-string
    // 6: open-string, '"' encountered
    // 7: unquoted delimiter
    // 8: unquoted newline
    std::vector token;       // bytes of the cell currently being tokenized
    state = 0;
    std::vector input;       // encoded bytes of the row currently being built
    //msgpack::sbuffer ss;
    std::string output;      // scratch buffer for snappy compression
    column_index_ = 0;
    while (in && current < chunk_end_) {
      in.read(buf, std::min(chunk_end_ - current, block_size));
      size_t nread = in.gcount();
      if (nread == 0) {
        break;
      }
      size_t i = 0;
      if (state == 6) { /* open quote: consume bytes until the closing '"'. */
        for (; i < nread; i++) {
          if (buf[i] == '\"') {
            i++;
            state = 5;
            break;
          }
          else {
            token.push_back(buf[i]);
          }
        }
      }
      if (state < 4) {
        /* NOTE(review): if the quote loop above exhausted the block, i may
           equal nread here and buf[i] reads one past the data — confirm. */
        if (buf[i] == 'E' || buf[i] == 'e') {
          token.push_back(buf[i]);
          i++;
          state = 4;
        }
      }
      /* NOTE(review): this loop re-declares i = 0, shadowing the outer i and
         apparently re-scanning any quoted prefix consumed above — confirm
         whether "for (; i < nread; i++)" was intended. */
      for (size_t i = 0; i < nread; i++) {
        if (buf[i] >= 0x3A) { /* past '9': letters etc. */
          if (state >= 4) {
            state = 5; /* demote to string */
            token.push_back(buf[i]);
          }
          else if (buf[i] == 'E' || buf[i] == 'e') {
            state = 4; /* scientific-notation exponent */
            token.push_back(buf[i]);
          }
        }
        else if (buf[i] >= 0x30) { /* digits '0'-'9' */
          token.push_back(buf[i]);
        }
        else {
          if (buf[i] == ',' || buf[i] == '\n') {
            /* End of cell: encode token according to its inferred state. */
            //std::cout << "[" << (int)state << "," << std::string(token.begin(), token.end()) << "]" << std::endl;
            if (state < 2) { /* integer cell: tag 0 + raw long bytes */
              input.push_back(0);
              long val = fast_atoi(token.begin(), token.end());
              unsigned char *bb = (unsigned char *)(void*)&val;
              //input.insert(0);
              input.insert(input.end(), bb, bb + sizeof(long));
#if 0
              msgpack::pack(ss, val);
              input.insert(input.end(), ss.data(), ss.data() + ss.size());
              ss.clear();
#endif
#if 0
              if (val >= 0 && val < 128) {
                unsigned char v = (unsigned char)val;
                unsigned char *bb = (unsigned char *)(void*)&v;
                input.insert(input.end(), bb, bb + 1);
              }
              else {
                input.push_back(128);
                unsigned char *bb = (unsigned char *)(void*)&val;
                input.insert(input.end(), bb, bb + sizeof(long));
              }
#endif
            }
            else if (state < 5) { /* float cell: tag 1 + raw double bytes */
              input.push_back(1);
              double val = bsd_strtod(token.begin(), token.end());
              unsigned char *bb = (unsigned char *)(void*)&val;
              input.insert(input.end(), bb, bb + sizeof(double));
            }
            else { /* string cell: tag 2 + length + bytes */
              input.push_back(2);
              long len = token.size();
              unsigned char *bl = (unsigned char *)(void*)&len;
              input.insert(input.end(), bl, bl + sizeof(long));
              input.insert(input.end(), token.begin(), token.end());
            }
            /* Remember per-column start states from the first row so later
               rows begin each cell in the same state. */
            if (rows_.size() == 0) {
              starting_state_.push_back(state);
            }
            column_index_++;
            if (column_index_ < starting_state_.size()) {
              state = starting_state_[column_index_];
            }
            else {
              state = 0;
            }
            token.clear();
          }
          else if (buf[i] == '.') {
            if (state < 2) {
              state = 3; /* integer so far -> float */
            }
            else {
              state = 5; /* second '.' -> string */
            }
            token.push_back('.');
          }
          else if (buf[i] == '"') {
            if (state == 6) {
              state = 5; /* closing quote */
            }
            else {
              state = 6; /* opening quote */
            }
          }
          else {
            token.push_back(buf[i]);
          }
          if (buf[i] == '\n') {
            /* End of row: flush the encoded row (compressed if requested). */
            //std::cout << input.size() << std::endl;
            if (compression_) {
              snappy::Compress((const char *)input.data(), input.size(), &output);
              input.clear();
              rows_.emplace_back(output.begin(), output.end());
            }
            else {
              rows_.emplace_back(input.begin(), input.end());
              input.clear();
            }
          }
        }
      }
      current += nread;
    }
    /* Flush a final row that did not end in a newline. */
    if (input.size() > 0) {
      if (compression_) {
        snappy::Compress((const char*)input.data(), input.size(), &output);
        input.clear();
        rows_.emplace_back(output.begin(), output.end());
      }
      else {
        rows_.emplace_back(input.begin(), input.end());
        input.clear();
      }
    }
  }

private:
  size_t chunk_start_;            // first byte of this worker's chunk
  size_t chunk_end_;              // one past the last byte of the chunk
  size_t file_size_;              // total file size
  size_t column_index_;           // column currently being parsed
  const size_t block_size_;       // read-buffer size
  bool compression_;              // snappy-compress rows when true
  std::vector maximum_values_;
  std::vector > rows_;            // encoded (optionally compressed) rows
  std::vector starting_state_;    // per-column start state from row 0
};
}
}
#endif
--------------------------------------------------------------------------------
/src/csv/header_parser.hpp:
--------------------------------------------------------------------------------
/*

   ParaText: parallel text reading
   Copyright (C) 2016.
wise.io, Inc.

   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements. See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership. The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied. See the License for the
   specific language governing permissions and limitations
   under the License.

*/

/*
   Coder: Damian Eads.
*/

#ifndef PARATEXT_HEADER_PARSER_HPP
#define PARATEXT_HEADER_PARSER_HPP

/* NOTE(review): header names and container template arguments were lost in
   extraction ("#include" with no header, "std::vector" with no element
   type); restore from version control before building. */
#include
#include
#include

#include "util/strings.hpp"

namespace ParaText {

namespace CSV {

/*
  Parses the first line of a CSV file to determine the header.
 */
class HeaderParser {
public:
  /*
    Constructs an uninitialized header parser.
   */
  HeaderParser() : length_(0), end_of_header_(0), has_header_(false) {}

  /*
    Destroys this parser.
   */
  virtual ~HeaderParser() {}

  /*
    Opens a file and parses its header. Throws std::logic_error if the file
    cannot be stat'ed or opened. If no_header is true, synthetic names
    ("col0", "col1", ...) are generated instead.
   */
  void open(const std::string &filename, bool no_header) {
    struct stat fs;
    if (stat(filename.c_str(), &fs) == -1) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    length_ = fs.st_size;
    in_.open(filename);
    if (!in_) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    parse_header(no_header);
  }

  /*
    Returns the number of columns detected in the header.
   */
  size_t get_num_columns() const {
    return column_names_.size();
  }

  /*
    Adds a column of a specified name, after unquoting it and converting
    embedded NULs to spaces.
   */
  void add_column_name(const std::string &name) {
    //std::cerr << "col " << column_names_.size() << ": " << name << std::endl;

    std::string transformed_name;
    parse_unquoted_string(name.begin(), name.end(), std::back_inserter(transformed_name));
    convert_null_to_space(transformed_name.begin(), transformed_name.end());
    column_names_.push_back(transformed_name);
  }

  /*
    Returns a specific name of a column.
   */
  const std::string &get_column_name(size_t index) const {
    return column_names_[index];
  }

  /*
    Parses a header: scans the first non-whitespace line, splitting on
    commas with backslash-escape and quote handling. If the resulting names
    are not unique (or no_header is set), they are replaced with synthetic
    "colN" names and end_of_header_ is reset to 0.
   */
  void parse_header(bool no_header=false) {
    std::string token;
    size_t current = 0;
    size_t block_size = 4096;
    size_t escape_jump = 0;
    /* NOTE(review): VLA is a compiler extension, not standard C++. */
    char buf[block_size];
    char quote_started = 0;      // the active quote char ('"' or '\''), 0 if none
    bool eoh_encountered = false;
    bool soh_encountered = false;
    in_.seekg(0, std::ios_base::beg);
    while (current < length_ && !eoh_encountered) {
      if (current % block_size == 0) { /* The block is aligned. */
        in_.read(buf, std::min(length_ - current, block_size));
      }
      else { /* Our first read should ensure our further reads are block-aligned. */
        in_.read(buf, std::min(length_ - current, std::min(block_size, current % block_size)));
      }
      size_t nread = in_.gcount();
      size_t i = 0;
      /* ignore leading whitespace in the file. */
      while (i < nread && !soh_encountered) {
        if (isspace(buf[i])) {
          i++; /* eat the whitespace. */
        } else {
          soh_encountered = true;
          /* do not do i++. we need to process it like non-whitespace */
        }
      }
      while (i < nread && !eoh_encountered) {
        if (quote_started) {
          /* NOTE(review): in this branch the backslash that starts an escape
             is itself pushed into token, and push_back also runs after
             escape_jump-- — confirm that quoted names keep backslashes
             intentionally (parse_unquoted_string may strip them later). */
          for (; i < nread; i++) {
            if (escape_jump > 0) {
              escape_jump--;
            }
            else if (buf[i] == '\\') {
              escape_jump = 1;
            }
            else if (buf[i] == quote_started) {
              i++;
              quote_started = 0;
              break;
            }
            token.push_back(buf[i]);
          }
        }
        else {
          for (; i < nread; i++) {
            if (escape_jump > 0) {
              token.push_back(buf[i]);
              escape_jump--;
            }
            else if (buf[i] == '\\') {
              token.push_back(buf[i]);
              escape_jump = 1;
            }
            else if (buf[i] == '\"' || buf[i] == '\'') {
              quote_started = buf[i];
              i++;
              break;
            }
            else if (buf[i] == ',') {
              add_column_name(token);
              token.clear();
            }
            else if (buf[i] == '\r') { /* do nothing: dos wastes a byte each line. */ }
            else if (buf[i] == '\n') {
              add_column_name(token);
              token.clear();
              end_of_header_ = current + i;
              eoh_encountered = true;
              i++;
              break;
            }
            else {
              token.push_back(buf[i]);
            }
          }
        }
      }
      current += nread;
    }
    if (!soh_encountered) { /* If this is just a file of whitespace, then the end of header is the last pos in the file. */
      end_of_header_ = current;
    }
    /* Uniqueness check: duplicate names mean the first line is data, not a
       header. */
    std::unordered_set cnset;
    for (auto &cname : column_names_) {
      cnset.insert(cname);
    }
    has_header_ = true;
    if (cnset.size() != column_names_.size() || no_header) {
      has_header_ = false;
#ifdef PARALOAD_DEBUG
      std::cout << "column names not unique: " << cnset.size() << " unique column names found." ;
#endif
      size_t num_columns = column_names_.size();
      column_names_.clear();
      for (size_t i = 0; i < num_columns; i++) {
        std::ostringstream ostr;
        ostr << "col" << i;
        std::string sstr(ostr.str());
        column_names_.push_back(sstr);
      }
      end_of_header_ = 0;
    }
#ifdef PARALOAD_DEBUG
    std::cout << "Total columns in header: " << column_names_.size() << std::endl;
#endif
    return;
  }

  /*
    Returns the end of the header (byte offset; 0 when no header).
   */
  size_t get_end_of_header() const {
    return end_of_header_;
  }

  /* True iff the first line was judged to be a real header. */
  bool has_header() const {
    return has_header_;
  }

private:
  std::ifstream in_;              // the open file stream
  std::vector column_names_;      // parsed (or synthesized) column names
  size_t length_;                 // total file length in bytes
  size_t end_of_header_;          // byte offset of the header's newline
  bool has_header_;               // result of the uniqueness heuristic
};
}
}
#endif
--------------------------------------------------------------------------------
/tests/test_paratext.py:
--------------------------------------------------------------------------------
import os
import unittest
import paratext.testing
import paratext.serial
from paratext.testing import assert_dictframe_almost_equal, generate_tempfile, generate_tempfilename
import pandas.util.testing
import numpy as np
import logging

class TestBasicFiles:

    def do_basic_nums(self, dtype, num_rows, num_columns, num_threads, number_only, no_header):
        # Builds a numeric CSV of num_rows x num_columns, loads it with the
        # given thread count, and compares against the expected dictframe.
        if no_header:
            filedata = ''
            keys = ["col%d" % k for k in range(num_columns)]
        else:
            keys = ["A", "B", "C", "D", "E", "F",
"G", "H", "I", "J"] 18 | keys = keys[0:num_columns] 19 | filedata = ','.join(keys[0:num_columns]) + "\n" 20 | expected = {} 21 | for key in keys: 22 | expected[key] = [] 23 | for row in range(num_rows): 24 | if np.issubdtype(dtype, np.integer): 25 | row_data = [row*i for i in range(num_columns)] 26 | else: 27 | row_data = np.random.random((num_columns,)) 28 | filedata += ",".join([str(v) for v in row_data]) + "\n" 29 | for k in range(len(keys)): 30 | expected[keys[k]].append(row_data[k]) 31 | with generate_tempfile(filedata.encode("utf-8")) as fn: 32 | logging.debug("filename: %s" % fn) 33 | actual = paratext.load_csv_to_pandas(fn, num_threads=num_threads, number_only=number_only, no_header=no_header) 34 | assert_dictframe_almost_equal(actual, expected) 35 | 36 | def do_basic_empty(self, file_body, num_threads): 37 | with generate_tempfile(file_body) as fn: 38 | logging.debug("filename: %s" % fn) 39 | actual = paratext.load_csv_to_pandas(fn, num_threads=num_threads) 40 | expected = pandas.DataFrame() 41 | assert_dictframe_almost_equal(actual, expected) 42 | 43 | def test_basic_empty(self): 44 | file_bodies = [b"", b"\n", b"\n\n", b" ", b" \n", b" \n \n \n", b"\n \n", b"\v\t \n", b"\n\n\n", b"\n\n\n\n"] 45 | file_bodies += [b"\r\n", b"\r\n\r\n", b" ", b" \r\n", b" \r\n \r\n \r\n", b"\r\n \r\n", b"\r\v\t \r\n", b"\r\n\r\n\r\n", b"\r\n\r\n\r\n\r\n"] 46 | for file_body in file_bodies: 47 | for num_threads in [1]: 48 | yield self.do_basic_empty, file_body, num_threads 49 | 50 | def test_basic_ints(self): 51 | for no_header in [False, True]: 52 | for number_only in [False, True]: 53 | for dtype in [np.float_, np.int64]: 54 | for num_rows in [0, 1, 2, 3, 4, 5, 6, 10, 100, 1000]: 55 | for num_cols in [1, 2, 3, 4, 5, 6, 10]: 56 | if num_rows * num_cols < 20: 57 | thread_set = range(0,30) 58 | else: 59 | thread_set = [0, 1, 2, 3, 4, 5, 6, 7, 8, 15, 20] 60 | for num_threads in thread_set: 61 | yield self.do_basic_nums, dtype, num_rows, num_cols, num_threads, number_only, 
no_header 62 | 63 | def test_basic_strange1(self): 64 | filedata = b"""A,B,C 65 | "\\\"","",7 66 | "\\\\","X",8 67 | "\n","\\\\\\"",9""" 68 | with generate_tempfile(filedata) as fn: 69 | expected = {"A": ["\"","\\","\n"], "B": ["","X","\\\""], "C": [7,8,9]} 70 | logging.debug("filename: %s" % fn) 71 | actual = paratext.load_csv_to_pandas(fn, allow_quoted_newlines=True, out_encoding="utf-8") 72 | assert_dictframe_almost_equal(actual, expected) 73 | 74 | def test_basic_3x2x(self): 75 | filedata = b"""A,B,C 76 | 1,4,7 77 | 2,5,8 78 | """ 79 | with generate_tempfile(filedata) as fn: 80 | expected = {"A": [1,2], "B": [4,5], "C": [7,8]} 81 | logging.debug("filename: %s" % fn) 82 | actual = paratext.load_csv_to_pandas(fn) 83 | assert_dictframe_almost_equal(actual, expected) 84 | 85 | def test_basic_3x1x(self): 86 | filedata = b"""A,B,C 87 | 1,4,7 88 | """ 89 | with generate_tempfile(filedata) as fn: 90 | expected = {"A": [1], "B": [4], "C": [7]} 91 | logging.debug("filename: %s" % fn) 92 | actual = paratext.load_csv_to_pandas(fn) 93 | assert_dictframe_almost_equal(actual, expected) 94 | 95 | 96 | def test_basic_3x0x(self): 97 | filedata = b"""A,B,C 98 | """ 99 | with generate_tempfile(filedata) as fn: 100 | expected = {"A": [], "B": [], "C": []} 101 | logging.debug("filename: %s" % fn) 102 | actual = paratext.load_csv_to_pandas(fn) 103 | assert_dictframe_almost_equal(actual, expected) 104 | 105 | def test_basic_empty_cells_num(self): 106 | filedata = b"""A,B,C,D,E,F 107 | #,1,#,#,2,# 108 | 3,#,#,4,5,# 109 | 6,#,#,#,#,# 110 | #,7,#,#,#,# 111 | #,#,8,#,#,# 112 | #,#,#,9,#,# 113 | #,#,#,#,10,# 114 | #,#,#,#,#,11 115 | #,#,12,#,#,13 116 | 14,#,#,15,16,17 117 | """ 118 | filedata = filedata.replace(b"#", b"") 119 | with generate_tempfile(filedata) as fn: 120 | expected = {"A": [0,3,6,0,0,0,0,0,0,14], "B": [1,0,0,7,0,0,0,0,0,0], "C": [0,0,0,0,8,0,0,0,12,0], "D": [0,4,0,0,0,9,0,0,0,15], "E": [2,5,0,0,0,0,10,0,0,16], "F": [0,0,0,0,0,0,0,11,13,17]} 121 | logging.debug("filename: 
%s" % fn) 122 | actual = paratext.load_csv_to_pandas(fn, number_only=True) 123 | assert_dictframe_almost_equal(actual, expected) 124 | 125 | class TestMixedFiles: 126 | 127 | def run_case(self, num_rows, num_cats, num_floats, num_ints, num_threads): 128 | expected, types_df = paratext.testing.generate_mixed_frame(num_rows, num_floats, num_cats, num_ints) 129 | with generate_tempfilename() as fn: 130 | logging.debug("filename: %s" % fn) 131 | paratext.serial.save_frame(fn, expected, allow_quoted_newlines=True, out_encoding='utf-8') 132 | actual = paratext.load_csv_to_pandas(fn, allow_quoted_newlines=True, out_encoding='utf-8', num_threads=num_threads) 133 | assert_dictframe_almost_equal(actual, expected) 134 | 135 | def test_mixed_frame(self): 136 | for num_rows in [0, 1, 2, 3, 5, 10, 100, 1000]: 137 | for num_cats in [1, 3, 5]: 138 | for num_floats in [1, 3, 5]: 139 | for num_ints in [0, 1, 5, 10, 50]: 140 | for num_threads in [1, 2, 3, 5, 10, 20]: 141 | yield self.run_case, num_rows, num_cats, num_floats, num_ints, num_threads 142 | 143 | class TestHellFiles: 144 | 145 | def do_hell_frame(self, dos, frame_encoding, out_encoding, include_null, allow_quoted_newlines, rows, cols, num_threads): 146 | expected = paratext.testing.generate_hell_frame(rows, cols, include_null=include_null, fmt=frame_encoding) 147 | with generate_tempfilename() as fn: 148 | logging.debug("filename: %s" % fn) 149 | paratext.serial.save_frame(fn, expected, allow_quoted_newlines, out_encoding=out_encoding, dos=dos) 150 | actual = paratext.load_csv_to_pandas(fn, allow_quoted_newlines=allow_quoted_newlines, out_encoding=out_encoding, num_threads=num_threads, convert_null_to_space=not include_null) 151 | assert_dictframe_almost_equal(actual, expected) 152 | 153 | def test_hell_frame(self): 154 | formatting = [("utf-8", "utf-8"), 155 | ("printable_ascii", "utf-8"), 156 | ("utf-8", "unknown"), 157 | ("arbitrary", "unknown"), 158 | ("arbitrary", "utf-8"), 159 | ("mixed", "unknown"), 160 | ("mixed", 
"utf-8")] 161 | for dos in [False, True]: 162 | for (frame_encoding, out_encoding) in formatting: 163 | for include_null in [False, True]: 164 | for allow_quoted_newlines in [False, True]: 165 | for num_rows in [0, 1,2,3,4,10,100,600]: 166 | for num_cols in [1,2,3,4,5,10]: 167 | for num_threads in [1,2,4,8,16]: 168 | yield self.do_hell_frame, dos, frame_encoding, out_encoding, include_null, allow_quoted_newlines, num_rows, num_cols, num_threads 169 | -------------------------------------------------------------------------------- /src/util/safe_string_output.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 
26 | */ 27 | 28 | #ifndef SAFE_STRING_OUTPUT_HPP 29 | #define SAFE_STRING_OUTPUT_HPP 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | namespace WiseIO { 36 | 37 | typedef enum {NO_ESCAPE, ESCAPE, CONTINUATION, LEAD2, LEAD3, LEAD4, POTENTIAL_SURROGATE} SafeCharState; 38 | 39 | class SafeStringOutput { 40 | public: 41 | SafeStringOutput() : double_quote_output_(false) { 42 | should_escape_.fill(NO_ESCAPE); 43 | should_escape_['\\'] = SafeCharState::ESCAPE; 44 | } 45 | 46 | ParaText::as_utf8 to_utf8_string(const std::string &input) { 47 | ParaText::as_utf8 output; 48 | output.val = output_string(input.begin(), input.end()); 49 | return output; 50 | } 51 | 52 | ParaText::as_raw_bytes to_raw_string(const std::string &input) { 53 | ParaText::as_raw_bytes output; 54 | output.val = output_string(input.begin(), input.end()); 55 | return output; 56 | } 57 | 58 | template 59 | std::string output_string(Iterator begin, Iterator end) { 60 | /* FIXME: Support filtering of illegal surrogates in UTF8 sequences. */ 61 | std::ostringstream ostr; 62 | size_t bytes_in_sequence = 0; 63 | //bool surrogate = false; 64 | if (double_quote_output_) { 65 | ostr << '"'; 66 | } 67 | std::vector escaped(std::distance(begin, end), false); 68 | size_t k = 0; 69 | for (Iterator it = begin; it != end; it++, k++) { 70 | unsigned char c = (unsigned char)*it; 71 | bool escape_it = should_escape_[c] == ESCAPE; 72 | if (bytes_in_sequence > 0) { /* If a UTF8 sequence was started, only escape the byte if its not a continuation. */ 73 | escape_it = should_escape_[c] != CONTINUATION; 74 | bytes_in_sequence--; 75 | } 76 | else if (!escape_it && bytes_in_sequence == 0) { /* If a UTF8 sequence is not progress, check higher order bits. 
*/ 77 | switch (should_escape_[c]) { 78 | case LEAD4: 79 | bytes_in_sequence = 3; 80 | break; 81 | case POTENTIAL_SURROGATE: 82 | /*bytes_in_sequence = 2; 83 | surrogate = true;*/ 84 | break; 85 | case LEAD3: 86 | bytes_in_sequence = 2; 87 | break; 88 | case LEAD2: 89 | bytes_in_sequence = 1; 90 | break; 91 | case NO_ESCAPE: 92 | break; 93 | case ESCAPE: /* Explicit escape. */ 94 | case CONTINUATION: /* An invalid continuation byte, escape it. */ 95 | escape_it = true; 96 | break; 97 | } 98 | } 99 | escaped[k] = escape_it; 100 | } 101 | k = 0; 102 | for (Iterator it = begin; it != end; it++, k++) { 103 | unsigned char c = (unsigned char)*it; 104 | if (escaped[k]) { 105 | ostr << '\\'; 106 | switch (c) { 107 | case '\b': 108 | ostr << 'b'; 109 | break; 110 | case '\v': 111 | ostr << 'v'; 112 | break; 113 | case '\n': 114 | ostr << 'n'; 115 | break; 116 | case '\r': 117 | ostr << 'r'; 118 | break; 119 | case '\t': 120 | ostr << 't'; 121 | break; 122 | case '\\': 123 | ostr << '\\'; 124 | break; 125 | case '\"': 126 | ostr << '\"'; 127 | break; 128 | case '\'': 129 | ostr << '\''; 130 | break; 131 | default: 132 | ostr << 'x'; 133 | ostr << to_hex(c >> 4); 134 | ostr << to_hex(c & 0x0F); 135 | break; 136 | } 137 | } 138 | else { 139 | ostr.put(*it); 140 | } 141 | } 142 | if (double_quote_output_) { 143 | ostr << '"'; 144 | } 145 | return ostr.str(); 146 | } 147 | 148 | void escape_newlines(bool b) { 149 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 150 | should_escape_['\n'] = st; 151 | } 152 | 153 | void escape_whitespace(bool b) { 154 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 155 | should_escape_['\n'] = st; 156 | should_escape_['\r'] = st; 157 | should_escape_['\v'] = st; 158 | should_escape_['\f'] = st; 159 | should_escape_['\b'] = st; 160 | } 161 | 162 | void escape_special(bool b) { 163 | SafeCharState st = b ? 
SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 164 | should_escape_['\''] = st; 165 | should_escape_['\"'] = st; 166 | should_escape_['\\'] = st; 167 | } 168 | 169 | void escape_delim(bool b) { 170 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 171 | should_escape_[','] = st; 172 | escape_special(true); 173 | } 174 | 175 | void escape_comments(bool b) { 176 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 177 | should_escape_['%'] = st; 178 | escape_special(true); 179 | } 180 | 181 | void escape_nonprintables(bool b) { 182 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 183 | for (unsigned char c = 0; c < ' '; c++) { 184 | should_escape_[c] = st; 185 | } 186 | } 187 | 188 | void escape_nonascii(bool b) { 189 | SafeCharState st = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 190 | for (size_t c = 0x7F; c <= 0xFF; c++) { 191 | should_escape_[c] = st; 192 | } 193 | } 194 | 195 | void escape_nonutf8(bool b) { 196 | const SafeCharState outside = b ? SafeCharState::ESCAPE : SafeCharState::NO_ESCAPE; 197 | const SafeCharState cont = b ? SafeCharState::CONTINUATION : SafeCharState::NO_ESCAPE; 198 | const SafeCharState lead2 = b ? SafeCharState::LEAD2 : SafeCharState::NO_ESCAPE; 199 | const SafeCharState lead3 = b ? SafeCharState::LEAD3 : SafeCharState::NO_ESCAPE; 200 | const SafeCharState lead4 = b ? SafeCharState::LEAD4 : SafeCharState::NO_ESCAPE; 201 | //const SafeCharState surrogate = b ? 
SafeCharState::POTENTIAL_SURROGATE : SafeCharState::NO_ESCAPE; 202 | for (size_t c = 0x80; c <= 0xBF; c++) { 203 | should_escape_[c] = cont; 204 | } 205 | for (size_t c = 0xC0; c <= 0xDF; c++) { 206 | should_escape_[c] = lead2; 207 | } 208 | for (size_t c = 0xE0; c <= 0xEF; c++) { 209 | should_escape_[c] = lead3; 210 | } 211 | for (size_t c = 0xF0; c <= 0xF7; c++) { 212 | should_escape_[c] = lead4; 213 | } 214 | for (size_t c = 0xF8; c <= 0xFF; c++) { 215 | should_escape_[c] = outside; 216 | } 217 | //should_escape_[0xED] = surrogate; 218 | } 219 | 220 | void double_quote_output(bool b) { 221 | if (b) { 222 | should_escape_['\"'] = SafeCharState::ESCAPE; 223 | } 224 | double_quote_output_ = b; 225 | } 226 | 227 | private: 228 | inline char to_hex(int v) { 229 | if (v >= 0 && v < 10) { 230 | return '0' + v; 231 | } 232 | else if (v >= 10 && v < 16) { 233 | return 'a' + (v-10); 234 | } 235 | else { 236 | throw std::logic_error("invalid range for hex character"); 237 | } 238 | } 239 | 240 | private: 241 | std::array should_escape_; 242 | bool double_quote_output_; 243 | }; 244 | } 245 | #endif 246 | -------------------------------------------------------------------------------- /src/diagnostic/parse_and_sum.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. 
You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef PARATEXT_DIAGNOSTIC_PARSE_AND_SUM_HPP 29 | #define PARATEXT_DIAGNOSTIC_PARSE_AND_SUM_HPP 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "generic/chunker.hpp" 41 | #include "csv/header_parser.hpp" 42 | 43 | namespace ParaText { 44 | 45 | namespace Diagnostic { 46 | 47 | template 48 | class ParseAndSumWorker { 49 | public: 50 | ParseAndSumWorker(size_t chunk_start, size_t chunk_end, size_t block_size, size_t num_columns) 51 | : chunk_start_(chunk_start), 52 | chunk_end_(chunk_end), 53 | block_size_(block_size), 54 | num_columns_(num_columns) {} 55 | 56 | virtual ~ParseAndSumWorker() {} 57 | 58 | void parse(const std::string &filename) { 59 | try { 60 | parse_impl(filename); 61 | } 62 | catch (...) 
{ 63 | thread_exception_ = std::current_exception(); 64 | } 65 | } 66 | 67 | std::exception_ptr get_exception() { 68 | return thread_exception_; 69 | } 70 | 71 | void parse_impl(const std::string &filename) { 72 | std::ifstream in; 73 | in.open(filename.c_str()); 74 | const size_t block_size = block_size_; 75 | char buf[block_size]; 76 | in.seekg(chunk_start_, std::ios_base::beg); 77 | size_t current = chunk_start_; 78 | sums_.resize(num_columns_); 79 | std::fill(sums_.begin(), sums_.end(), 0.0); 80 | column_index_ = 0; 81 | num_lines_ = 0; 82 | char token[64]; 83 | size_t j = 0; 84 | while (current <= chunk_end_) { 85 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 86 | size_t nread = in.gcount(); 87 | if (nread == 0) { 88 | break; 89 | } 90 | for (size_t i = 0; i < nread; i++) { 91 | if (buf[i] == '\n') { 92 | sums_[column_index_] += parse_token(token, token + j); 93 | column_index_ = 0; 94 | num_lines_++; 95 | j = 0; 96 | } 97 | else if (buf[i] == ',') { 98 | sums_[column_index_] += parse_token(token, token + j); 99 | column_index_++; 100 | j = 0; 101 | } 102 | else { 103 | token[j++] = buf[i]; 104 | } 105 | } 106 | current += nread; 107 | } 108 | if (j > 0) { 109 | sums_[column_index_] += parse_token(token, token + j); 110 | j = 0; 111 | } 112 | if (column_index_ > 0) { 113 | num_lines_++; 114 | } 115 | } 116 | 117 | const std::vector &get_sums() const { 118 | return sums_; 119 | } 120 | 121 | size_t get_N() const { 122 | return num_lines_; 123 | } 124 | 125 | // No type checking 126 | template 127 | inline typename std::enable_if::type parse_token(Iterator begin, Iterator end) const { 128 | return bsd_strtod(begin, end); 129 | } 130 | 131 | // Type checking only for numbers. 132 | template 133 | inline typename std::enable_if::type parse_token(Iterator begin, Iterator end) const { 134 | Iterator it = begin; 135 | for (; it != end && isspace(*it); it++) {} 136 | if (it != end) { 137 | if (*it == '?' 
&& std::distance(it, end) == 1) { 138 | return std::numeric_limits::quiet_NaN(); 139 | } 140 | else if (std::distance(it, end) == 3 && 141 | ((*it == 'n' || *it == 'N')) 142 | && ((*(it+1) == 'a' || *(it+1) == 'A')) 143 | && ((*(it+2) == 'n' || *(it+2) == 'N'))) { 144 | return std::numeric_limits::quiet_NaN(); 145 | } 146 | else { 147 | if (*it == '-') { it++; } 148 | for (; it != end && isdigit(*it); it++) {} 149 | if (it != end && (*it == '.' || *it == 'E' || *it == 'e')) { 150 | return bsd_strtod(begin, end); 151 | } 152 | else { 153 | return (double)fast_atoi(begin, end); 154 | } 155 | } 156 | } 157 | return (double)std::distance(begin, end); 158 | } 159 | 160 | private: 161 | size_t chunk_start_; 162 | size_t chunk_end_; 163 | size_t block_size_; 164 | size_t num_columns_; 165 | size_t num_lines_; 166 | size_t column_index_; 167 | std::vector sums_; 168 | std::exception_ptr thread_exception_; 169 | }; 170 | 171 | class ParseAndSum { 172 | public: 173 | ParseAndSum() {} 174 | 175 | virtual ~ParseAndSum() {} 176 | 177 | size_t load(const std::string &filename, const ParseParams ¶ms, bool type_check) { 178 | size_t retval = 0; 179 | if (type_check) { 180 | retval = load_impl(filename, params); 181 | } 182 | else { 183 | retval = load_impl(filename, params); 184 | } 185 | return retval; 186 | } 187 | 188 | template 189 | size_t load_impl(const std::string &filename, const ParseParams ¶ms) { 190 | std::vector threads; 191 | std::vector > > workers; 192 | header_parser_.open(filename, params.no_header); 193 | std::exception_ptr thread_exception; 194 | if (header_parser_.has_header()) { 195 | chunker_.process(filename, header_parser_.get_end_of_header()+1, params.num_threads, params.allow_quoted_newlines); 196 | } 197 | else { 198 | chunker_.process(filename, 0, params.num_threads, params.allow_quoted_newlines); 199 | } 200 | for (size_t worker_id = 0; worker_id < chunker_.num_chunks(); worker_id++) { 201 | long start_of_chunk = 0, end_of_chunk = 0; 202 | 
std::tie(start_of_chunk, end_of_chunk) = chunker_.get_chunk(worker_id); 203 | if (start_of_chunk < 0 || end_of_chunk < 0) { 204 | continue; 205 | } 206 | workers.push_back(std::make_shared >(start_of_chunk, end_of_chunk, params.block_size, header_parser_.get_num_columns())); 207 | threads.emplace_back(&ParseAndSumWorker::parse, 208 | workers.back(), 209 | filename); 210 | } 211 | 212 | for (size_t i = 0; i < threads.size(); i++) { 213 | threads[i].join(); 214 | if (!thread_exception) { 215 | thread_exception = workers[i]->get_exception(); 216 | } 217 | } 218 | // We're now outside the parallel region. 219 | if (thread_exception) { 220 | std::rethrow_exception(thread_exception); 221 | } 222 | N_ = 0.0; 223 | avgs_.resize(header_parser_.get_num_columns()); 224 | std::fill(avgs_.begin(), avgs_.end(), 0.0); 225 | for (size_t i = 0; i < workers.size(); i++) { 226 | auto worker_sums = workers[i]->get_sums(); 227 | N_ += workers[i]->get_N(); 228 | for (size_t j = 0; j < worker_sums.size(); j++) { 229 | avgs_[j] += worker_sums[j]; 230 | } 231 | } 232 | for (size_t j = 0; j < avgs_.size(); j++) { 233 | avgs_[j] /= N_; 234 | } 235 | return N_; 236 | } 237 | 238 | size_t get_num_columns() const { 239 | return header_parser_.get_num_columns(); 240 | } 241 | 242 | double get_avg(size_t column_index) const { 243 | return avgs_[column_index]; 244 | } 245 | 246 | const std::string &get_column_name(size_t column_index) const { 247 | return header_parser_.get_column_name(column_index); 248 | } 249 | 250 | size_t get_N() const { 251 | return N_; 252 | } 253 | 254 | private: 255 | CSV::HeaderParser header_parser_; 256 | TextChunker chunker_; 257 | std::vector avgs_; 258 | size_t N_; 259 | }; 260 | } 261 | } 262 | #endif 263 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 2 | 3 | 1. Definitions. 
4 | 5 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 6 | 7 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 8 | 9 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 10 | 11 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 12 | 13 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 14 | 15 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 16 | 17 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 18 | 19 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
20 | 21 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 22 | 23 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 24 | 25 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 26 | 27 | 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 28 | 29 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 30 | 31 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 32 | You must cause any modified files to carry prominent notices stating that You changed the files; and 33 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 34 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed 
as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 35 | 36 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 37 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 38 | 39 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 40 | 41 | 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 42 | 43 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 44 | 45 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
46 | 47 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /bench/generate_experiments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # 20 | # Copyright (C) Wise.io, Inc. 2016. 
21 | 22 | import sha 23 | import json 24 | import os 25 | 26 | all_params = [] 27 | 28 | datasets = {"mnist": 29 | {"csv": "mnist.csv", 30 | "hdf5": "mnist.hdf5", 31 | "npy": "mnist.npy", 32 | "feather": "mnist.feather", 33 | "pickle": "mnist.pkl", 34 | "cPickle": "mnist.pkl", 35 | "no_header": True, 36 | "number_only": True, 37 | "to_df": True}, 38 | "mnist8m": 39 | {"csv": "mnist8m.csv", 40 | "hdf5": "mnist8m.hdf5", 41 | "npy": "mnist8m.npy", 42 | "feather": "mnist8m.feather", 43 | "pickle": "mnist8m.pkl", 44 | "cPickle": "mnist8m.pkl", 45 | "no_header": True, 46 | "number_only": True, 47 | "to_df": False}, 48 | "messy": 49 | {"csv": "messy.csv", 50 | "feather": "messy.feather", 51 | "pickle": "messy.pkl", 52 | "qnl": True, 53 | "no_header": False, 54 | "run_pyspark": False, 55 | "max_level_name_length": 0, 56 | "contains_text": True, 57 | "to_df": True}, 58 | "messy2": 59 | {"csv": "messy2.csv", 60 | "feather": "messy2.feather", 61 | "pickle": "messy2.pkl", 62 | "qnl": True, 63 | "no_header": False, 64 | "run_pyspark": False, 65 | "max_level_name_length": 0, 66 | "contains_text": True, 67 | "to_df": True}, 68 | "car": 69 | {"csv": "car.csv", 70 | "feather": "car.feather", 71 | "pickle": "car.pkl", 72 | "qnl": False, 73 | "no_header": False, 74 | "contains_text": True, 75 | "to_df": True}, 76 | "floats": 77 | {"csv": "floats.csv", 78 | "feather": "floats.feather", 79 | "hdf5": "floats.hdf5", 80 | "npy": "floats.npy", 81 | "no_header": False, 82 | "pickle": "floats.pkl", 83 | "to_df": True}, 84 | "floats2": 85 | {"csv": "floats2.csv", 86 | "feather": "floats2.feather", 87 | "hdf5": "floats2.hdf5", 88 | "npy": "floats2.npy", 89 | "no_header": False, 90 | "pickle": "floats2.pkl", 91 | "to_df": True}, 92 | "floats3": 93 | {"csv": "floats3.csv", 94 | "feather": "floats3.feather", 95 | "hdf5": "floats3.hdf5", 96 | "npy": "floats3.npy", 97 | "no_header": False, 98 | "pickle": "floats3.pkl", 99 | "to_df": True}, 100 | "floats4": 101 | {"csv": "floats4.csv", 102 | 
"feather": "floats4.feather", 103 | "hdf5": "floats4.hdf5", 104 | "npy": "floats4.npy", 105 | "no_header": False, 106 | "pickle": "floats4.pkl", 107 | "to_df": True} 108 | } 109 | 110 | scaling_experiments = bool(raw_input("enter 'yes' to do scaling experiments, 'no' to do main benchmarks: ").lower() == 'yes') 111 | 112 | print "available datasets: ", datasets.keys() 113 | restrict_keys = raw_input("enter comma-delimited list of datasets to generate experiment json [enter for all]: ") 114 | 115 | if restrict_keys != "": 116 | restrict_keys = set(restrict_keys.split(",")) 117 | for key in datasets.keys(): 118 | if key not in restrict_keys: 119 | datasets.pop(key) 120 | 121 | for name, attr in datasets.iteritems(): 122 | if "csv" in attr: 123 | csv_filename = attr["csv"] 124 | for disk_state in ["cold", "warm"]: 125 | if scaling_experiments: 126 | num_threads_list = [1,4,8,12,16,20,24,28,32] 127 | else: 128 | num_threads_list = [0] 129 | for num_threads in num_threads_list: 130 | for block_size in [32768]: 131 | if not attr.get("contains_text", False): 132 | for type_check in [True, False]: 133 | params = {"cmd": "avgcols", 134 | "filename": attr["csv"], 135 | "no_header": attr.get("no_header", True), 136 | "allow_quoted_newlines": attr.get("qnl", False), 137 | "num_threads": num_threads, 138 | "disk_state": disk_state, 139 | "block_size": block_size, 140 | "to_df": True, 141 | "sum_after": True, 142 | "type_check": type_check, 143 | "log": str(len(all_params)) + ".log"} 144 | all_params.append(params) 145 | for cmd in ["disk-to-mem", "countnl", "paratext"]: 146 | params = {"cmd": cmd, 147 | "filename": attr["csv"], 148 | "no_header": attr.get("no_header", True), 149 | "allow_quoted_newlines": attr.get("qnl", False), 150 | "num_threads": num_threads, 151 | "disk_state": disk_state, 152 | "block_size": block_size, 153 | "to_df": True, 154 | "sum_after": True, 155 | "log": str(len(all_params)) + ".log"} 156 | if attr.get("number_only", False): 157 | 
params["number_only"] = True 158 | mlnl = attr.get("max_level_name_length", None) 159 | if mlnl: 160 | params["max_level_name_length"] = mlnl 161 | all_params.append(params) 162 | for disk_state in ["cold", "warm"]: 163 | if attr.get("run_pyspark", True): 164 | params = {"cmd": "pyspark", 165 | "filename": attr["csv"], 166 | "no_header": attr.get("no_header", True), 167 | "to_df": attr.get("to_df", False), 168 | "sum_after": True, 169 | "disk_state": disk_state} 170 | all_params.append(params) 171 | 172 | if params.get("number_only", True): 173 | params = {"cmd": "numpy", 174 | "filename": attr["csv"], 175 | "no_header": attr.get("no_header", True), 176 | "sum_after": True, 177 | "disk_state": disk_state} 178 | all_params.append(params) 179 | 180 | for cmd in ["sframe", "pandas", "R-readcsv", "R-readr", "R-fread"]: 181 | params = {"cmd": cmd, 182 | "filename": attr["csv"], 183 | "no_header": attr.get("no_header", True), 184 | "to_df": attr.get("to_df", False), 185 | "sum_after": True, 186 | "disk_state": disk_state} 187 | all_params.append(params) 188 | 189 | for cmd in ["feather", "hdf5", "pickle", "cPickle", "npy"]: 190 | if cmd in attr: 191 | params = {"cmd": cmd, 192 | "filename": attr[cmd], 193 | "sum_after": True, 194 | "disk_state": disk_state} 195 | if cmd == "hdf5": 196 | params["dataset"] = "mydataset" 197 | all_params.append(params) 198 | 199 | if "mnist8m" in datasets.keys(): 200 | for cmd in ["sframe", "paratext", "pyspark"]: 201 | params = {"cmd": cmd, 202 | "filename": "mnist8m.csv", 203 | "no_header": True, 204 | "to_df": True, 205 | "sum_after": True, 206 | "disk_state": disk_state} 207 | all_params.append(params) 208 | 209 | params = {"cmd": "noop"} 210 | all_params.append(params) 211 | 212 | for i, params in enumerate(all_params): 213 | hparams = sha.sha(json.dumps(params)).hexdigest() 214 | prefix = hparams[0:8] 215 | params["log"] = os.path.join(params["cmd"], "run-" + prefix + ".log") 216 | if not os.path.exists(params["cmd"]): 217 | 
os.makedirs(params["cmd"]) 218 | json.dump(params, open(os.path.join(params["cmd"], "run-" + hparams[0:8] + ".json"), "w"), indent=1) 219 | -------------------------------------------------------------------------------- /src/generic/quote_adjustment_worker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | ParaText: parallel text reading 3 | Copyright (C) 2016. wise.io, Inc. 4 | 5 | Licensed to the Apache Software Foundation (ASF) under one 6 | or more contributor license agreements. See the NOTICE file 7 | distributed with this work for additional information 8 | regarding copyright ownership. The ASF licenses this file 9 | to you under the Apache License, Version 2.0 (the 10 | "License"); you may not use this file except in compliance 11 | with the License. You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, 16 | software distributed under the License is distributed on an 17 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 | KIND, either express or implied. See the License for the 19 | specific language governing permissions and limitations 20 | under the License. 21 | */ 22 | 23 | /* 24 | Coder: Damian Eads. 25 | */ 26 | 27 | #ifndef PARATEXT_QUOTE_NEWLINE_WORKER_HPP 28 | #define PARATEXT_QUOTE_NEWLINE_WORKER_HPP 29 | 30 | #include 31 | 32 | namespace ParaText { 33 | 34 | class QuoteNewlineAdjustmentWorker { 35 | public: 36 | QuoteNewlineAdjustmentWorker(size_t chunk_start, size_t chunk_end) 37 | : chunk_start_(chunk_start), 38 | chunk_end_(chunk_end), 39 | num_quotes_(0), 40 | first_unquoted_newline_(-1), 41 | first_quoted_newline_(-1) {} 42 | 43 | virtual ~QuoteNewlineAdjustmentWorker() {} 44 | 45 | void parse(const std::string &filename) { 46 | try { 47 | parse_impl(filename); 48 | } 49 | catch (...) 
{ 50 | thread_exception_ = std::current_exception(); 51 | } 52 | } 53 | 54 | std::exception_ptr get_exception() { 55 | return thread_exception_; 56 | } 57 | 58 | void parse_impl(const std::string &filename) { 59 | std::ifstream in; 60 | in.open(filename.c_str()); 61 | const size_t block_size = 32768; 62 | char buf[block_size]; 63 | in.seekg(chunk_start_, std::ios_base::beg); 64 | size_t current = chunk_start_; 65 | size_t escape_count = 0; 66 | bool in_quote = false; 67 | while (current <= chunk_end_) { 68 | in.read(buf, std::min(chunk_end_ - current + 1, block_size)); 69 | size_t nread = in.gcount(); 70 | if (nread == 0) { 71 | break; 72 | } 73 | size_t i = 0; 74 | while (i < nread && first_unquoted_newline_ < 0 && first_quoted_newline_ < 0) { 75 | if (in_quote) { 76 | for (; i < nread; i++) { 77 | if (escape_count > 0) { 78 | escape_count--; 79 | } 80 | else if (buf[i] == '\\') { 81 | escape_count = 1; 82 | } 83 | else if (buf[i] == '\"') { 84 | num_quotes_++; 85 | #ifdef PARATEXT_DEBUG_QUOTE 86 | std::cerr << "[Q1:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 87 | #endif 88 | in_quote = false; 89 | i++; 90 | break; 91 | } 92 | else if (buf[i] == '\n') { 93 | first_quoted_newline_ = current + i; 94 | i++; 95 | break; 96 | } 97 | } 98 | } 99 | else { 100 | for (; i < nread; i++) { 101 | if (escape_count > 0) { 102 | escape_count--; 103 | } 104 | else if (buf[i] == '\\') { 105 | escape_count = 1; 106 | } 107 | else if (buf[i] == '\"') { 108 | num_quotes_++; 109 | #ifdef PARATEXT_DEBUG_QUOTE 110 | std::cerr << "[Q2:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 111 | #endif 112 | in_quote = true; 113 | i++; 114 | break; 115 | } 116 | else if (buf[i] == '\n') { 117 | first_unquoted_newline_ = current + i; 118 | i++; 119 | break; 120 | } 121 | } 122 | } 123 | } 124 | while (i < nread && first_unquoted_newline_ < 0) { 125 | if (in_quote) { 126 | for (; i < nread; i++) { 127 | if (escape_count > 0) { 128 | escape_count--; 129 | } 130 
| else if (buf[i] == '\\') { 131 | escape_count = 1; 132 | } 133 | else if (buf[i] == '\"') { 134 | num_quotes_++; 135 | #ifdef PARATEXT_DEBUG_QUOTE 136 | std::cerr << "[Q3:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 137 | #endif 138 | in_quote = false; 139 | i++; 140 | break; 141 | } 142 | } 143 | } 144 | else { 145 | for (; i < nread; i++) { 146 | if (escape_count > 0) { 147 | escape_count--; 148 | } 149 | else if (buf[i] == '\\') { 150 | escape_count = 1; 151 | } 152 | else if (buf[i] == '\"') { 153 | num_quotes_++; 154 | #ifdef PARATEXT_DEBUG_QUOTE 155 | std::cerr << "[Q4:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 156 | #endif 157 | in_quote = true; 158 | i++; 159 | break; 160 | } 161 | else if (buf[i] == '\n') { 162 | first_unquoted_newline_ = current + i; 163 | i++; 164 | break; 165 | } 166 | } 167 | } 168 | } 169 | while (i < nread && first_quoted_newline_ < 0) { 170 | if (in_quote) { 171 | for (; i < nread; i++) { 172 | if (escape_count > 0) { 173 | escape_count--; 174 | } 175 | else if (buf[i] == '\\') { 176 | escape_count = 1; 177 | } 178 | else if (buf[i] == '\"') { 179 | num_quotes_++; 180 | #ifdef PARATEXT_DEBUG_QUOTE 181 | std::cerr << "[Q5:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 182 | #endif 183 | in_quote = false; 184 | i++; 185 | break; 186 | } 187 | else if (buf[i] == '\n') { 188 | first_quoted_newline_ = current + i; 189 | i++; 190 | break; 191 | } 192 | } 193 | } 194 | else { 195 | for (; i < nread; i++) { 196 | if (escape_count > 0) { 197 | escape_count--; 198 | } 199 | else if (buf[i] == '\\') { 200 | escape_count = 1; 201 | } 202 | else if (buf[i] == '\"') { 203 | num_quotes_++; 204 | #ifdef PARATEXT_DEBUG_QUOTE 205 | std::cerr << "[Q6:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 206 | #endif 207 | in_quote = true; 208 | i++; 209 | break; 210 | } 211 | } 212 | } 213 | } 214 | /* 215 | If we got here, then either we've found both the first quoted newline 
and 216 | unquoted newline, or we've processed all the data in the buffer. 217 | */ 218 | while (i < nread) { 219 | if (in_quote) { 220 | for (; i < nread; i++) { 221 | if (escape_count > 0) { 222 | escape_count--; 223 | } 224 | else if (buf[i] == '\\') { 225 | escape_count = 1; 226 | } 227 | else if (buf[i] == '\"') { 228 | num_quotes_++; 229 | #ifdef PARATEXT_DEBUG_QUOTE 230 | std::cerr << "[Q7:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 231 | #endif 232 | in_quote = false; 233 | i++; 234 | break; 235 | } 236 | } 237 | } 238 | else { 239 | for (; i < nread; i++) { 240 | if (escape_count > 0) { 241 | escape_count--; 242 | } 243 | else if (buf[i] == '\\') { 244 | escape_count = 1; 245 | } 246 | else if (buf[i] == '\"') { 247 | num_quotes_++; 248 | #ifdef PARATEXT_DEBUG_QUOTE 249 | std::cerr << "[Q8:" << (current + i) << ":" << num_quotes_ << ":" << escape_count; 250 | #endif 251 | in_quote = true; 252 | i++; 253 | break; 254 | } 255 | } 256 | } 257 | } 258 | current += nread; 259 | } 260 | } 261 | 262 | size_t get_start() const { 263 | return chunk_start_; 264 | } 265 | 266 | size_t get_end() const { 267 | return chunk_end_; 268 | } 269 | 270 | size_t get_num_quotes() const { 271 | return num_quotes_; 272 | } 273 | 274 | long get_first_quoted_newline() const { 275 | return first_quoted_newline_; 276 | } 277 | 278 | long get_first_unquoted_newline() const { 279 | return first_unquoted_newline_; 280 | } 281 | 282 | void clear() { 283 | chunk_start_ = 0; 284 | chunk_end_ = 0; 285 | num_quotes_ = 0; 286 | first_unquoted_newline_ = 0; 287 | first_quoted_newline_ = 0; 288 | } 289 | 290 | void combine_adjacent(const QuoteNewlineAdjustmentWorker &other) { 291 | chunk_end_ = other.chunk_end_; 292 | num_quotes_ += other.num_quotes_; 293 | if (first_unquoted_newline_ < 0) { 294 | first_unquoted_newline_ = other.first_unquoted_newline_; 295 | } 296 | if (first_quoted_newline_ < 0) { 297 | first_quoted_newline_ = other.first_quoted_newline_; 298 | } 299 | } 
300 | 301 | private: 302 | size_t chunk_start_; 303 | size_t chunk_end_; 304 | size_t num_quotes_; 305 | long first_unquoted_newline_; 306 | long first_quoted_newline_; 307 | std::exception_ptr thread_exception_; 308 | }; 309 | } 310 | #endif 311 | -------------------------------------------------------------------------------- /python/paratext/testing.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | # Copyright (C) Wise.io, Inc. 2016. 19 | 20 | import numpy as np 21 | import pandas.util.testing 22 | import unittest 23 | import collections 24 | import pandas 25 | import paratext_internal 26 | import os 27 | import random 28 | import sys 29 | 30 | from tempfile import NamedTemporaryFile 31 | from contextlib import contextmanager 32 | from six.moves import range 33 | import six 34 | 35 | def generate_hell_frame(num_rows, num_columns, include_null=False, fmt='arbitrary'): 36 | """ 37 | Generate a DataFrame of columns containing randomly generated data. 
def generate_hell_frame(num_rows, num_columns, include_null=False, fmt='arbitrary'):
    """
    Generate a DataFrame of string columns filled with randomly generated,
    deliberately nasty data for stress-testing the parser.

    Parameters
    ----------
    num_rows : number of rows to generate.
    num_columns : number of columns to generate.
    include_null : whether cells may contain NUL (0x00) bytes.
    fmt : 'arbitrary' (raw bytes), 'ascii', 'printable_ascii', 'utf-8', or
          'mixed' (a random choice of the former per column).
    """
    if include_null:
        min_byte = 0
    else:
        min_byte = 1
    frame = collections.OrderedDict()
    # NOTE(review): the seed passed to the C++ generators is a constant;
    # variability comes from the random per-cell lengths. Confirm this is
    # intentional.
    seed = 0
    keys = []
    colfmts = {}
    for column in range(num_columns):
        key = "col%d" % (column,)
        keys.append(key)
        if fmt == 'mixed':
            colfmts[key] = random.choice(["ascii", "arbitrary", "printable_ascii", "utf-8"])
        else:
            colfmts[key] = fmt
    for key in keys:
        data = []
        colfmt = colfmts[key]
        for row in range(num_rows):
            length = np.random.randint(50, 1000)
            if colfmt == 'arbitrary':
                cell = paratext_internal.get_random_string(length, seed, min_byte, 255)
            elif colfmt == 'ascii':
                cell = paratext_internal.get_random_string(length, seed, min_byte, 127)
            elif colfmt == 'printable_ascii':
                cell = paratext_internal.get_random_string(length, seed, 32, 126)
            elif colfmt == 'utf-8':
                # (The redundant `or fmt == 'utf-8'` disjunct was dropped:
                # colfmt is always 'utf-8' whenever fmt is.)
                cell = paratext_internal.get_random_string_utf8(length, seed, include_null)
            else:
                raise ValueError("unknown format: " + fmt)
            data.append(cell)
        frame[key] = data
    return pandas.DataFrame(frame)

@contextmanager
def generate_tempfile(filedata):
    """
    A context manager that generates a temporary file object that will be deleted
    when the context goes out of scope. The mode of the file is "wb".

    Parameters
    ----------
    filedata : The data of the file to write as a bytes object.
    """
    f = NamedTemporaryFile(delete=False, mode="wb", prefix="paratext-tests")
    f.write(filedata)
    name = f.name
    f.close()
    yield f.name
    os.remove(name)

@contextmanager
def generate_tempfilename():
    """
    A context manager that generates a temporary filename that will be deleted
    when the context goes out of scope.
    """
    f = NamedTemporaryFile(delete=False, prefix="paratext-tests")
    name = f.name
    f.close()
    yield f.name
    os.remove(name)

def assert_seq_almost_equal(left, right):
    """
    Assert that two sequences are element-wise equal (integers/strings)
    or almost equal (floats).

    Raises
    ------
    AssertionError : if the sequences differ in size, dtype family, or content.
    """
    left = np.asarray(left)
    right = np.asarray(right)
    # dtype kinds: 'U' unicode, 'S' bytes, 'O' object (boxed Python strings).
    # (Replaces np.str_/np.unicode_ subtype checks; np.unicode_ was removed
    # in NumPy 2.)
    left_is_string = left.dtype.kind in ('U', 'S', 'O')
    right_is_string = right.dtype.kind in ('U', 'S', 'O')
    if np.issubdtype(left.dtype, np.integer) and np.issubdtype(right.dtype, np.integer):
        if not (left.shape == right.shape):
            raise AssertionError("integer sequences have different sizes: %s vs %s" % (str(left.shape), str(right.shape)))
        if not (left == right).all():
            m = (left != right).mean() * 100.
            raise AssertionError("integer sequences mismatch: %5.5f%% left=%s right=%s" % ((m, str(left[0:20]), str(right[0:20]))))
    elif np.issubdtype(left.dtype, np.floating) and np.issubdtype(right.dtype, np.floating):
        np.testing.assert_almost_equal(left, right)
    elif left_is_string and not right_is_string:
        # Empty sequences are dtype-agnostic, so only complain when both
        # sides actually contain data.
        if len(left) > 0 and len(right) > 0:
            raise AssertionError("sequences differ by dtype: left is string and right is %s" % (str(right.dtype)))
    elif not left_is_string and right_is_string:
        if len(left) > 0 and len(right) > 0:
            raise AssertionError("sequences differ by dtype: left is %s and right is string" % (str(left.dtype)))
    elif left_is_string and right_is_string:
        # Compare through the C++ helper so raw-byte and unicode
        # representations of the same data compare equal.
        q = np.zeros((len(left)))
        for i in range(len(q)):
            q[i] = not paratext_internal.are_strings_equal(left[i], right[i])
        m = q.mean() * 100.
        if q.any():
            raise AssertionError("object sequences mismatch: %5.5f%%, rows: %s" % (m, str(np.where(q)[0].tolist())))
    else:
        # Mixed numeric dtypes (e.g. int vs float): compare as float64.
        # (np.float_ was removed in NumPy 2; np.float64 is the same type.)
        if np.issubdtype(left.dtype, np.floating):
            left_float = left
        else:
            left_float = np.asarray(left, dtype=np.float64)
        if np.issubdtype(right.dtype, np.floating):
            right_float = right
        else:
            right_float = np.asarray(right, dtype=np.float64)
        np.testing.assert_almost_equal(left_float, right_float)

def assert_dictframe_almost_equal(left, right, err_msg=""):
    """
    Compares two dictframes for equivalence. A dict-frame is simply
    an object that obeys the Python mapping protocol. Each (key, value)
    represents a column keyed/indexed by `key` where `value` is
    a NumPy array, a Python sequence, or Python iterable.

    Raises AssertionError (prefixed with `err_msg`) listing every missing
    or mismatching column. (Previously a non-empty `err_msg` alone would
    trigger a spurious failure even when all columns matched.)
    """
    left_keys = set(left.keys())
    right_keys = set(right.keys())
    left_missing = right_keys - left_keys
    right_missing = left_keys - right_keys
    together = left_keys.intersection(right_keys)
    problems = []
    for key in left_missing:
        problems.append("%s: missing on left\n" % key)
    for key in right_missing:
        problems.append("%s: missing on right\n" % key)
    for key in together:
        try:
            assert_seq_almost_equal(left[key], right[key])
        except AssertionError as e:
            problems.append("\n Column %s: %s" % (key, e.args[0]))
    if problems:
        raise AssertionError(err_msg + "".join(problems))

def generate_mixed_frame(num_rows, num_floats, num_cats, num_ints):
    """
    Generate a dict-frame with a random mix of categorical columns (text
    riddled with embedded newlines and commas), float columns, and integer
    columns of assorted widths.

    Returns
    -------
    (d, dtypes) : the dict-frame and a dict mapping column name to dtype.
    """
    # Close the dictionary file deterministically (was previously leaked).
    with open("/usr/share/dict/words") as fid:
        words = [line.strip() for line in fid.readlines()]
    num_cols = num_floats + num_cats + num_ints
    perm = np.random.permutation(num_cols)
    num_catints = num_cats + num_ints
    float_ids = perm[num_catints:]
    int_ids = perm[num_cats:num_catints]
    cat_ids = perm[0:num_cats]
    cat_ids = ["col" + str(id) for id in cat_ids]
    int_ids = ["col" + str(id) for id in int_ids]
    float_ids = ["col" + str(id) for id in float_ids]
    d = collections.OrderedDict()
    dtypes = {}
    for col in cat_ids:
        # `object` replaces the removed NumPy alias np.object (same dtype).
        X = np.zeros((num_rows,), dtype=object)
        for row in range(0, num_rows):
            num_newlines = np.random.randint(3, 7)
            num_commas = np.random.randint(3, 7)
            X[row] = ""
            tricky_delims = np.asarray(["\n"] * num_newlines + [","] * num_commas)
            np.random.shuffle(tricky_delims)
            for delim in tricky_delims:
                X[row] += ' '.join(random.sample(words, 5))
                X[row] += delim
            X[row] += ' '.join(random.sample(words, 5))
        d[col] = X
        dtypes[col] = object
    for col in float_ids:
        d[col] = np.asarray(np.random.randn(num_rows), dtype=np.float32)
        dtypes[col] = np.float32
    min_int = [0, -2**7, 0, -2**15, 0, -2**31, 0, -2**62]
    max_int = [2**8, 2**7, 2**16, 2**15, 2**32, 2**31, 2**62, 2**62]
    dtypes_int = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64]
    for col in int_ids:
        j = np.random.randint(0, len(min_int))
        d[col] = np.asarray(np.random.randint(min_int[j], max_int[j], num_rows), dtype=dtypes_int[j])
        dtypes[col] = dtypes_int[j]
    return d, dtypes


def internal_compare(filename, *args, **kwargs):
    """
    Loads a Pandas DataFrame with pandas and paratext, and compares their contents.

    Returns a dict mapping each column name to a disagreement score: the
    mismatch rate for string columns, or the maximum absolute difference
    for numeric columns.
    """
    import pandas
    # NOTE(review): load_csv_to_pandas is not defined or imported in this
    # module as seen here; presumably paratext.load_csv_to_pandas — verify.
    dfY = load_csv_to_pandas(filename, *args, **kwargs)
    if kwargs.get("no_header"):
        dfX = pandas.read_csv(filename, header=None, na_values=['?'], names=dfY.keys())
    else:
        dfX = pandas.read_csv(filename, na_values=['?'])
    results = {}
    for key in dfX.columns:
        # Kind check covers object/bytes/unicode string columns. The old
        # check referenced the Python 2-only builtin `unicode`, which
        # raises NameError on Python 3.
        if dfX[key].dtype.kind in ('O', 'S', 'U'):
            nonnan_mask = (dfY[key] != 'nan') & (dfY[key] != '?')
            results[key] = (dfX[key][nonnan_mask] != dfY[key][nonnan_mask]).mean()
        else:
            nonnan_mask = ~np.isnan(dfX[key])
            results[key] = abs(dfX[key][nonnan_mask] - dfY[key][nonnan_mask]).max()
    return results
 */

#ifndef PARATEXT_COLBASED_CHUNK_HPP
#define PARATEXT_COLBASED_CHUNK_HPP

#include "generic/parse_params.hpp"
#include "util/widening_vector.hpp"
#include "util/strings.hpp"

/* NOTE(review): the two #include targets below, and every template
   argument list in this file (e.g. "template <class T>", the
   std::numeric_limits<size_t> type argument, and the element types of
   widening_vector_dynamic / std::unordered_map / std::vector members),
   were lost when this file was extracted. Restore them from the
   original source before compiling; they are left untouched here. */
#include
#include

namespace ParaText {

namespace CSV {

/*
  Represents a chunk of parsed column data for a col-based CSV parser.
  A chunk starts out numeric and is promoted to categorical (interned
  string ids) or free text as the data demands, or as forced by
  `forced_semantics_`.
*/
class ColBasedChunk {
public:
  /*
    Creates a new chunk with an empty name.
  */
  ColBasedChunk() : max_level_name_length_(std::numeric_limits::max()), max_levels_(std::numeric_limits::max()), forced_semantics_(Semantics::UNKNOWN) {}

  /*
    Creates a new chunk.

    \param column_name The name of the column for the chunk.
  */
  ColBasedChunk(const std::string &column_name)
    : column_name_(column_name), max_level_name_length_(std::numeric_limits::max()), max_levels_(std::numeric_limits::max()), forced_semantics_(Semantics::UNKNOWN) {}

  /*
    Creates a new chunk.

    \param column_name The name of the column for the chunk.
    \param max_level_name_length If this field length is exceeded, all string fields in a
    column are considered text rather than categorical levels.
    \param max_levels If this number of levels is exceeded, then all string fields
    in a column are considered categorical.
    \param forced_semantics_ If not UNKNOWN, pins the column's
    interpretation regardless of the data observed.
  */
  ColBasedChunk(const std::string &column_name, size_t max_level_name_length, size_t max_levels, Semantics forced_semantics_)
    : column_name_(column_name), max_level_name_length_(max_level_name_length), max_levels_(max_levels), forced_semantics_(forced_semantics_) {}


  /*
   * Destroys this chunk.
   */
  virtual ~ColBasedChunk() {}

  /*
   * Passes a floating point datum to the column handler. If categorical
   * data was previously passed to this handler (or string semantics are
   * forced), this datum will be converted to a string and treated as
   * categorical.
   */
  void process_float(float val) {
    if (cat_data_.size() > 0 || forced_semantics_ == Semantics::CATEGORICAL || forced_semantics_ == Semantics::TEXT) {
      std::string s(std::to_string(val));
      process_categorical(s.begin(), s.end());
    }
    else {
      number_data_.push_back(val);
    }
  }

  /*
   * Passes an integer datum to the column handler. If categorical
   * data was previously passed to this handler (or string semantics are
   * forced), this datum will be converted to a string and treated as
   * categorical.
   */
  void process_integer(long val) {
    if (cat_data_.size() > 0 || forced_semantics_ == Semantics::CATEGORICAL || forced_semantics_ == Semantics::TEXT) {
      std::string s(std::to_string(val));
      process_categorical(s.begin(), s.end());
    }
    else {
      number_data_.push_back(val);
    }
  }

  /*
   * Passes a categorical datum to the column handler. If numerical data
   * was previously passed to this handler, all previous data passed will
   * be converted to a string. An empty field arriving in a numeric column
   * is recorded as 0 rather than forcing a string conversion.
   */
  template
  void process_categorical(Iterator begin, Iterator end) {
    if (forced_semantics_ == Semantics::NUMERIC) {
      number_data_.push_back((float)bsd_strtod(begin, end));
    }
    else if (number_data_.size() > 0) {
      if (begin == end) {
        //std::cout << "{" << std::string(begin, end);
        number_data_.push_back((long)0);
      }
      else {
        //std::cout << "[" << std::string(begin, end);
        convert_to_cat_or_text();
        std::string key(begin, end);
        add_cat_data(key);
      }
    }
    else {
      std::string key(begin, end);
      add_cat_data(key);
    }
  }

  /*
    Returns the semantics of this column, inferred from whichever
    storage is currently populated (categorical wins over text, text
    over numeric).
  */
  Semantics get_semantics() const {
    if (cat_data_.size() > 0) {
      return Semantics::CATEGORICAL;
    }
    else if (text_data_.size() > 0) {
      return Semantics::TEXT;
    }
    else {
      return Semantics::NUMERIC;
    }
  }

  /*
    Returns the type index of the data in this column.
  */
  std::type_index get_type_index() const {
    if (cat_data_.size() > 0) {
      return cat_data_.get_type_index();
    } else if (text_data_.size() > 0) {
      return std::type_index(typeid(text_data_));
    }
    else {
      return number_data_.get_type_index();
    }
  }

  /*
    Returns the narrowest type that can represent both this chunk's data
    and a value of type `other` (std::string if either side is stringy).
  */
  std::type_index get_common_type_index(std::type_index &other) const {
    if (cat_data_.size() > 0 || other == std::type_index(typeid(std::string))) {
      return std::type_index(typeid(std::string));
    }
    else {
      return number_data_.get_common_type_index(other);
    }
  }

  /* Numeric accessor: returns element i of the numeric storage. */
  template
  inline typename std::enable_if::value && Numeric, T>::type get(size_t i) const {
    return number_data_.get(i);
  }

  /* Categorical accessor: returns the interned level id of element i. */
  template
  inline typename std::enable_if::value && !Numeric, T>::type get(size_t i) const {
    return cat_data_.get(i);
  }

  /* Returns the level strings, indexed by interned id. */
  const std::vector &get_cat_keys() const {
    return cat_keys_;
  }

  /* Returns the number of values stored, whichever storage is active. */
  size_t size() const {
    if (cat_data_.size() > 0) {
      return cat_data_.size();
    }
    else if (number_data_.size() > 0) {
      return number_data_.size();
    }
    else {
      return text_data_.size();
    }
  }

  /* Releases the chunk's storage.
     NOTE(review): text_data_ is not cleared here, unlike the numeric and
     categorical stores — confirm whether that is intentional or a leak of
     stale text between uses. */
  void clear() {
    number_data_.clear();
    number_data_.shrink_to_fit();
    cat_data_.clear();
    cat_data_.shrink_to_fit();
    cat_ids_.clear();
    cat_keys_.clear();
    cat_keys_.shrink_to_fit();
  }

  /* NOTE(review): despite the name, this returns the interned level id
     (a size_t) of row idx, not the string itself. */
  size_t get_string(size_t idx) {
    return cat_data_.get(idx);
  }

  /* Interns `key` as a categorical level, assigning a new id on first
     sight, and returns its id. */
  size_t get_string_id(const std::string &key) {
    auto it = cat_ids_.find(key);
    if (it == cat_ids_.end()) {
      std::tie(it, std::ignore) = cat_ids_.insert(std::make_pair(key, cat_ids_.size()));
      cat_keys_.push_back(key);
    }
    return it->second;
  }

  /*
   * Converts all floating point data collected by this handler into
   * categorical data.
   */
  void convert_to_cat_or_text() {
    if (number_data_.size() > 0) {
      for (size_t i = 0; i < number_data_.size(); i++) {
        add_cat_data(std::to_string(number_data_.get(i)));
      }
      number_data_.clear();
      number_data_.shrink_to_fit();
    }
  }

  /* Converts whichever of the numeric or categorical stores is populated
     into plain text storage.
     NOTE(review): when forced_semantics_ == TEXT the first branch is
     taken even with empty numeric data, skipping the categorical branch;
     benign as written because cat_data_ is only populated when TEXT is
     not forced — confirm. */
  void convert_to_text() {
    if (number_data_.size() > 0 || forced_semantics_ == Semantics::TEXT) {
      for (size_t i = 0; i < number_data_.size(); i++) {
        text_data_.push_back(std::to_string(number_data_.get(i)));
      }
      number_data_.clear();
      number_data_.shrink_to_fit();
    }
    else if (cat_data_.size() > 0) {
      for (size_t i = 0; i < cat_data_.size(); i++) {
        text_data_.push_back(cat_keys_[cat_data_.get(i)]);
      }
      cat_data_.clear();
      cat_data_.shrink_to_fit();
      cat_ids_.clear();
      cat_keys_.clear();
      cat_keys_.shrink_to_fit();
    }
  }

  /* Stores one string value, routing it to text or categorical storage
     according to forced semantics and the level-name/level-count caps. */
  void add_cat_data(const std::string &data) {
    if (forced_semantics_ == Semantics::TEXT || text_data_.size() > 0) {
      text_data_.push_back(data);
    }
    else if (forced_semantics_ == Semantics::CATEGORICAL) {
      cat_data_.push_back((long)get_string_id(data));
    }
    else if (data.size() > max_level_name_length_ || cat_keys_.size() > max_levels_) {
      // Caps exceeded: demote the whole column to text.
      convert_to_text();
      text_data_.push_back(data);
    }
    else {
      cat_data_.push_back((long)get_string_id(data));
    }
  }

  /* Returns the text value of row i (valid only for text columns). */
  const std::string &get_text(size_t i) const {
    return text_data_[i];
  }

  /* Copies the numeric storage into `out` (caller sizes the buffer). */
  template
  void copy_numeric_into(T *out) {
    number_data_.copy_into(out);
  }

  /* Copies the categorical id storage into `out` (caller sizes the buffer). */
  template
  void copy_cat_into(T *out) {
    cat_data_.copy_into(out);
  }

  /* Returns the total number of bytes across all text values. */
  size_t get_text_length_sum() const {
    size_t sum = 0;
    for (size_t i = 0; i < text_data_.size(); i++) {
      sum += text_data_[i].size();
    }
    return sum;
  }

  /* Returns the sum of the numeric storage as type T. */
  template
  T get_number_sum() const {
    return number_data_.get_sum();
  }

private:
  std::string column_name_;
  widening_vector_dynamic number_data_;
  widening_vector_dynamic cat_data_;
  std::unordered_map cat_ids_;
  std::vector cat_keys_;
  std::vector text_data_;
  size_t max_level_name_length_;
  size_t max_levels_;
  Semantics forced_semantics_;
};
}
}
#endif
You can install it as follows: 27 | 28 | ``` 29 | conda install swig 30 | ``` 31 | 32 | Building Python 33 | --------------- 34 | 35 | First, go into the `python` directory: 36 | 37 | ``` 38 | cd python/ 39 | ``` 40 | 41 | Then run `setup.py`: 42 | 43 | ``` 44 | python setup.py build install 45 | ``` 46 | 47 | Use the `--prefix` option if you prefer to install ParaText to a 48 | different location: 49 | 50 | ``` 51 | cd python/ 52 | python setup.py build install --prefix=/my/prefix/dir 53 | ``` 54 | 55 | 56 | Using ParaText in Python 57 | ======================== 58 | 59 | First, import the `paratext` Python package. 60 | 61 | ``` 62 | import paratext 63 | ``` 64 | 65 | Loading into Pandas 66 | ------------------- 67 | 68 | A CSV file can be loaded into Pandas in just one line of code using 69 | the `load_csv_to_pandas` function. 70 | 71 | ``` 72 | df = paratext.load_csv_to_pandas("hepatitis.csv") 73 | ``` 74 | 75 | The data frame looks something like this: 76 | 77 | ``` 78 | In [1]: print df.head() 79 | AGE SEX STEROID ANTIVIRALS FATIGUE MALAISE ANOREXIA LIVER_BIG \ 80 | 0 30 male no no no no no no 81 | 1 50 female no no yes no no no 82 | 2 78 female yes no yes no no yes 83 | 3 31 female nan yes no no no yes 84 | 4 34 female yes no no no no yes 85 | 86 | LIVER_FIRM SPLEEN_PALPABLE SPIDERS ASCITES VARICES BILIRUBIN \ 87 | 0 no no no no no 1.0 88 | 1 no no no no no 0.9 89 | 2 no no no no no 0.7 90 | 3 no no no no no 0.7 91 | 4 no no no no no 1.0 92 | 93 | ALK_PHOSPHATE SGOT ALBUMIN PROTIME HISTOLOGY Class 94 | 0 85 18 4.0 NaN no LIVE 95 | 1 135 42 3.5 NaN no LIVE 96 | 2 96 32 4.0 NaN no LIVE 97 | 3 46 52 4.0 80 no LIVE 98 | 4 NaN 200 4.0 NaN no LIVE 99 | ``` 100 | 101 | Loading into Dictionaries (more memory-efficient) 102 | ------------------------------------------------- 103 | 104 | A Python dictionary of arrays is preferable over a DataFrame 105 | if the memory budget is very tight. 
The `load_csv_to_dict` 106 | loads a CSV file, storing the columns as a dictionary of 107 | arrays. 108 | 109 | ``` 110 | dict_frame, levels = paratext.load_csv_to_dict(filename) 111 | ``` 112 | 113 | It returns a two element tuple. The first `dict_frame` is a Python 114 | dictionary that maps column names to column data. The second `levels` 115 | is also a Python dictionary keyed by column name. It contains a list 116 | of level strings for each categorical column. 117 | 118 | The following code visits the columns. For each column, it 119 | prints its name, the first 5 values of its data, and the categorical 120 | levels (`None` if not categorical). 121 | 122 | ``` 123 | for key in dict_frame.keys(): 124 | print key, repr(dict_frame[key][0:5]), levels.get(key, None) 125 | ``` 126 | 127 | This gives the following output: 128 | 129 | ``` 130 | PROTIME array([ nan, nan, nan, 80., nan], dtype=float32) None 131 | LIVER_BIG array([0, 0, 1, 1, 1], dtype=uint8) ['no' 'yes' 'nan'] 132 | ALBUMIN array([ 4. , 3.5, 4. , 4. , 4. ], dtype=float32) None 133 | ALK_PHOSPHATE array([ 85., 135., 96., 46., nan], dtype=float32) None 134 | ANTIVIRALS array([0, 0, 0, 1, 0], dtype=uint8) ['no' 'yes'] 135 | HISTOLOGY array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes'] 136 | BILIRUBIN array([ 1., 0.89999998, 0.69999999, 0.69999999, 1. 
], dtype=float32) None 137 | AGE array([30, 50, 78, 31, 34], dtype=uint8) None 138 | SEX array([0, 1, 1, 1, 1], dtype=uint8) ['male' 'female'] 139 | STEROID array([0, 0, 1, 2, 1], dtype=uint8) ['no' 'yes' 'nan'] 140 | SGOT array([ 18., 42., 32., 52., 200.], dtype=float32) None 141 | MALAISE array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 142 | FATIGUE array([0, 1, 1, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 143 | SPIDERS array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 144 | VARICES array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'nan' 'yes'] 145 | LIVER_FIRM array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 146 | SPLEEN_PALPABLE array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 147 | ASCITES array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 148 | Class array([0, 0, 0, 0, 0], dtype=uint8) ['LIVE' 'DIE'] 149 | ANOREXIA array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan'] 150 | ``` 151 | 152 | All categorical columns in this data set have 3 or fewer levels so 153 | they are all `uint8`. A string representation uses at least 8 times 154 | as much space, but it can also be less computationally efficient. An 155 | integer representation is ideal for learning on categorical columns. 156 | Integer comparisons over contiguous integer buffers are pretty cheap 157 | compared to exhaustive string comparisons on (potentially) 158 | discontiguous string values. This makes a big difference for 159 | combinatorial learning algorithms. 160 | 161 | Handling Multi-Line Fields 162 | -------------------------- 163 | 164 | ParaText supports reading CSV files with multi-line fields in 165 | parallel. This feature must be explicitly activated as it requires 166 | extra overhead to adjust the boundaries of the chunks processed by 167 | the workers. 168 | 169 | ``` 170 | df = paratext.load_csv_to_pandas("messy.csv", allow_quoted_newlines=True) 171 | ``` 172 | 173 | Header Detection 174 | ---------------- 175 | 176 | ParaText detects the presence of a header. 
This can be turned off with 177 | `no_header=True`. 178 | 179 | Column Typing 180 | ------------- 181 | 182 | This library distinguishes between a column's data type and its semantics. 183 | The semantics defines how to interpret a column (e.g. numeric vs. categorical). 184 | and the data type (`uint8`, `int64`, `float`, etc.) is the type for encoding 185 | column values. 186 | 187 | Three semantic types are supported: 188 | 189 | * `num`: numeric data. 190 | 191 | * `cat`: categorical data. 192 | 193 | * `text`: large strings like e-mails and text documents. 194 | 195 | ParaText supports `(u)int(8|16|32|64)|float|double|string` data types. 196 | 197 | Parameters 198 | ---------- 199 | 200 | Most CSV loading functions in ParaText have the following parameters: 201 | 202 | * `cat_names`: A list of column names to force as categorical regardless 203 | of the inferred type. 204 | 205 | * `text_names`: A list of column names that should be treated as rich text 206 | regardless of its inferred type. 207 | 208 | * `num_names`: A list of column names that should be treated as 209 | numeric regardless of its inferred type. 210 | 211 | * `num_threads`: The number of parser threads to spawn. The default 212 | is the number of cores. 213 | 214 | * `allow_quoted_newlines`: Allows multi-line text fields. This 215 | is turned off by default. 216 | 217 | * `no_header`: Do not auto-detect the presence of a header. Assume 218 | the first line is data. This is turned off by default. 219 | 220 | * `max_level_name_length`: If a field's length exceeds this value, 221 | the entire column is treated as text rather than 222 | categorical. The default is unlimited. 223 | 224 | * `max_levels`: The maximum number of levels of a categorical column. 225 | The default is unlimited. 226 | 227 | * `number_only`: Whether it can be safely assumed the columns only 228 | contain numbers. The default is unlimited. 
* `block_size`: The number of bytes to read at a time in each worker
  thread. The default is unlimited.

Escape Characters
-----------------

ParaText supports backslash escape characters:

* `\t`: tab

* `\n`: newline

* `\r`: carriage return

* `\v`: vertical tab

* `\0`: null terminator (0x00)

* `\b`: backspace

* `\xnn`: an 8-bit character represented with a 2-digit hexadecimal number.

* `\unnnn`: a Unicode code point represented as a 4-digit hexadecimal number.

* `\Unnnnnnnn`: a Unicode code point represented as an 8-digit hexadecimal number.

Writing CSV
-----------

ParaText does not yet support parallel CSV writing. However, it bundles a CSV
writer that can be used to write DataFrames with arbitrary string and byte
buffer data in a lossless fashion.

If a character in a Python `string`, `unicode`, or `bytes`
object could be treated as non-data when parsed (e.g. a doublequote or
escape character), it is escaped. Moreover, any character that is outside
the desired encoding is also escaped. This enables, for example,
the lossless writing of non-UTF-8 data to a UTF-8 file.

For example, to restrict the encoding to 7-bit printable ASCII, pass
`out_encoding='printable_ascii'`:

```
import paratext.serial
df = pandas.DataFrame({"X": [b"\xff\\\n \" oh my!"]})
paratext.serial.save_frame("lossless.csv", df, allow_quoted_newlines=True, out_encoding='printable_ascii', dos=False)
```

This results in a file:

```
"X"
"\xff\\
\" oh my!"
```

To preserve UTF-8 characters instead, pass `out_encoding='utf-8'` to ``save_frame``.
287 | 288 | ``` 289 | import paratext.serial 290 | df = pandas.DataFrame({"X": [b"\xff\\\n \" oh my!"],"Y": ["\U0001F600"]}) 291 | paratext.serial.save_frame("lossless2.csv", df, allow_quoted_newlines=True, out_encoding='utf-8', dos=False) 292 | ``` 293 | 294 | Now, the file only escapes cells in the DataFrame with 295 | non-UTF8 data. All other UTF8 characters are preserved. 296 | ``` 297 | "X","Y" 298 | "\xff\\ 299 | \" oh my!","" 300 | ``` 301 | 302 | Other Notes 303 | ----------- 304 | 305 | ParaText is a work-in-progress. There are a few unimplemented features 306 | that may prevent it from working on all CSV files. We note them below. 307 | 308 | 1. There is no way to supply type hints (e.g. `uint64` or `float`) of a 309 | column. Only the interpretation of a column (numeric, categorical, or 310 | text) can be forced. 311 | 312 | 2. DateTime will be supported in a future release. 313 | -------------------------------------------------------------------------------- /src/csv/colbased_worker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. The ASF licenses this file 10 | to you under the Apache License, Version 2.0 (the 11 | "License"); you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, 17 | software distributed under the License is distributed on an 18 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | KIND, either express or implied. 
See the License for the 20 | specific language governing permissions and limitations 21 | under the License. 22 | */ 23 | 24 | /* 25 | Coder: Damian Eads. 26 | */ 27 | 28 | #ifndef WISEIO_PARSE_WORKER_COL_BASED_HPP 29 | #define WISEIO_PARSE_WORKER_COL_BASED_HPP 30 | 31 | #include "util/strings.hpp" 32 | #include "util/widening_vector.hpp" 33 | 34 | #include 35 | #include 36 | #include 37 | 38 | namespace ParaText { 39 | 40 | namespace CSV { 41 | 42 | template 43 | class ColBasedParseWorker { 44 | public: 45 | ColBasedParseWorker(std::vector > &handlers) : handlers_(handlers), lines_parsed_(0), quote_started_('\0'), column_index_(0), escape_jump_(0) {} 46 | 47 | virtual ~ColBasedParseWorker() {} 48 | 49 | void parse(const std::string &filename, 50 | size_t begin, 51 | size_t end, 52 | size_t data_begin, 53 | size_t file_end, 54 | const ParaText::ParseParams ¶ms) { 55 | try { 56 | if (params.number_only) { 57 | parse_impl(filename, begin, end, data_begin, file_end, params); 58 | } 59 | else { 60 | parse_impl(filename, begin, end, data_begin, file_end, params); 61 | } 62 | } 63 | catch (...) 
{ 64 | thread_exception_ = std::current_exception(); 65 | } 66 | } 67 | 68 | std::exception_ptr get_exception() { 69 | return thread_exception_; 70 | } 71 | 72 | template 73 | void parse_impl(const std::string &filename, 74 | size_t begin, 75 | size_t end, 76 | size_t data_begin, 77 | size_t file_end, 78 | const ParaText::ParseParams ¶ms) { 79 | (void)data_begin; 80 | (void)file_end; 81 | std::ifstream in; 82 | in.open(filename.c_str()); 83 | column_index_ = 0; 84 | quote_started_ = '\0'; 85 | escape_jump_ = 0; 86 | size_t current = begin; 87 | size_t spos_line = begin, epos_line = begin; 88 | const size_t block_size = params.block_size; 89 | convert_null_to_space_ = params.convert_null_to_space; 90 | char buf[block_size]; 91 | in.seekg(current, std::ios_base::beg); 92 | definitely_string_ = false; 93 | #ifdef PARALOAD_DEBUG 94 | size_t round = 0; 95 | #endif 96 | while (current <= end) { 97 | if (current % block_size == 0) { /* The block is aligned. */ 98 | in.read(buf, std::min(end - current + 1, block_size)); 99 | } 100 | else { /* Our first read should ensure our further reads are block-aligned. */ 101 | in.read(buf, std::min(end - current + 1, std::min(block_size, current % block_size))); 102 | } 103 | size_t nread = in.gcount(); 104 | #ifdef PARALOAD_DEBUG 105 | if (round == 0) { 106 | std::cout << "R{" << std::string((char *)buf, (char *)buf + nread) << std::endl; 107 | } 108 | round++; 109 | #endif 110 | if (nread == 0) { 111 | break; 112 | } 113 | if (NumberOnly) { 114 | size_t i = 0; 115 | for (; i < nread; i++) { 116 | if (buf[i] == ',') { 117 | process_token_number_only(); 118 | } 119 | else if (buf[i] == '\r') { /* do nothing. 
*/} 120 | else if (buf[i] == '\n') { 121 | epos_line = current + i; 122 | if (epos_line - spos_line > 0) { 123 | process_token_number_only(); 124 | process_newline(); 125 | } 126 | spos_line = epos_line + 1; 127 | epos_line = spos_line; 128 | } else { 129 | token_.push_back(buf[i]); 130 | } 131 | } 132 | } else { 133 | for (size_t i = 0; i < nread;) { 134 | if (quote_started_ != '\0') { 135 | for (; i < nread; i++) { 136 | if (escape_jump_ > 0) { 137 | escape_jump_--; 138 | } 139 | else if (buf[i] == '\\') { 140 | escape_jump_ = 1; 141 | } 142 | else if (buf[i] == quote_started_) { 143 | i++; 144 | quote_started_ = '\0'; 145 | break; 146 | } 147 | token_.push_back(buf[i]); 148 | } 149 | } 150 | else { 151 | for (; i < nread; i++) { 152 | if (escape_jump_ > 0) { 153 | escape_jump_--; 154 | if (buf[i] == 'x') { 155 | escape_jump_ += 2; 156 | } 157 | else if (buf[i] == 'u') { 158 | escape_jump_ += 4; 159 | } 160 | token_.push_back(buf[i]); 161 | } 162 | else if (buf[i] == '\\') { 163 | escape_jump_ = 1; 164 | token_.push_back(buf[i]); 165 | } 166 | else if (buf[i] == '"') { 167 | i++; 168 | quote_started_ = '\"'; 169 | definitely_string_ = true; 170 | break; 171 | } 172 | else if (buf[i] == ',') { 173 | process_token(); 174 | } 175 | else if (buf[i] == '\r') { /* do nothing: dos wastes a byte each line. */ } 176 | else if (buf[i] == '\n') { 177 | epos_line = current + i; 178 | if (epos_line - spos_line > 0) { 179 | process_token(); 180 | process_newline(); 181 | } 182 | spos_line = epos_line + 1; 183 | epos_line = spos_line; 184 | } 185 | else { 186 | token_.push_back(buf[i]); 187 | } 188 | } 189 | } 190 | } 191 | } 192 | current += nread; 193 | } 194 | epos_line = end + 1; 195 | //std::cout << "start line: " << spos_line << " end line: " << epos_line << std::endl; 196 | /* 197 | If we're in the last column position, process the token as some files 198 | do not end with a newline. 
199 | */ 200 | if (token_.size() > 0) { 201 | if (NumberOnly) { 202 | process_token_number_only(); 203 | } else { 204 | process_token(); 205 | } 206 | } 207 | /* 208 | If there was data on the last line, process it. 209 | */ 210 | if (column_index_ > 0) { 211 | process_newline(); 212 | } 213 | #ifdef PARALOAD_DEBUG 214 | std::cout << "lines parsed: " << lines_parsed_ << std::endl; 215 | #endif 216 | return; 217 | } 218 | 219 | void process_newline() { 220 | if (column_index_ != handlers_.size()) { 221 | std::ostringstream ostr; 222 | ostr << "improper number of columns on line number (unquoted in chunk): " << (lines_parsed_ + 1) << ". Expected: " << handlers_.size(); 223 | throw std::logic_error(ostr.str()); 224 | } 225 | column_index_ = 0; 226 | lines_parsed_++; 227 | } 228 | 229 | void process_token_number_only() { 230 | if (column_index_ >= handlers_.size()) { 231 | std::ostringstream ostr; 232 | ostr << "too many columns on line number (unquoted in chunk): " << (lines_parsed_ + 1) << ". Expected: " << handlers_.size(); 233 | throw std::logic_error(ostr.str()); 234 | } 235 | size_t i = 0; 236 | for (; i < token_.size() && isspace(token_[i]); i++) {} 237 | if (i < token_.size()) { 238 | if (token_[i] == '?' && token_.size() - i == 1) { 239 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 240 | } 241 | else if (token_.size() - i == 3 && 242 | ((token_[i] == 'n' || token_[i] == 'N')) 243 | && ((token_[i+1] == 'a' || token_[i+1] == 'A')) 244 | && (token_[i+2] == 'n' || token_[i+2] == 'N')) { 245 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 246 | } 247 | else { 248 | if (token_[i] == '-') { i++; } 249 | for (; i < token_.size() && isdigit(token_[i]); i++) {} 250 | if (i < token_.size() && (token_[i] == '.' 
|| token_[i] == 'E' || token_[i] == 'e')) { 251 | handlers_[column_index_]->process_float(bsd_strtod(token_.begin(), token_.end())); 252 | } 253 | else { 254 | handlers_[column_index_]->process_integer(fast_atoi(token_.begin(), token_.end())); 255 | } 256 | } 257 | } else { 258 | handlers_[column_index_]->process_integer(0); 259 | } 260 | column_index_++; 261 | token_.clear(); 262 | } 263 | 264 | void process_token() { 265 | if (column_index_ >= handlers_.size()) { 266 | std::ostringstream ostr; 267 | ostr << "too many columns on line number (unquoted in chunk): " << (lines_parsed_ + 1) << ". Expected: " << handlers_.size(); 268 | throw std::logic_error(ostr.str()); 269 | } 270 | if (definitely_string_) { 271 | parse_unquoted_string(token_.begin(), token_.end(), std::back_inserter(token_aux_)); 272 | if (convert_null_to_space_) { 273 | convert_null_to_space(token_aux_.begin(), token_aux_.end()); 274 | } 275 | handlers_[column_index_]->process_categorical(token_aux_.begin(), token_aux_.end()); 276 | token_aux_.clear(); 277 | definitely_string_ = false; 278 | } 279 | else { 280 | size_t i = 0; 281 | bool integer_possible = false, float_possible = false, exp_possible = false, handled = false; 282 | for (; i < token_.size() && isspace(token_[i]); i++) {} 283 | if (i < token_.size()) { 284 | if (token_[i] == '-') { 285 | i++; 286 | } 287 | else if (token_[i] == '?' 
&& token_.size() - i == 1) { 288 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 289 | handled = true; 290 | } 291 | else if ((token_[i] == 'n' || token_[i] == 'N') && token_.size() - i == 3) { 292 | if ((token_[i+1] == 'a' || token_[i+1] == 'A') && (token_[i+2] == 'n' || token_[i+2] == 'N')) { 293 | handlers_[column_index_]->process_float(std::numeric_limits::quiet_NaN()); 294 | handled = true; 295 | } 296 | } 297 | } 298 | if (!handled) { 299 | if (i < token_.size()) { 300 | integer_possible = std::isdigit(token_[i]); 301 | i++; 302 | float_possible = integer_possible, exp_possible = integer_possible; 303 | while (i < token_.size() && integer_possible) { 304 | integer_possible = isdigit(token_[i]); 305 | i++; 306 | } 307 | if (i < token_.size()) { 308 | integer_possible = false; 309 | float_possible = token_[i] == '.'; 310 | i++; 311 | while (i < token_.size() && float_possible) { 312 | float_possible = isdigit(token_[i]); 313 | i++; 314 | } 315 | if (float_possible && i < token_.size()) { 316 | float_possible = false; 317 | exp_possible = token_[i] == 'E' || token_[i] == 'e'; 318 | i++; 319 | if (exp_possible && i < token_.size()) { 320 | //std::cout << "A"; 321 | if (token_[i] == '+' || token_[i] == '-') { 322 | //std::cout << "B"; 323 | i++; 324 | if (i < token_.size()) { 325 | //std::cout << "C"; 326 | exp_possible = isdigit(token_[i]); 327 | i++; 328 | while (i < token_.size() && exp_possible) { 329 | exp_possible = isdigit(token_[i]); 330 | i++; 331 | } 332 | } 333 | else { 334 | exp_possible = false; 335 | } 336 | } 337 | else if (isdigit(token_[i])) { 338 | //std::cout << "D"; 339 | while (i < token_.size() && exp_possible) { 340 | exp_possible = isdigit(token_[i]); 341 | i++; 342 | } 343 | //std::cout << "E" << exp_possible << (token_[i-1]); 344 | } 345 | else { 346 | exp_possible = false; 347 | } 348 | } 349 | else { 350 | exp_possible = false; 351 | } 352 | } 353 | } 354 | } 355 | if (integer_possible) { 356 | 
handlers_[column_index_]->process_integer(fast_atoi(token_.begin(), token_.end())); 357 | } 358 | else if (float_possible || exp_possible) { 359 | handlers_[column_index_]->process_float(bsd_strtod(token_.begin(), token_.end())); 360 | } 361 | else { 362 | parse_unquoted_string(token_.begin(), token_.end(), std::back_inserter(token_aux_)); 363 | if (convert_null_to_space_) { 364 | convert_null_to_space(token_aux_.begin(), token_aux_.end()); 365 | } 366 | handlers_[column_index_]->process_categorical(token_aux_.begin(), token_aux_.end()); 367 | token_aux_.clear(); 368 | } 369 | 370 | } 371 | } 372 | column_index_++; 373 | token_.clear(); 374 | } 375 | 376 | void convert_to_cat_or_text(size_t column_index) { 377 | handlers_[column_index]->convert_to_cat_or_text(); 378 | } 379 | 380 | void convert_to_text(size_t column_index) { 381 | handlers_[column_index]->convert_to_text(); 382 | } 383 | 384 | private: 385 | std::vector > handlers_; 386 | std::vector token_; 387 | std::vector token_aux_; 388 | std::vector > long_cache_; 389 | std::vector > double_cache_; 390 | std::vector str_cache_data_; 391 | std::vector str_cache_offsets_; 392 | std::vector str_cache_column_; 393 | bool definitely_string_; 394 | size_t lines_parsed_; 395 | char quote_started_; 396 | size_t column_index_; 397 | size_t escape_jump_; 398 | bool convert_null_to_space_; 399 | std::exception_ptr thread_exception_; 400 | }; 401 | } 402 | } 403 | 404 | #endif 405 | -------------------------------------------------------------------------------- /src/generic/chunker.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | ParaText: parallel text reading 4 | Copyright (C) 2016. wise.io, Inc. 5 | 6 | Licensed to the Apache Software Foundation (ASF) under one 7 | or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. 
The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied. See the License for the
   specific language governing permissions and limitations
   under the License.
*/

/*
  Coder: Damian Eads.
*/

#ifndef PARATEXT_LINE_CHUNKER2_HPP
#define PARATEXT_LINE_CHUNKER2_HPP

/* NOTE(review): the targets of these #include directives (and the template
   arguments of std::pair/std::vector/std::make_shared below) appear to have
   been stripped by the extraction of this dump — restore them from the
   original header before compiling. */
#include
#include

#include
#include
#include
#include
#include
#include

#include "quote_adjustment_worker.hpp"

namespace ParaText {

/*
  Finds chunks in a text file that break on an unquoted
  newline. Text files are separated by newline separators. If
  quoted newlines are supported, they are ignored for the purposes
  of separating lines.
*/
class TextChunker {
public:
  /*
    Constructs a new chunker with no chunk boundaries initialized.
  */
  TextChunker() {}

  /*
    Destroys this text chunker.
  */
  virtual ~TextChunker() {}

  /*
    Computes the boundaries of the text chunks.

    \param filename The text filename to open to compute offsets.
    \param starting_offset The starting offset of the first chunk.
    \param maximum_chunks The maximum number of chunks. The number of chunks
                          will be as close to this number as possible.
    \param allow_quoted_newlines Whether newlines inside quotes are data
                          rather than record separators.

    \throws std::logic_error if the file cannot be stat'ed or opened.
  */
  void process(const std::string &filename, size_t starting_offset, size_t maximum_chunks, bool allow_quoted_newlines) {
    filename_ = filename;
    starting_offset_ = starting_offset;
    maximum_chunks_ = maximum_chunks;
    struct stat fs;
    if (stat(filename.c_str(), &fs) == -1) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    length_ = fs.st_size;
    /* lastpos_ is the index of the final byte (inclusive bounds are used
       throughout); an empty file degenerates to 0. */
    if (length_ > 0) {
      lastpos_ = length_ - 1;
    }
    else {
      lastpos_ = 0;
    }
    in_.open(filename.c_str());
    if (!in_) {
      std::ostringstream ostr;
      ostr << "cannot open file '" << filename << "'";
      throw std::logic_error(ostr.str());
    }
    compute_offsets(allow_quoted_newlines);
  }

  /*
    Returns the number of chunks determined by this chunker.
  */
  size_t num_chunks() const {
    return start_of_chunk_.size();
  }

  /*
    Returns the (start, end) boundaries of a specific chunk. The ending
    index is always inclusive.
  */
  std::pair get_chunk(size_t index) const {
    return std::make_pair(start_of_chunk_[index], end_of_chunk_[index]);
  }

private:
  /*
    Counts the run of consecutive backslashes ending exactly at
    end_of_chunk, and fetches the byte that follows the chunk (0 if at
    EOF).  An odd count means the chunk boundary splits an escape
    sequence.  Returns (count, following byte).
  */
  std::pair get_num_trailing_escapes(long start_of_chunk, long end_of_chunk) {
    long num_trailing_escapes = 0;
    long k = end_of_chunk;
    char successor = 0;
    if (end_of_chunk < lastpos_) {
      in_.clear();
      in_.seekg(end_of_chunk + 1, std::ios_base::beg);
      in_.read(&successor, 1);
    }

    /* Walk backwards while the bytes are backslashes. */
    for (; k >= start_of_chunk; k--) {
      in_.clear();
      in_.seekg(k, std::ios_base::beg);
      char buf;
      in_.read(&buf, 1);
      size_t nread = in_.gcount();
      if (nread == 0 || buf != '\\') {
        break;
      }
      num_trailing_escapes++;
    }
    return std::make_pair(num_trailing_escapes, successor);
  }

  /*
    Splits [starting_offset_, lastpos_] into up to maximum_chunks_ spans of
    roughly equal size, nudging each boundary forward so it never lands in
    the middle of a backslash escape sequence, then adjusts the boundaries
    to land on (quoted or unquoted) newlines.
  */
  void compute_offsets(bool allow_quoted_newlines = true) {
    const size_t chunk_size = std::max(2L, (long)((length_ - starting_offset_) / maximum_chunks_));
    long start_of_chunk = starting_offset_;
#ifdef PARALOAD_DEBUG
    std::cerr << "number of threads: " << maximum_chunks_ << std::endl;
    std::cerr << "length: " << length_ << std::endl;
#endif
    for (size_t worker_id = 0; worker_id < maximum_chunks_; worker_id++) {
      long end_of_chunk = std::min(lastpos_, start_of_chunk + (long)chunk_size);
      if (end_of_chunk < start_of_chunk) {
        /* Ran off the end of the file: emit a final empty chunk and stop. */
        start_of_chunk = lastpos_ + 1;
        end_of_chunk = lastpos_ + 1;
        start_of_chunk_.push_back(start_of_chunk);
        end_of_chunk_.push_back(end_of_chunk);
        break;
      }
#ifdef PARALOAD_DEBUG
      std::cerr << "initial>>> start_of_chunk: " << start_of_chunk << " end_of_chunk: " << end_of_chunk << std::endl;
#endif
      if (worker_id == maximum_chunks_ - 1) {
        end_of_chunk = lastpos_;   /* last chunk always reaches EOF */
      }
      long trailing_escapes;
      char trailing_successor;
      std::tie(trailing_escapes, trailing_successor) = get_num_trailing_escapes(start_of_chunk, end_of_chunk);
      if (trailing_escapes % 2 == 1) {
        /* The boundary splits an escape sequence; `extra` is how many more
           bytes the sequence needs beyond the backslash. */
        long extra = 0;
        switch (trailing_successor) {
        case 'x': /* \xYY */
          extra = 3;
          break;
        case 'u': /* \uXXXX */
          extra = 5;
          break;
        case 'U': /* \UXXXXXXXX */
          extra = 9;
          break;
        case 'n':
        case '0':
        case 'r':
        case 'v':
        case 't':
        case 'b':
        case '\\':
        case '\"':
        case '\'':
        case '{':
        case '}':
        case ' ':
        case ',':
        case ')':
        case '(':
          extra = 1;
          break;
        default:
          {
            std::ostringstream ostr;
            ostr << "invalid escape character: \\" << trailing_successor;
            throw std::logic_error(ostr.str());
          }
        }
        if (end_of_chunk + extra > lastpos_) {
          std::ostringstream ostr;
          ostr << "file ends with a trailing escape sequence \\" << trailing_successor;
          throw std::logic_error(ostr.str());
        }
        else {
          /* Extend the chunk past the backslash so the escape sequence is
             not split across workers. */
          end_of_chunk++;
#ifdef PARALOAD_DEBUG
          std::cerr << "cover escape: " << end_of_chunk << std::endl;
#endif
        }
      }
      start_of_chunk_.push_back(start_of_chunk);
      end_of_chunk_.push_back(end_of_chunk);
      if (end_of_chunk >= lastpos_) {
        break;
      }
      start_of_chunk = end_of_chunk + 1;
    }
    if (allow_quoted_newlines) {
      adjust_offsets_according_to_quoted_newlines();
    }
    else {
      adjust_offsets_according_to_unquoted_newlines();
    }
    for (size_t chunk_id = 0; chunk_id < start_of_chunk_.size(); chunk_id++) {
#ifdef PARALOAD_DEBUG
      std::cerr << "final>>> start_of_chunk: " << start_of_chunk_[chunk_id] << " end_of_chunk: " << end_of_chunk_[chunk_id] << std::endl;
#endif
    }
  }

  /*
    Moves each chunk's end forward to the next newline (quoting ignored),
    then clips or invalidates the later chunks that the move swallowed.
    Invalidated chunks are marked with (-1, -1).
  */
  void adjust_offsets_according_to_unquoted_newlines() {
    const size_t block_size = 512;
    char buf[block_size];
    for (size_t worker_id = 0; worker_id < start_of_chunk_.size(); worker_id++) {
      if (start_of_chunk_[worker_id] < 0 || end_of_chunk_[worker_id] < 0) {
        continue;   /* already invalidated */
      }
      in_.clear();
      in_.seekg(end_of_chunk_[worker_id], std::ios_base::beg);
      long new_end = end_of_chunk_[worker_id];
      bool new_end_found = false;
      long current = new_end;
      while (in_ && !new_end_found) {
        in_.read(buf, block_size);
        size_t nread = in_.gcount();
        if (nread == 0) {
          break;
        }
        for (size_t i = 0; i < nread; i++) {
          if (buf[i] == '\n') {
            new_end = current + i;
            new_end_found = true;
            break;
          }
        }
        current += nread;
      }
      if (!new_end_found) {
        new_end = lastpos_;   /* no newline until EOF */
      }
      end_of_chunk_[worker_id] = new_end;
      /* Repair any later chunks overlapped by the extension. */
      for (size_t other_worker_id = worker_id + 1; other_worker_id < start_of_chunk_.size(); other_worker_id++) {
        if (end_of_chunk_[other_worker_id] <= new_end || new_end == lastpos_) {
          start_of_chunk_[other_worker_id] = -1;
          end_of_chunk_[other_worker_id] = -1;
        } else if (start_of_chunk_[other_worker_id] <= new_end) {
          start_of_chunk_[other_worker_id] = new_end + 1;
          end_of_chunk_[other_worker_id] = std::max(end_of_chunk_[other_worker_id], new_end + 1);
        }
      }
    }
  }

  /*
    Quoted-newline-aware adjustment.  Each chunk is scanned in parallel by a
    QuoteNewlineAdjustmentWorker that counts unescaped quotes and records the
    first quoted/unquoted newline.  The parity of the cumulative quote count
    at a boundary tells whether that boundary fell inside a quoted cell, and
    chunk ends are then moved to the appropriate newline.  Invalidated
    chunks are marked with (-1, -1).
  */
  void adjust_offsets_according_to_quoted_newlines() {
    std::vector threads;
    std::vector > workers;
    std::exception_ptr thread_exception;
    for (size_t worker_id = 0; worker_id < start_of_chunk_.size(); worker_id++) {
      workers.push_back(std::make_shared(start_of_chunk_[worker_id],
                                         end_of_chunk_[worker_id]));
      threads.emplace_back(&QuoteNewlineAdjustmentWorker::parse, workers.back(), filename_);
    }
    for (size_t thread_id = 0; thread_id < threads.size(); thread_id++) {
      threads[thread_id].join();
      if (!thread_exception) {
        thread_exception = workers[thread_id]->get_exception();
      }
    }
    for (size_t chunk_id = 0; chunk_id < workers.size(); chunk_id++) {
#ifdef PARALOAD_DEBUG
      std::cerr << "quotes>>> wid=" << chunk_id << " start_of_chunk: " << start_of_chunk_[chunk_id] << " end_of_chunk: " << end_of_chunk_[chunk_id] << " num_quotes: " << workers[chunk_id]->get_num_quotes() << std::endl;
#endif
    }
    // We're now outside the parallel region.
    if (thread_exception) {
      std::rethrow_exception(thread_exception);
    }
    /* cumulative_quote_sum[i] = total unescaped quotes in chunks 0..i. */
    std::vector cumulative_quote_sum(workers.size(), 0);
    if (workers.size() > 0) {
      cumulative_quote_sum[0] = workers[0]->get_num_quotes();
      for (size_t i = 1; i < workers.size(); i++) {
        cumulative_quote_sum[i] = cumulative_quote_sum[i - 1] + workers[i]->get_num_quotes();
      }
    }
#ifdef PARALOAD_DEBUG
    std::cerr << "total unescaped quotes: " << cumulative_quote_sum.back() << std::endl;
#endif
    size_t current = 0;
    size_t next = 1;
    while (current < workers.size()) {
      if (end_of_chunk_[current] < 0 || start_of_chunk_[current] < 0) {
        start_of_chunk_[current] = -1;
        end_of_chunk_[current] = -1;
#ifdef PARALOAD_DEBUG
        std::cerr << "negative chunk current=" << current << std::endl;
#endif
        current++;
        /* if (next_wid < workers.size()) {
             quotes_so_far += workers[next_wid]->get_num_quotes();
             next_wid++;
           }
           continue;*/
      }
      else if (cumulative_quote_sum[next-1] % 2 == 0) { /* even number of quotes so far. */
        if (next < workers.size()) {
#ifdef PARALOAD_DEBUG
          std::cerr << "[A] current=" << current << " next=" << next << " quotes_so_far=" << cumulative_quote_sum[current] << std::endl;
#endif
          long pos = workers[next]->get_first_unquoted_newline();
          if (pos >= 0) { /* resolved */
            end_of_chunk_[current] = pos;
            if (end_of_chunk_[next] == pos) { /* take all of next chunk. */
              start_of_chunk_[next] = -1;
              end_of_chunk_[next] = -1;
              current = next + 1;
              next += 2;
            }
            else { /* take part of next chunk. */
              start_of_chunk_[next] = pos + 1;
              current = next;
              next++;
            }
          }
          else { /* no resolution. do not increment current */
            end_of_chunk_[current] = end_of_chunk_[next];
            start_of_chunk_[next] = -1;
            end_of_chunk_[next] = -1;
            next++;
          }
        }
        else { /* EOF resolution. */
          end_of_chunk_[current] = lastpos_;
          break;
        }
      }
      else { /* odd number of quotes so far. */
        if (next < workers.size()) {
#ifdef PARALOAD_DEBUG
          std::cerr << "[B] current=" << current << " next=" << next << " quotes_so_far=" << cumulative_quote_sum[next] << std::endl;
#endif
          long pos = workers[next]->get_first_quoted_newline();
          if (pos >= 0) { /* resolution*/
            end_of_chunk_[current] = pos;
            if (end_of_chunk_[next] == pos) { /*take all of next chunk. */
              start_of_chunk_[next] = -1;
              end_of_chunk_[next] = -1;
              current = next + 1;
              next += 2;
            }
            else { /* take part of next chunk. */
              start_of_chunk_[next] = pos + 1;
              current = next;
              next++;
            }
          }
          else { /*no resolution. take all of chunk. */
            end_of_chunk_[current] = end_of_chunk_[next];
            start_of_chunk_[next] = -1;
            end_of_chunk_[next] = -1;
            next++;
          }
        }
        else { /* no resolution and EOF. */
          std::ostringstream ostr;
          ostr << "The file ends with an open quote; a total of " << cumulative_quote_sum[current] << ")";
          throw std::logic_error(ostr.str());
        }
      }
    }
  }

private:
  std::ifstream in_;
  std::string filename_;
  size_t maximum_chunks_;
  size_t length_;           /* file size in bytes */
  long lastpos_;            /* index of the last byte (inclusive) */
  long starting_offset_;
  std::vector start_of_chunk_;   /* -1 marks an invalidated chunk */
  std::vector end_of_chunk_;     /* inclusive; -1 marks an invalidated chunk */
};
}
#endif
--------------------------------------------------------------------------------
/src/python/numpy_helper.hpp:
--------------------------------------------------------------------------------
/*
   ParaText: parallel text reading
   Copyright (C) 2016. wise.io, Inc.

   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements. See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership. The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied. See the License for the
   specific language governing permissions and limitations
   under the License.

*/

/*
  Coder: Damian Eads.
26 | */ 27 | 28 | #ifndef WISEIO_NUMPY_HELPER_HPP 29 | #define WISEIO_NUMPY_HELPER_HPP 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "../generic/encoding.hpp" 40 | 41 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 42 | #include 43 | #include 44 | 45 | 46 | template 47 | struct numpy_type {}; 48 | 49 | template <> struct numpy_type { static const long id = NPY_UINT8; }; 50 | template <> struct numpy_type { static const long id = NPY_INT8; }; 51 | template <> struct numpy_type { static const long id = NPY_UINT16; }; 52 | template <> struct numpy_type { static const long id = NPY_INT16; }; 53 | template <> struct numpy_type { static const long id = NPY_UINT32; }; 54 | template <> struct numpy_type { static const long id = NPY_INT32; }; 55 | template <> struct numpy_type { static const long id = NPY_UINT64; }; 56 | template <> struct numpy_type { static const long id = NPY_INT64; }; 57 | template <> struct numpy_type { static const long id = NPY_FLOAT; }; 58 | template <> struct numpy_type { static const long id = NPY_DOUBLE; }; 59 | template <> struct numpy_type { static const long id = NPY_OBJECT; }; 60 | 61 | #if defined(__APPLE__) 62 | template <> struct numpy_type { static const long id = NPY_ULONG; }; 63 | #endif 64 | 65 | template 66 | struct AsPythonString {}; 67 | 68 | template <> 69 | struct AsPythonString { 71 | PyObject *operator()(const std::string &in) const { 72 | PyObject *attempt = PyUnicode_FromStringAndSize(in.c_str(), in.size()); 73 | if (attempt == NULL) { 74 | PyErr_Clear(); 75 | attempt = PyBytes_FromStringAndSize(in.c_str(), in.size()); 76 | } 77 | return attempt; 78 | } 79 | }; 80 | 81 | template <> 82 | struct AsPythonString { 84 | PyObject *operator()(const std::string &in) const { 85 | #if PY_MAJOR_VERSION >= 3 86 | return PyBytes_FromStringAndSize(in.c_str(), in.size()); 87 | #else 88 | return PyString_FromStringAndSize(in.c_str(), in.size()); 89 | #endif 90 | } 91 | }; 
92 | 93 | template <> 94 | struct AsPythonString { 96 | PyObject *operator()(const std::string &in) const { 97 | #if PY_MAJOR_VERSION >= 3 98 | return PyBytes_FromStringAndSize(in.c_str(), in.size()); 99 | #else 100 | return PyString_FromStringAndSize(in.c_str(), in.size()); 101 | #endif 102 | } 103 | }; 104 | 105 | template <> 106 | struct AsPythonString { 108 | PyObject *operator()(const std::string &in) const { 109 | PyObject *attempt = PyUnicode_FromStringAndSize(in.c_str(), in.size()); 110 | if (attempt == NULL) { 111 | PyErr_Clear(); 112 | attempt = PyBytes_FromStringAndSize(in.c_str(), in.size()); 113 | } 114 | return attempt; 115 | } 116 | }; 117 | 118 | 119 | template 120 | inline PyObject *as_python_string(const std::string &in) { 121 | AsPythonString encoder; 122 | return encoder(in); 123 | } 124 | 125 | template 126 | struct build_array_impl {}; 127 | 128 | template 129 | struct build_array_impl::value>::type> { 130 | typedef typename Container::value_type value_type; 131 | 132 | static PyObject *build_array(const Container &container) { 133 | npy_intp fdims[] = {(npy_intp)container.size()}; 134 | PyObject *array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 135 | try { 136 | value_type *data = (value_type*)PyArray_DATA((PyArrayObject*)array); 137 | for (size_t i = 0; i < container.size(); i++) { 138 | data[i] = container[i]; 139 | } 140 | } 141 | catch (...) 
{ 142 | Py_XDECREF(array); 143 | array = NULL; 144 | std::rethrow_exception(std::current_exception()); 145 | } 146 | return array; 147 | } 148 | 149 | }; 150 | 151 | template 152 | struct build_array_impl::value>::type> { 153 | 154 | typedef typename Container::value_type value_type; 155 | 156 | static PyObject *build_array(const Container &container) { 157 | size_t sz = (size_t)container.size(); 158 | npy_intp fdims[] = {(npy_intp)sz}; 159 | PyObject *array = (PyObject*)PyArray_SimpleNew(1, fdims, NPY_OBJECT); 160 | try { 161 | for (size_t i = 0; i < container.size(); i++) { 162 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 163 | PyObject *newobj = as_python_string(container[i]); 164 | Py_XDECREF(*ref); 165 | *ref = newobj; 166 | } 167 | } 168 | catch (...) { 169 | for (size_t i = 0; i < sz; i++) { 170 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 171 | Py_XDECREF(*ref); 172 | *ref = Py_None; 173 | Py_XINCREF(*ref); 174 | } 175 | Py_XDECREF(array); 176 | std::rethrow_exception(std::current_exception()); 177 | } 178 | return array; 179 | } 180 | 181 | }; 182 | 183 | 184 | template 185 | struct build_array_from_range_impl {}; 186 | 187 | template 188 | struct build_array_from_range_impl::value_type>::value>::type> { 189 | typedef typename Iterator::value_type value_type; 190 | 191 | static PyObject *build_array(const std::pair &range) { 192 | npy_intp fdims[] = {(npy_intp)std::distance(range.first, range.second)}; 193 | PyObject *array = NULL; 194 | try { 195 | array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 196 | value_type *data = (value_type*)PyArray_DATA((PyArrayObject*)array); 197 | size_t i = 0; 198 | for (Iterator it = range.first; it != range.second; it++, i++) { 199 | data[i] = *it; 200 | } 201 | } 202 | catch (...) 
{ 203 | Py_XDECREF(array); 204 | array = NULL; 205 | std::rethrow_exception(std::current_exception()); 206 | } 207 | return array; 208 | } 209 | }; 210 | 211 | template 212 | struct build_array_from_range_impl::value_type, std::string>::value>::type> { 213 | 214 | typedef typename Iterator::value_type value_type; 215 | 216 | static PyObject *build_array(const std::pair &range) { 217 | size_t sz = (npy_intp)std::distance(range.first, range.second); 218 | npy_intp fdims[] = {(npy_intp)sz}; 219 | PyObject *array = (PyObject*)PyArray_SimpleNew(1, fdims, NPY_OBJECT); 220 | try { 221 | size_t i = 0; 222 | for (Iterator it = range.first; it != range.second; it++, i++) { 223 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 224 | PyObject *newobj = as_python_string(*it); 225 | Py_XDECREF(*ref); 226 | *ref = newobj; 227 | } 228 | } 229 | catch (...) { 230 | size_t i = 0; 231 | for (i = 0; i < sz; i++) { 232 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 233 | Py_XDECREF(*ref); 234 | *ref = Py_None; 235 | Py_XINCREF(*ref); 236 | } 237 | Py_XDECREF(array); 238 | array = NULL; 239 | std::rethrow_exception(std::current_exception()); 240 | } 241 | return array; 242 | } 243 | }; 244 | 245 | template 246 | struct base_insert_populator_impl { 247 | base_insert_populator_impl() {} 248 | virtual ~base_insert_populator_impl() {} 249 | 250 | virtual PyObject *populate(const Populator &populator) = 0; 251 | }; 252 | 253 | template 254 | struct derived_insert_populator_impl : public base_insert_populator_impl { 255 | typedef T value_type; 256 | 257 | derived_insert_populator_impl() {} 258 | virtual ~derived_insert_populator_impl() {} 259 | 260 | virtual PyObject *populate(const Populator &populator) { 261 | npy_intp fdims[] = {(npy_intp)populator.size()}; 262 | PyObject *array = NULL; 263 | try { 264 | array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 265 | value_type *data = 
(value_type*)PyArray_DATA((PyArrayObject*)array); 266 | populator.insert_into_buffer(data); 267 | } 268 | catch (...) { 269 | Py_XDECREF(array); 270 | array = NULL; 271 | std::rethrow_exception(std::current_exception()); 272 | } 273 | return array; 274 | } 275 | }; 276 | 277 | 278 | template 279 | struct string_array_output_iterator : public std::iterator { 280 | string_array_output_iterator(PyArrayObject *array) : i(0), array(array) {} 281 | 282 | inline string_array_output_iterator &operator++() { 283 | PyObject *s = as_python_string(output); 284 | PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i); 285 | Py_XDECREF(*ref); 286 | *ref = s; 287 | i++; 288 | return *this; 289 | } 290 | 291 | inline string_array_output_iterator &operator++(int) { 292 | return operator++(); 293 | } 294 | 295 | inline std::string &operator*() { 296 | return output; 297 | } 298 | 299 | long i; 300 | std::string output; 301 | PyArrayObject *array; 302 | }; 303 | 304 | template 305 | struct derived_insert_populator_impl : public base_insert_populator_impl { 306 | typedef std::string value_type; 307 | 308 | derived_insert_populator_impl() {} 309 | virtual ~derived_insert_populator_impl() {} 310 | 311 | virtual PyObject *populate(const Populator &populator) { 312 | using ParaText::Encoding; 313 | npy_intp fdims[] = {(npy_intp)populator.size()}; 314 | PyObject *array = NULL; 315 | array = (PyObject*)PyArray_SimpleNew(1, fdims, numpy_type::id); 316 | try { 317 | ParaText::Encoding in = populator.get_in_encoding(); 318 | ParaText::Encoding out = populator.get_out_encoding(); 319 | if (in == Encoding::UNKNOWN_BYTES && out == Encoding::UNKNOWN_BYTES) { 320 | string_array_output_iterator oit((PyArrayObject*)array); 321 | populator.insert_and_forget(oit); 322 | } 323 | else if (in == Encoding::UNICODE_UTF8 && out == Encoding::UNKNOWN_BYTES) { 324 | string_array_output_iterator oit((PyArrayObject*)array); 325 | populator.insert_and_forget(oit); 326 | } 327 | else if (in == 
Encoding::UNICODE_UTF8 && out == Encoding::UNICODE_UTF8) { 328 | string_array_output_iterator oit((PyArrayObject*)array); 329 | populator.insert_and_forget(oit); 330 | } 331 | else if (in == Encoding::UNKNOWN_BYTES && out == Encoding::UNICODE_UTF8) { 332 | string_array_output_iterator oit((PyArrayObject*)array); 333 | populator.insert_and_forget(oit); 334 | } 335 | else { 336 | throw std::logic_error("unknown encoding"); 337 | } 338 | } 339 | catch (...) { 340 | Py_XDECREF(array); 341 | array = NULL; 342 | std::rethrow_exception(std::current_exception()); 343 | } 344 | return array; 345 | } 346 | }; 347 | 348 | template 349 | PyObject *build_populator(const Populator &populator) { 350 | static std::unordered_map>> 351 | populators({std::make_pair(std::type_index(typeid(uint8_t)), 352 | std::make_shared>()), 353 | std::make_pair(std::type_index(typeid(int8_t)), 354 | std::make_shared>()), 355 | std::make_pair(std::type_index(typeid(uint16_t)), 356 | std::make_shared>()), 357 | std::make_pair(std::type_index(typeid(int16_t)), 358 | std::make_shared>()), 359 | std::make_pair(std::type_index(typeid(uint32_t)), 360 | std::make_shared>()), 361 | std::make_pair(std::type_index(typeid(int32_t)), 362 | std::make_shared>()), 363 | std::make_pair(std::type_index(typeid(uint64_t)), 364 | std::make_shared>()), 365 | std::make_pair(std::type_index(typeid(int64_t)), 366 | std::make_shared>()), 367 | std::make_pair(std::type_index(typeid(float)), 368 | std::make_shared>()), 369 | std::make_pair(std::type_index(typeid(double)), 370 | std::make_shared>()), 371 | std::make_pair(std::type_index(typeid(std::string)), 372 | std::make_shared>()) 373 | }); 374 | auto it = populators.find(populator.get_type_index()); 375 | if (it == populators.end()) { 376 | throw std::logic_error(std::string("cannot process type")); 377 | } 378 | return it->second->populate(populator); 379 | } 380 | 381 | template 382 | PyObject *build_array(const Container &container) { 383 | return 
(PyObject*)build_array_impl::build_array(container); 384 | } 385 | 386 | template 387 | PyObject *build_array_from_range(const std::pair &range) { 388 | return (PyObject*)build_array_from_range_impl::build_array(range); 389 | } 390 | 391 | #endif 392 | --------------------------------------------------------------------------------