├── alphatools ├── __init__.py ├── algo │ ├── __init__.py │ ├── utils.py │ └── risk.py ├── misc │ ├── __init__.py │ └── pair_trade.py ├── data │ ├── __init__.py │ ├── dat_file.pkl │ ├── sectors.npy │ ├── industries.npy │ ├── sector_names.csv │ ├── factory │ │ └── data_sources.json │ ├── factory.py │ └── industry_names.csv ├── expression │ ├── __init__.py │ ├── test.py │ ├── repro.py │ ├── expression.lark │ └── expression.py ├── fundamentals │ ├── __init__.py │ ├── make_fundamentals.py~ │ ├── make_fundamentals.py │ └── fundamentals.py ├── ics │ ├── sectors.npy │ ├── industries.npy │ ├── sic_major.npy │ ├── sic_subclass.npy │ ├── __init__.py │ ├── sic_specialize.npy │ ├── sector_names.csv │ ├── ics_scheme.py │ └── industry_names.csv ├── research │ ├── __init__.py │ └── research.py └── __main__.py ├── setup.cfg ├── MANIFEST.in ├── catboost_info ├── time_left.tsv ├── learn_error.tsv ├── learn │ └── events.out.tfevents ├── meta.tsv └── catboost_training.json ├── .DS_Store ├── ci └── test.py ├── tests ├── catboost_info │ ├── time_left.tsv │ ├── learn_error.tsv │ ├── learn │ │ └── events.out.tfevents │ ├── meta.tsv │ └── catboost_training.json ├── test_sklearn.py ├── test_catboost.py ├── test_lightgbm.py └── expressions.py ├── notebooks ├── .DS_Store ├── alpha9.png ├── Tree84.gv.pdf ├── Tree84.gv.png ├── Tree84.gv ├── pipeline-blaze-factory.ipynb ├── pipeline-minimal.ipynb ├── one_o_one_alphas.json ├── pipeline-blaze-minimal.ipynb └── model.txt ├── requirements_blaze_latest.txt ├── requirements_blaze_stable.txt ├── setup.py ├── .travis.yml ├── install_stable.sh ├── install_latest.sh ├── .gitignore ├── requirements_stable.txt ├── requirements_latest.txt ├── LICENSE └── README.md /alphatools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /alphatools/algo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /alphatools/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include data/* 2 | recursive-include data * -------------------------------------------------------------------------------- /alphatools/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import Factory 2 | -------------------------------------------------------------------------------- /catboost_info/time_left.tsv: -------------------------------------------------------------------------------- 1 | iter Passed Remaining 2 | 0 0 0 3 | 1 2 0 4 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/.DS_Store -------------------------------------------------------------------------------- /alphatools/expression/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .expression import ExpressionAlpha 2 | -------------------------------------------------------------------------------- /alphatools/fundamentals/__init__.py: -------------------------------------------------------------------------------- 1 | from .fundamentals import Fundamentals 2 | -------------------------------------------------------------------------------- /ci/test.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | print('Success!') 3 | 4 | -------------------------------------------------------------------------------- /catboost_info/learn_error.tsv: -------------------------------------------------------------------------------- 1 | iter RMSE 2 | 0 15.47309493 3 | 1 11.44676992 4 | -------------------------------------------------------------------------------- /tests/catboost_info/time_left.tsv: -------------------------------------------------------------------------------- 1 | iter Passed Remaining 2 | 0 57 57 3 | 1 59 0 4 | -------------------------------------------------------------------------------- /tests/catboost_info/learn_error.tsv: -------------------------------------------------------------------------------- 1 | iter RMSE 2 | 0 15.47309493 3 | 1 11.44676992 4 | -------------------------------------------------------------------------------- /notebooks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/notebooks/.DS_Store -------------------------------------------------------------------------------- /notebooks/alpha9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/notebooks/alpha9.png -------------------------------------------------------------------------------- /alphatools/expression/test.py: -------------------------------------------------------------------------------- 1 | class MyClass(): 2 | y = 2 3 | def my_func(self): 4 | x = 2 5 | return(x) 6 | -------------------------------------------------------------------------------- /notebooks/Tree84.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/notebooks/Tree84.gv.pdf -------------------------------------------------------------------------------- /notebooks/Tree84.gv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/notebooks/Tree84.gv.png -------------------------------------------------------------------------------- /alphatools/ics/sectors.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/ics/sectors.npy -------------------------------------------------------------------------------- /alphatools/data/dat_file.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/data/dat_file.pkl -------------------------------------------------------------------------------- /alphatools/data/sectors.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/data/sectors.npy -------------------------------------------------------------------------------- /alphatools/ics/industries.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/ics/industries.npy -------------------------------------------------------------------------------- /alphatools/ics/sic_major.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/ics/sic_major.npy -------------------------------------------------------------------------------- /alphatools/data/industries.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/data/industries.npy -------------------------------------------------------------------------------- /alphatools/ics/sic_subclass.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/ics/sic_subclass.npy -------------------------------------------------------------------------------- /alphatools/ics/__init__.py: -------------------------------------------------------------------------------- 1 | from .ics_scheme import ( 2 | SICMajorIndustry, 3 | Sector, 4 | SubIndustry 5 | ) 6 | -------------------------------------------------------------------------------- /alphatools/ics/sic_specialize.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/alphatools/ics/sic_specialize.npy -------------------------------------------------------------------------------- /catboost_info/learn/events.out.tfevents: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/catboost_info/learn/events.out.tfevents -------------------------------------------------------------------------------- /tests/catboost_info/learn/events.out.tfevents: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefan-jansen/alphatools/HEAD/tests/catboost_info/learn/events.out.tfevents -------------------------------------------------------------------------------- /catboost_info/meta.tsv: -------------------------------------------------------------------------------- 1 | name experiment 2 | iterCount 2 3 | learnErrorLog learn_error.tsv 4 | testErrorLog test_error.tsv 5 | timeLeft time_left.tsv 6 | loss RMSE min 7 | -------------------------------------------------------------------------------- /tests/catboost_info/meta.tsv: -------------------------------------------------------------------------------- 1 | name experiment 2 | iterCount 2 3 | learnErrorLog learn_error.tsv 4 | testErrorLog test_error.tsv 5 | timeLeft time_left.tsv 6 | loss RMSE min 7 | -------------------------------------------------------------------------------- /alphatools/ics/sector_names.csv: -------------------------------------------------------------------------------- 1 | 0,Healthcare 2 | 1,Basic Materials 3 | 2,Services 4 | 3,Financial 5 | 4,Technology 6 | 5,Industrial Goods 7 | 6,Consumer Goods 8 | 7,Utilities 9 | 8,Conglomerates 10 | 
-------------------------------------------------------------------------------- /alphatools/data/sector_names.csv: -------------------------------------------------------------------------------- 1 | 0,Healthcare 2 | 1,Basic Materials 3 | 2,Services 4 | 3,Financial 5 | 4,Technology 6 | 5,Industrial Goods 7 | 6,Consumer Goods 8 | 7,Utilities 9 | 8,Conglomerates 10 | -------------------------------------------------------------------------------- /alphatools/data/factory/data_sources.json: -------------------------------------------------------------------------------- 1 | { 2 | "sample": { 3 | "url": "/Users/jonathan/devwork/alphatools/alphatools/data/factory/sample.csv", 4 | "schema": "var*{asof_date: datetime, sid: int64, value: float64}" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /alphatools/research/__init__.py: -------------------------------------------------------------------------------- 1 | from .research import ( 2 | loaders, 3 | blaze_loader, 4 | run_pipeline, 5 | get_pricing, 6 | get_symbols, 7 | make_factor_plot, 8 | make_quantile_plot, 9 | set_bundle 10 | ) 11 | -------------------------------------------------------------------------------- /alphatools/algo/utils.py: -------------------------------------------------------------------------------- 1 | from logbook import Logger, StderrHandler, DEBUG, INFO 2 | 3 | log_handler = StderrHandler( 4 | format_string='[{record.time:%Y-%m-%d %H:%M:%S.%f}]: ' + 5 | '{record.level_name}: {record.func_name}: {record.message}', 6 | level=INFO 7 | ) 8 | log_handler.push_application() 9 | log = Logger('Algorithm') 10 | -------------------------------------------------------------------------------- /catboost_info/catboost_training.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta":{"launch_mode":"Train","name":"experiment","iteration_count":2,"learn_metrics":[{"best_value":"Min","name":"RMSE"}],"test_sets":[],"test_metrics":[],"learn_sets":["learn"]}, 3 | "iterations":[ 4 | {"learn":[15.47309493],"iteration":0,"passed_time":0.0009688483709,"remaining_time":0.0009688483709}, 5 | {"learn":[11.44676992],"iteration":1,"passed_time":0.002063907508,"remaining_time":0} 6 | ]} -------------------------------------------------------------------------------- /tests/catboost_info/catboost_training.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta":{"launch_mode":"Train","name":"experiment","iteration_count":2,"learn_metrics":[{"best_value":"Min","name":"RMSE"}],"test_sets":[],"test_metrics":[],"learn_sets":["learn"]}, 3 | "iterations":[ 4 | {"learn":[15.47309493],"iteration":0,"passed_time":0.05778349878,"remaining_time":0.05778349878}, 5 | {"learn":[11.44676992],"iteration":1,"passed_time":0.05900830356,"remaining_time":0} 6 | ]} -------------------------------------------------------------------------------- /tests/test_sklearn.py: -------------------------------------------------------------------------------- 1 | from sklearn import tree 2 | from sklearn.datasets import load_iris 3 | from sklearn.metrics import accuracy_score 4 | from sklearn.model_selection import train_test_split 5 | import numpy as np 6 | 7 | def test_tree(): 8 | iris = load_iris() 9 | X = iris.data 10 | y = iris.target 11 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 12 | 13 | clf = tree.DecisionTreeClassifier() 14 | clf.fit(X_train, y_train) 15 | preds = clf.predict(X_test) 16 | assert 
np.allclose(accuracy_score(y_test, preds), 0.973684210526) 17 | -------------------------------------------------------------------------------- /requirements_blaze_latest.txt: -------------------------------------------------------------------------------- 1 | -e git://github.com/quantopian/datashape.git@bf06a41dc0908baf7c324aeacadba8820468ee78#egg=datashape-dev 2 | -e git://github.com/quantopian/odo.git@da7f26d87702f5d293763e8ed54c7e25fd3af38#egg=odo-dev 3 | 4 | # Keep cytoolz version in sync with toolz version in requirements.txt 5 | cytoolz==0.8.2 6 | 7 | # Transitive dependencies of blaze: 8 | dask[dataframe]==0.17.1 9 | partd==0.3.7 10 | locket==0.2.0 11 | cloudpickle==0.2.1 12 | itsdangerous==0.24 13 | flask==0.12.4 14 | flask-cors==2.1.3 15 | Jinja2==2.9.6 16 | MarkupSafe==0.23 17 | Werkzeug==0.10.4 18 | psutil==4.3.0 19 | 20 | -e git://github.com/quantopian/blaze.git@310605323449e375e81a0cf04011c507cd126ef6#egg=blaze-dev 21 | -------------------------------------------------------------------------------- /requirements_blaze_stable.txt: -------------------------------------------------------------------------------- 1 | -e git://github.com/quantopian/datashape.git@bf06a41dc0908baf7c324aeacadba8820468ee78#egg=datashape-dev 2 | -e git://github.com/quantopian/odo.git@da7f26d87702f5d293763e8ed54c7e25fd3af38#egg=odo-dev 3 | 4 | # Keep cytoolz version in sync with toolz version in requirements.txt 5 | cytoolz==0.8.2 6 | 7 | # Transitive dependencies of blaze: 8 | dask[dataframe]==0.13.0 9 | partd==0.3.7 10 | locket==0.2.0 11 | cloudpickle==0.2.1 12 | itsdangerous==0.24 13 | flask==0.12.4 14 | flask-cors==2.1.3 15 | Jinja2==2.9.6 16 | MarkupSafe==0.23 17 | Werkzeug==0.10.4 18 | psutil==4.3.0 19 | 20 | -e git://github.com/quantopian/blaze.git@310605323449e375e81a0cf04011c507cd126ef6#egg=blaze-dev 21 | -------------------------------------------------------------------------------- /alphatools/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from alphatools.ics.ics_scheme import make_sector_classifier 3 | 4 | import click 5 | import subprocess 6 | import sys 7 | 8 | from os import path 9 | import zipline 10 | 11 | this_path = path.dirname(__file__) 12 | 13 | @click.group() 14 | def main(): 15 | pass 16 | 17 | @main.command() 18 | def get_blaze(): 19 | req = path.join(this_path, 'misc/requirements_blaze.txt') 20 | print(req) 21 | subprocess.call([sys.executable, "-m", "pip", "install", "-r", req]) 22 | 23 | @main.command() 24 | def ingest(): 25 | print('mapping sectors and industries...') 26 | make_sector_classifier() 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='alphatools', 5 | version='0.15', 6 | description='Quant finance research tools', 7 | author='Jonathan Larkin', 8 | author_email='jonathan.r.larkin@gmail.com', 9 | url = "https://github.com/marketneutral/alphatools", 10 | download_url = "https://github.com/marketneutral/alphatools/archive/0.11.tar.gz", 11 | packages=find_packages(), 12 | python_requires='>=3.5', 13 | install_requires=[ 14 | 'zipline<=1.3', 15 | 'alphalens', 16 | 'ipykernel', 17 | 'lark-parser', 18 | 'autopep8', 19 | 'bottleneck', 20 | 'tqdm',
'pydot' 22 | ], 23 | entry_points={ 24 | 'console_scripts': [ 25 | 'alphatools = alphatools.__main__:main', 26 | ] 27 | } 28 | ) 29 | -------------------------------------------------------------------------------- /tests/test_catboost.py: -------------------------------------------------------------------------------- 1 | from catboost import CatBoostRegressor 2 | import numpy as np 3 | 4 | def test_catboost(): 5 | # Initialize data 6 | cat_features = [0, 1, 2] 7 | train_data = [ 8 | ["a", "b", 1, 4, 5, 6], 9 | ["a", "b", 4, 5, 6, 7], 10 | ["c", "d", 30, 40, 50, 60] 11 | ] 12 | test_data = [ 13 | ["a", "b", 2, 4, 6, 8], 14 | ["a", "d", 1, 4, 50, 60] 15 | ] 16 | train_labels = [10, 20, 30] 17 | # Initialize CatBoostRegressor 18 | model = CatBoostRegressor( 19 | iterations=2, 20 | learning_rate=1, 21 | depth=2, 22 | random_seed=100 23 | ) 24 | # Fit model 25 | model.fit(train_data, train_labels, cat_features) 26 | # Get predictions 27 | preds = model.predict(test_data) 28 | print(preds) 29 | # compare with a tolerance; float predictions can differ in the last bits across platforms 30 | assert np.allclose(preds, np.array([9.6, 9.6])) 31 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | 4 | python: 5 | - 3.5 6 | - 3.6 7 | 8 | before_install: 9 | - export PYTHON_VERSION=$TRAVIS_PYTHON_VERSION 10 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 11 | - bash miniconda.sh -b -p $HOME/miniconda 12 | - export PATH="$HOME/miniconda/bin:$PATH" 13 | - export MKL_THREADING_LAYER=GNU 14 | - hash -r 15 | - conda config --set always_yes yes --set changeps1 no 16 | - conda update -q conda 17 | - conda info -a 18 | 19 | install: 20 | - if [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then 21 | ./install_stable.sh; source activate env_alphatools_stable; 22 | else 23 | ./install_latest.sh; source activate env_alphatools_latest; 24 | fi 25 | - zipline ingest -b quantopian-quandl 26 | - pip install nose 27 | # - alphatools ingest 28 | 29 | script: 30 | - python ./ci/test.py 31 | - cd tests 32 | - nosetests -v -------------------------------------------------------------------------------- /notebooks/Tree84.gv: -------------------------------------------------------------------------------- 1 | digraph Tree84 { 2 | split0 [label="split_feature_name: f19\nthreshold: -2.0784999999999996"] 3 | leaf0 [label="leaf_index: 0\nleaf_value: 0.008256563487728793"] 4 | split0 -> leaf0 [label="<="] 5 | split1 [label="split_feature_name: f19\nthreshold: 0.025500000000000005"] 6 | split2 [label="split_feature_name: f2\nthreshold: -2.2624999999999997"] 7 | leaf1 [label="leaf_index: 1\nleaf_value: 0.028751267421813238"] 8 | split2 -> leaf1 [label="<="] 9 | split3 [label="split_feature_name: f25\nthreshold: 1.2685000000000002"] 10 | leaf3 [label="leaf_index: 3\nleaf_value: -0.0027176377740919895"] 11 | split3 -> leaf3 [label="<="] 12 | leaf4 [label="leaf_index: 4\nleaf_value: 0.0032094788377135734"] 13 | split3 -> leaf4 [label=">"] 14 | split2 -> split3 [label=">"] 15 | split1 -> split2 [label="<="] 16 | leaf2 [label="leaf_index: 2\nleaf_value: 0.0012024763815209393"] 17 | split1 -> leaf2 [label=">"] 18 | split0 -> split1 [label=">"] 19 | } 20 | -------------------------------------------------------------------------------- /alphatools/data/factory.py: -------------------------------------------------------------------------------- 1 | import blaze as bz 2 | import json 3 | from datashape import dshape 4 |
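# Build a zipline pipeline DataSet for each entry in factory/data_sources.json; every entry supplies a blaze-readable "url" (environment variables are expanded below) and a datashape "schema" string; see the bundled "sample" source for the expected format.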
from alphatools.research import loaders, blaze_loader 6 | from zipline.data import bundles 7 | from zipline.utils.calendars import get_calendar 8 | from zipline.pipeline.loaders.blaze import from_blaze 9 | 10 | from os import path 11 | 12 | this_file = path.dirname(__file__) 13 | 14 | with open(path.join(this_file, 'factory/data_sources.json')) as f: 15 | data_sources = json.load(f) 16 | 17 | Factory = {} 18 | 19 | for source in data_sources.keys(): 20 | loc = data_sources[source]['url'] 21 | shape = dshape(data_sources[source]['schema']) 22 | 23 | loc = path.expandvars(loc) 24 | 25 | expr = bz.data( 26 | loc, 27 | dshape=shape 28 | ) 29 | 30 | # create the DataSet and register it under the source name 31 | ds = from_blaze( 32 | expr, 33 | no_deltas_rule='ignore', 34 | no_checkpoints_rule='ignore', 35 | loader=blaze_loader 36 | ) 37 | Factory[source] = ds 38 | -------------------------------------------------------------------------------- /alphatools/fundamentals/make_fundamentals.py~: -------------------------------------------------------------------------------- 1 | from zipline.data import bundles 2 | from zipline.pipeline import Pipeline 3 | from zipline.pipeline.data import USEquityPricing 4 | from zipline.pipeline.data import Column 5 | from zipline.pipeline.data import DataSet 6 | from zipline.pipeline.engine import SimplePipelineEngine 7 | from zipline.pipeline.filters import StaticAssets 8 | from zipline.pipeline.loaders import USEquityPricingLoader 9 | from zipline.pipeline.loaders.frame import DataFrameLoader 10 | from zipline.utils.calendars import get_calendar 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | trading_calendar = get_calendar('NYSE') 16 | bundle_data = bundles.load('quandl') 17 | 18 | data_file = '/Users/jonathan/finnd/workspace_data/marketcap_pb_ps_pe_sector_s&p_comp-sharadar/data.csv' 19 | 20 | df = pd.read_csv(data_file)#, nrows=1000) 21 | df['Date'] = pd.to_datetime(df['Date']) 22 | 23 | df['sid'] = np.nan 24 | df = df.set_index('Date') 25 | 26 | df.index = df.index.tz_localize('UTC') 27 | 28 | dates = df.index.unique() 29 | 30 | for day in dates: 31 | file_tickers = df.loc[day]['Ticker'] 32 | sids = [] 33 | for ticker in file_tickers: 34 | try: 35 | this_ticker = bundle_data.asset_finder.lookup_symbol(ticker, as_of_date=day) 36 | this_sid = this_ticker.sid 37 | except Exception: 38 | this_sid = np.nan 39 | sids.append(this_sid) 40 | df.loc[day, 'sid'] = sids 41 | 42 | df.sid = df.sid.astype(float) 43 | df = df.dropna() 44 | df.sid = df.sid.astype(int) 45 | -------------------------------------------------------------------------------- /install_stable.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | conda create -n env_alphatools_stable -y -c conda-forge python=3.5.5 numpy=1.11.3 pandas=0.18.1 scipy=0.17.1 libgfortran=3.0 mkl-service pymc3=3.1 lightgbm=2.2.0 scikit-optimize=0.5.2 scikit-learn lapack catboost pip 4 | source activate env_alphatools_stable 5 | python -m pip install -r requirements_stable.txt --no-cache-dir 6 | python -m pip install -r requirements_blaze_stable.txt --no-cache-dir 7 | pip install cvxpy==0.4.10 --no-cache-dir --no-binary :all: 8 | pip install zipline==1.3.0 --no-cache-dir 9 | pip install ipykernel --no-cache-dir 10 | pip install alphalens==0.3.2 --no-cache-dir 11 | pip install pyfolio --no-cache-dir 12 | pip install graphviz==0.9 13 | cd ..
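# NB: assumes the repository was cloned into a directory literally named "alphatools"; after the cd above, the editable install below points at that directory.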
14 | pip install -e alphatools --no-cache-dir 15 | python -m ipykernel install --user --name env_alphatools_stable --display-name "Python 3.5 (env_alphatools_stable)" 16 | conda install -y pytorch=0.4.1 torchvision -c pytorch 17 | 18 | source ~/.bashrc 19 | # must append to .bashrc 20 | if [ "$MKL_THREADING_LAYER" = "" ] 21 | then 22 | export MKL_THREADING_LAYER=GNU 23 | echo 'export MKL_THREADING_LAYER=GNU' >> ~/.bashrc 24 | echo 'Please source the .bashrc file to activate MKL_THREADING env variable.' 25 | fi 26 | 27 | if [ "$THEANO_FLAGS" = "" ] 28 | then 29 | # Needed for Mac OS X 30 | export "THEANO_FLAGS='gcc.cxxflags=-Wno-c++11-narrowing'" 31 | echo "export \"THEANO_FLAGS='gcc.cxxflags=-Wno-c++11-narrowing'\"" >> ~/.bashrc 32 | echo 'Please source the .bashrc file to activate the THEANO_FLAGS env variable.' 33 | fi 34 | -------------------------------------------------------------------------------- /alphatools/expression/repro.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import itertools 3 | from lark import Lark, Transformer 4 | 5 | grammar = r""" 6 | 7 | ?value: "(" value ")" 8 | | mylocalvar 9 | | add 10 | | SIGNED_NUMBER -> number 11 | 12 | mylocalvar: "mylocalvar" 13 | add: value "+" value 14 | number: SIGNED_NUMBER 15 | 16 | %import common.SIGNED_NUMBER 17 | %import common.WS 18 | %ignore WS 19 | 20 | """ 21 | 22 | stack = [] 23 | 24 | class MyTransformer(Transformer): 25 | vcounter = itertools.count() 26 | 27 | def __init__(self): 28 | stack = [] 29 | self.cmdlist = [] 30 | 31 | def number(self, items): 32 | stack.append(items[0].value) 33 | 34 | def mylocalvar(self, items): 35 | stack.append('mylocalvar') 36 | 37 | def add(self, items): 38 | term2 = stack.pop() 39 | term1 = stack.pop() 40 | thisv = next(self.vcounter) 41 | stack.append('v' + str(thisv)) 42 | self.cmdlist.append( 43 | 'v' + str(thisv) + ' = ' + term1 + ' + ' + term2 44 | ) 45 | 46 | def transform(self, tree): 47 | self._transform_tree(tree) 48 | v1 = stack.pop() 49 | self.cmdlist.append( 50 | 'out[:] = ' + v1 + '[-1]' 51 | ) 52 | 53 | return self.cmdlist 54 | 55 | 56 | my_parser = Lark(grammar, start='value') 57 | 58 | 59 | text = "mylocalvar + mylocalvar" 60 | text = "mylocalvar + 2.5" 61 | text = "2 + 2" 62 | tree = my_parser.parse(text) 63 | npcmds = MyTransformer().transform(tree) 64 | 65 | 66 | -------------------------------------------------------------------------------- /install_latest.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | conda create -n env_alphatools_latest -y -c conda-forge python=$PYTHON_VERSION numpy=1.14.1 pandas=0.22.0 scipy=1.0.0 libgfortran=3.0 mkl-service pymc3=3.5 lightgbm=2.2.0 scikit-optimize=0.5.2 scikit-learn catboost pip 4 | source activate env_alphatools_latest 5 | python -m pip install -r requirements_latest.txt --no-cache-dir 6 | python -m pip install -r requirements_blaze_latest.txt --no-cache-dir 7 | pip install cvxpy==1.0.9 --no-cache-dir 8 | pip install zipline==1.3.0 --no-cache-dir 9 | pip install statsmodels==0.9.0 --upgrade --no-cache-dir 10 | pip install alphalens==0.3.2 --no-cache-dir 11 | pip install pyfolio --no-cache-dir 12 | pip install ipykernel --no-cache-dir 13 | pip install graphviz==0.9 14 | conda install -y -c pytorch pytorch-nightly-cpu 15 | conda install -y -c fastai torchvision-nightly-cpu 16 | conda install -y -c fastai fastai 17 | cd ..
18 | pip install -e alphatools --no-cache-dir 19 | python -m ipykernel install --user --name env_alphatools_latest --display-name "env_alphatools_latest" 20 | 21 | source ~/.bashrc 22 | # must append to .bashrc 23 | if [ "$MKL_THREADING_LAYER" = "" ] 24 | then 25 | export MKL_THREADING_LAYER=GNU 26 | echo 'export MKL_THREADING_LAYER=GNU' >> ~/.bashrc 27 | echo 'Please source the .bashrc file to activate MKL_THREADING env variable.' 28 | fi 29 | 30 | if [ "$THEANO_FLAGS" = "" ] 31 | then 32 | # Needed for Mac OS X 33 | export "THEANO_FLAGS='gcc.cxxflags=-Wno-c++11-narrowing'" 34 | echo "export \"THEANO_FLAGS='gcc.cxxflags=-Wno-c++11-narrowing'\"" >> ~/.bashrc 35 | echo 'Please source the .bashrc file to activate the THEANO_FLAGS env variable.' 36 | fi 37 | -------------------------------------------------------------------------------- /alphatools/fundamentals/make_fundamentals.py: -------------------------------------------------------------------------------- 1 | from zipline.data import bundles 2 | from zipline.pipeline import Pipeline 3 | from zipline.pipeline.data import USEquityPricing 4 | from zipline.pipeline.data import Column 5 | from zipline.pipeline.data import DataSet 6 | from zipline.pipeline.engine import SimplePipelineEngine 7 | from zipline.pipeline.filters import StaticAssets 8 | from zipline.pipeline.loaders import USEquityPricingLoader 9 | from zipline.pipeline.loaders.frame import DataFrameLoader 10 | from zipline.utils.calendars import get_calendar 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | trading_calendar = get_calendar('NYSE') 16 | bundle_data = bundles.load('quandl') 17 | 18 | data_file = 'path/to/sharadar/data' 19 | 20 | df = pd.read_csv(data_file)#, nrows=1000) 21 | df['Date'] = pd.to_datetime(df['Date']) 22 | 23 | df['sid'] = np.nan 24 | df = df.set_index('Date') 25 | 26 | df.index = df.index.tz_localize('UTC') 27 | 28 | dates = df.index.unique() 29 | 30 | for day in dates: 31 | file_tickers = df.loc[day]['Ticker'] 32 | sids = [] 33 | for ticker in file_tickers: 34 | try: 35 | this_ticker = bundle_data.asset_finder.lookup_symbol(ticker, as_of_date=day) 36 | this_sid = this_ticker.sid 37 | except Exception: 38 | this_sid = np.nan 39 | sids.append(this_sid) 40 | df.loc[day, 'sid'] = sids 41 | 42 | df.sid = df.sid.astype(float) 43 | df = df.dropna() 44 | df.sid = df.sid.astype(int) 45 | 46 | 47 | df['MarketCap'] = pd.to_numeric(df['MarketCap'], errors='coerce') 48 | df['P/B'] = pd.to_numeric(df['P/B'], errors='coerce') 49 | df['P/S'] = pd.to_numeric(df['P/S'], errors='coerce') 50 | df['P/E'] = pd.to_numeric(df['P/E'], errors='coerce') 51 | 52 | # save the df 53 | df.to_pickle('sharadar_with_sid.pkl') 54 | -------------------------------------------------------------------------------- /alphatools/fundamentals/fundamentals.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from alphatools.research import loaders 3 | from zipline.pipeline.data import Column 4 | from zipline.pipeline.data import DataSet 5 | from zipline.pipeline.loaders.frame import DataFrameLoader 6 | 7 | from os import path 8 | myfile_path = path.join(path.dirname(__file__), 'myfile.txt') 9 | 10 | df = pd.read_pickle(path.join(path.dirname(__file__), 'sharadar_with_sid.pkl')) 11 | 12 | MarketCap_frame = ( 13 | df[['MarketCap', 'sid']]. 14 | reset_index().set_index(['Date', 'sid']).
15 | unstack() 16 | ) 17 | 18 | MarketCap_frame.columns = MarketCap_frame.columns.droplevel() 19 | 20 | PriceToBook_frame = df[['P/B', 'sid']].reset_index().set_index(['Date', 'sid']).unstack() 21 | PriceToBook_frame.columns = PriceToBook_frame.columns.droplevel() 22 | 23 | PriceToSales_frame = df[['P/S', 'sid']].reset_index().set_index(['Date', 'sid']).unstack() 24 | PriceToSales_frame.columns = PriceToSales_frame.columns.droplevel() 25 | 26 | PriceToEarnings_frame = df[['P/E', 'sid']].reset_index().set_index(['Date', 'sid']).unstack() 27 | PriceToEarnings_frame.columns = PriceToEarnings_frame.columns.droplevel() 28 | 29 | class Fundamentals(DataSet): 30 | MarketCap = Column(dtype=float) 31 | PriceToBook = Column(dtype=float) 32 | PriceToSales = Column(dtype=float) 33 | PriceToEarnings = Column(dtype=float) 34 | 35 | # register the loaders 36 | loaders[Fundamentals.MarketCap] = DataFrameLoader(Fundamentals.MarketCap, MarketCap_frame) 37 | loaders[Fundamentals.PriceToBook] = DataFrameLoader(Fundamentals.PriceToBook, PriceToBook_frame) 38 | loaders[Fundamentals.PriceToSales] = DataFrameLoader(Fundamentals.PriceToSales, PriceToSales_frame) 39 | loaders[Fundamentals.PriceToEarnings] = DataFrameLoader(Fundamentals.PriceToEarnings, PriceToEarnings_frame) 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # other 107 | *.pkl 108 | *.pickle 109 | *.*~ 110 | *~ 111 | *.db 112 | src 113 | alpha*.png 114 | .DS_Store -------------------------------------------------------------------------------- /requirements_stable.txt: -------------------------------------------------------------------------------- 1 | # Incompatible with earlier PIP versions 2 | pip>=7.1.0 3 | # bcolz fails to install if this is not in the build_requires. 4 | setuptools>18.0 5 | 6 | # Logging 7 | Logbook==0.12.5 8 | 9 | # Scientific Libraries 10 | 11 | pytz==2016.4 12 | numpy==1.11.3 13 | 14 | # for pandas-datareader 15 | requests-file==1.4.1 16 | 17 | # scipy and pandas are required for statsmodels, 18 | # statsmodels in turn is required for some pandas packages 19 | scipy==0.17.1 20 | pandas==0.18.1 21 | pandas-datareader==0.2.1 22 | # Needed for parts of pandas.stats 23 | patsy==0.4.0 24 | statsmodels==0.6.1 25 | 26 | python-dateutil==2.4.2 27 | six==1.10.0 28 | 29 | # For fetching remote data 30 | requests==2.9.1 31 | 32 | Cython==0.25.2 33 | 34 | # faster OrderedDict 35 | cyordereddict==0.2.2 36 | 37 | # faster array ops. 38 | bottleneck==1.0.0 39 | 40 | contextlib2==0.4.0 41 | 42 | # networkx requires decorator 43 | decorator==4.0.0 44 | 45 | # Graph algorithms used by zipline.pipeline 46 | networkx==1.9.1 47 | 48 | # NumericalExpression pipeline terms. 49 | numexpr==2.6.1 50 | 51 | # On disk storage format for pipeline data. 52 | bcolz==0.12.1 53 | 54 | # On disk storage format for pricing data. 55 | h5py==2.7.1 56 | 57 | # Command line interface helper 58 | click==4.0.0 59 | 60 | # FUNctional programming utilities 61 | toolz==0.8.2 62 | multipledispatch==0.6.0 63 | 64 | # for alembic 65 | MarkupSafe==0.23 66 | Mako==1.0.1 67 | # Asset writer and finder 68 | sqlalchemy==1.0.8 69 | # For asset db management 70 | alembic==0.7.7 71 | 72 | sortedcontainers==1.4.4 73 | # for intervaltree 74 | intervaltree==2.1.0 75 | 76 | # for caching 77 | lru-dict==1.1.4 78 | 79 | # For financial risk calculations 80 | empyrical==0.5.0 81 | 82 | tables==3.4.3 83 | 84 | # For trading calendars 85 | trading-calendars==1.2.0 86 | 87 | # Interface definitions.
88 | python-interface==1.4.0 89 | 90 | # Country Codes 91 | iso3166==0.9 92 | -------------------------------------------------------------------------------- /requirements_latest.txt: -------------------------------------------------------------------------------- 1 | # Incompatible with earlier PIP versions 2 | pip>=7.1.0 3 | # bcolz fails to install if this is not in the build_requires. 4 | setuptools>18.0 5 | 6 | # Logging 7 | Logbook==0.12.5 8 | 9 | # Scientific Libraries 10 | 11 | pytz==2016.4 12 | numpy==1.14.1 13 | 14 | # for pandas-datareader 15 | requests-file==1.4.1 16 | 17 | # scipy and pandas are required for statsmodels, 18 | # statsmodels in turn is required for some pandas packages 19 | scipy==1.0.0 20 | pandas==0.22.0 21 | pandas-datareader==0.4.0 22 | # Needed for parts of pandas.stats 23 | patsy==0.4.0 24 | 25 | # Q has statsmodels==0.6.1; I think this is incompatible with scipy==1.0.0 26 | statsmodels==0.9.0 27 | 28 | python-dateutil==2.4.2 29 | six==1.10.0 30 | 31 | # For fetching remote data 32 | requests==2.9.1 33 | 34 | Cython==0.25.2 35 | 36 | # faster OrderedDict 37 | cyordereddict==0.2.2 38 | 39 | # faster array ops. 40 | bottleneck==1.0.0 41 | 42 | contextlib2==0.4.0 43 | 44 | # networkx requires decorator 45 | decorator==4.0.0 46 | 47 | # Graph algorithms used by zipline.pipeline 48 | networkx==1.9.1 49 | 50 | # NumericalExpression pipeline terms. 51 | numexpr==2.6.1 52 | 53 | # On disk storage format for pipeline data. 54 | bcolz==0.12.1 55 | 56 | # On disk storage format for pricing data. 57 | h5py==2.7.1 58 | 59 | # Command line interface helper 60 | click==4.0.0 61 | 62 | # FUNctional programming utilities 63 | toolz==0.8.2 64 | multipledispatch==0.6.0 65 | 66 | # for alembic 67 | MarkupSafe==0.23 68 | Mako==1.0.1 69 | # Asset writer and finder 70 | sqlalchemy==1.0.8 71 | # For asset db management 72 | alembic==0.7.7 73 | 74 | sortedcontainers==1.4.4 75 | # for intervaltree 76 | intervaltree==2.1.0 77 | 78 | # for caching 79 | lru-dict==1.1.4 80 | 81 | # For financial risk calculations 82 | empyrical==0.5.0 83 | 84 | tables==3.4.3 85 | 86 | # For trading calendars 87 | trading-calendars==1.2.0 88 | 89 | # Interface definitions. 90 | python-interface==1.4.0 91 | 92 | # Country Codes 93 | iso3166==0.9 94 | -------------------------------------------------------------------------------- /alphatools/algo/risk.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def L1_risk(wgts, returns): 5 | """ 6 | Returns the "L1" risk as per Konno & Yamazaki; aka the mean-absolute- 7 | deviation. 8 | """ 9 | rets = returns.fillna(0.0).as_matrix() 10 | return np.mean(np.abs(rets.dot(wgts))) 11 | 12 | def value_at_risk( 13 | weights, 14 | returns, 15 | alpha=0.95): 16 | """ 17 | Returns the historical simulation VaR at the confidence threshold. 18 | """ 19 | returns = returns.fillna(0.0).as_matrix() 20 | portfolio_returns = returns.dot(weights) 21 | return np.percentile(portfolio_returns, 100 * (1-alpha)) 22 | 23 | def expected_shortfall( 24 | weights, 25 | returns, 26 | alpha=0.95): 27 | """ 28 | Returns the historical simulation CVaR at the confidence threshold. 
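Computed as the mean of the portfolio returns that fall below the VaR cutoff at the same confidence level.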
29 | """ 30 | 31 | var = value_at_risk(weights, returns, alpha) 32 | returns = returns.fillna(0.0) 33 | portfolio_returns = returns.dot(weights) 34 | return np.nanmean(portfolio_returns[portfolio_returns < var]) 35 | 36 | 37 | def calc_portfolio_risk( 38 | context, 39 | data, 40 | risk_func, 41 | hist_days=180, 42 | **kwargs): 43 | """ 44 | This is a helper function designed to be the primary call in an algo 45 | for calculating portfolio-level risk. It takes the current context and 46 | data objects and a `risk_func` (e.g., `value_at_risk`), and formats 47 | portfolio weights and returns such that the indicies line up in a numpy 48 | array. 49 | """ 50 | 51 | 52 | positions = context.portfolio.positions 53 | positions_index = pd.Index(positions) 54 | share_counts = pd.Series( 55 | index=positions_index, 56 | data=[positions[asset].amount for asset in positions] 57 | ) 58 | 59 | current_prices = data.current(positions_index, 'price') 60 | current_weights = ( 61 | share_counts * current_prices / context.portfolio.portfolio_value 62 | ) 63 | 64 | prices = data.history( 65 | current_weights.index.tolist(), 66 | 'price', 67 | hist_days, 68 | '1d' 69 | ) 70 | 71 | daily_rets = prices.pct_change() 72 | daily_rets = daily_rets - daily_rets.mean(skipna=True) 73 | daily_rets = daily_rets.fillna(0.0) 74 | 75 | risk = risk_func(current_weights.values, daily_rets, **kwargs) 76 | return risk 77 | -------------------------------------------------------------------------------- /tests/test_lightgbm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import lightgbm as lgb 5 | from sklearn.datasets import load_breast_cancer, dump_svmlight_file, load_svmlight_file 6 | import numpy as np 7 | from sklearn.model_selection import train_test_split 8 | 9 | 10 | def test_lgb(): 11 | X_train, X_test, y_train, y_test = train_test_split( 12 | *load_breast_cancer(True), test_size=0.1, random_state=2 13 | ) 14 | train_data = lgb.Dataset(X_train, label=y_train) 15 | valid_data = train_data.create_valid(X_test, label=y_test) 16 | 17 | params = { 18 | "objective": "binary", 19 | "metric": "auc", 20 | "min_data": 10, 21 | "num_leaves": 15, 22 | "verbose": -1, 23 | "num_threads": 1, 24 | "max_bin": 255 25 | } 26 | bst = lgb.Booster(params, train_data) 27 | bst.add_valid(valid_data, "valid_1") 28 | 29 | for i in range(30): 30 | bst.update() 31 | if i % 10 == 0: 32 | print(bst.eval_train(), bst.eval_valid()) 33 | 34 | assert bst.current_iteration() == 30 35 | assert bst.num_trees() == 30 36 | assert bst.num_model_per_iteration() == 1 37 | 38 | bst.save_model("model.txt") 39 | pred_from_matr = bst.predict(X_test) 40 | with tempfile.NamedTemporaryFile() as f: 41 | tname = f.name 42 | with open(tname, "w+b") as f: 43 | dump_svmlight_file(X_test, y_test, f) 44 | pred_from_file = bst.predict(tname) 45 | os.remove(tname) 46 | assert len(pred_from_matr) == len(pred_from_file) 47 | for preds in zip(pred_from_matr, pred_from_file): 48 | assert np.allclose(*preds) 49 | 50 | # check saved model persistence 51 | bst = lgb.Booster(params, model_file="model.txt") 52 | pred_from_model_file = bst.predict(X_test) 53 | assert len(pred_from_matr) == len(pred_from_model_file) 54 | for preds in zip(pred_from_matr, pred_from_model_file): 55 | # we need to check the consistency of model file here, so test for exact equal 56 | np.equal(*preds) 57 | 58 | # check early stopping is working. 
Make it stop very early, so the scores should be very close to zero 59 | pred_parameter = { 60 | "pred_early_stop": True, 61 | "pred_early_stop_freq": 5, 62 | "pred_early_stop_margin": 1.5 63 | } 64 | pred_early_stopping = bst.predict(X_test, **pred_parameter) 65 | assert len(pred_from_matr) == len(pred_early_stopping) 66 | for preds in zip(pred_early_stopping, pred_from_matr): 67 | # scores likely to be different, but prediction should still be the same 68 | assert (preds[0] > 0) == (preds[1] > 0) 69 | -------------------------------------------------------------------------------- /tests/expressions.py: -------------------------------------------------------------------------------- 1 | from alphatools.research import run_pipeline, make_factor_plot 2 | from alphatools.expression import ExpressionAlpha 3 | from alphatools.ics import Sector, SubIndustry 4 | from zipline.pipeline.factors import AverageDollarVolume, CustomFactor, Returns 5 | from zipline.pipeline.data import USEquityPricing as USEP 6 | from zipline.pipeline import Pipeline 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | universe = AverageDollarVolume(window_length=120).top(10) 12 | 13 | expressions = { 14 | '0': 'close', 15 | '1': 'delay(close,1)', 16 | '2': 'delta(close,5)', 17 | '3': 'returns', 18 | '4': 'delta(close,1)/delay(close,1)', 19 | '5': 'delta(close,5)/delay(close,5)', 20 | '6': 'rank(close)', 21 | '7': 'indneutralize(close, IndClass.sector)', 22 | '8': 'indneutralize(close, IndClass.industry)', 23 | } 24 | 25 | class Control_1(CustomFactor): 26 | window_length=2 27 | inputs=[USEP.close] 28 | 29 | def compute(self, today, assets, out, close): 30 | out[:]=close[-2] 31 | 32 | class Control_2(CustomFactor): 33 | window_length=6 34 | inputs=[USEP.close] 35 | 36 | def compute(self, today, assets, out, close): 37 | out[:]=close[-1] - close[-6] 38 | 39 | control = {} 40 | control_0 = USEP.close.latest 41 | control_1 = Control_1() 42 | control_2 = Control_2() 43 | control_3 = Returns(window_length=2) 44 | control_4 = Returns(window_length=2) 45 | control_5 = Returns(window_length=6) 46 | control_6 = USEP.close.latest.rank(mask=universe) 47 | control_7 = USEP.close.latest.demean(groupby=Sector(), mask=universe) 48 | control_8 = USEP.close.latest.demean(groupby=SubIndustry(), mask=universe) 49 | 50 | control = { 51 | '0': control_0, 52 | '1': control_1, 53 | '2': control_2, 54 | '3': control_3, 55 | '4': control_4, 56 | '5': control_5, 57 | '6': control_6, 58 | '7': control_7, 59 | '8': control_8, 60 | } 61 | 62 | 63 | start_date = '2017-01-04' 64 | end_date = '2017-01-04' 65 | 66 | 67 | def test_factor(expression, control, start_date='2017-01-04', end_date='2017-01-04', show_df=False): 68 | p = Pipeline(screen=universe) 69 | p.add(expression.make_pipeline_factor().pipeline_factor(mask=universe), 'expression_alpha') 70 | p.add(control, 'pipeline_factor') 71 | df = run_pipeline(p, start_date, end_date) 72 | print(np.allclose(df['expression_alpha'].values, df['pipeline_factor'].values)) 73 | if show_df: 74 | print(df) 75 | 76 | 77 | start_fac = 0 78 | end_fac = 8 79 | 80 | for i in range(start_fac, end_fac+1): 81 | test_factor(ExpressionAlpha(expressions[str(i)]), control[str(i)], show_df=False) 82 | -------------------------------------------------------------------------------- /alphatools/expression/expression.lark: -------------------------------------------------------------------------------- 1 | // Lark grammar for "expression"-based alphas 2 | 3 | ?value: abs 4 | | log 5 | | sign 6 | | greaterthan 7 | |
lessthan 8 | | equals 9 | | logicalor 10 | | ternary 11 | | scale 12 | | signedpower 13 | | decay_linear 14 | | ts_max 15 | | ts_min 16 | | ts_argmax 17 | | ts_argmin 18 | | ts_rank 19 | | max 20 | | min 21 | | stddev 22 | | close 23 | | opens 24 | | high 25 | | low 26 | | volume 27 | | returns 28 | | adv 29 | | vwap 30 | | delay 31 | | correlation 32 | | covariance 33 | | indneutralize 34 | | delta 35 | | div 36 | | mult 37 | | minus 38 | | plus 39 | | powerof 40 | | rank 41 | | sum 42 | | product 43 | | ESCAPED_STRING 44 | | "(" value ")" 45 | | neg 46 | | number 47 | | cap 48 | | SIGNED_NUMBER -> number 49 | | factory 50 | 51 | 52 | delay: "delay" "(" value "," SIGNED_NUMBER ")" 53 | delta: "delta" "(" value "," SIGNED_NUMBER ")" 54 | 55 | correlation: "correlation" "(" value "," value "," SIGNED_NUMBER ")" 56 | covariance: "covariance" "(" value "," value "," SIGNED_NUMBER ")" 57 | 58 | factory: "factory" "(" ESCAPED_STRING ["," ESCAPED_STRING ] ")" 59 | 60 | close: "close" 61 | opens: "opens" 62 | high: "high" 63 | low: "low" 64 | volume: "volume" 65 | returns: "returns" 66 | vwap: "vwap" 67 | adv: "adv" SIGNED_NUMBER 68 | cap: "cap" 69 | 70 | number: SIGNED_NUMBER 71 | 72 | ts_max: "ts_max" "(" value "," SIGNED_NUMBER ")" 73 | ts_min: "ts_min" "(" value "," SIGNED_NUMBER ")" 74 | ts_argmax: "ts_argmax" "(" value "," SIGNED_NUMBER ")" 75 | ts_argmin: "ts_argmin" "(" value "," SIGNED_NUMBER ")" 76 | ts_rank: "ts_rank" "(" value "," SIGNED_NUMBER ")" 77 | stddev: "stddev" "(" value "," SIGNED_NUMBER ")" 78 | indneutralize: "indneutralize" "(" value ["," INDCLASS] ")" 79 | 80 | // unlike the paper, these are element-by-element max/min of two value arrays 81 | max: "max" "(" value "," value ")" 82 | min: "min" "(" value "," value ")" 83 | 84 | div: value "/" value 85 | minus: value "-" value 86 | plus: value "+" value 87 | mult: value "*" value 88 | powerof: value "^" value 89 | 90 | abs: "abs" "(" value ")" 91 | log: "log" "(" value ")" 92 | sign: "sign" "(" value ")" 93 | greaterthan: value ">" value 94 | lessthan: value "<" value 95 | equals: value "==" value 96 | logicalor: value "||" value 97 | ternary: value "?" value ":" value 98 | scale: "scale" "(" value ")" 99 | signedpower: "signedpower" "(" value "," value ")" 100 | decay_linear: "decay_linear" "(" value "," SIGNED_NUMBER ")" 101 | 102 | neg: "-" value 103 | rank: "rank" "(" value ")" 104 | sum: "sum" "(" value "," SIGNED_NUMBER ")" 105 | product: "product" "(" value "," SIGNED_NUMBER ")" 106 | 107 | INDCLASS: ("IndClass.industry" | "IndClass.subindustry" | "IndClass.sector") 108 | 109 | %import common.SIGNED_NUMBER 110 | %import common.ESCAPED_STRING 111 | %import common.WS 112 | %ignore WS -------------------------------------------------------------------------------- /alphatools/misc/pair_trade.py: -------------------------------------------------------------------------------- 1 | # This is a simple pair trade example using 2 | # two related large-cap equities. 3 | 4 | 5 | 6 | import numpy as np 7 | from sklearn.linear_model import LinearRegression 8 | from zipline.api import ( 9 | schedule_function, 10 | date_rules, 11 | time_rules, 12 | order_target_percent, 13 | record, 14 | symbol 15 | ) 16 | 17 | 18 | LOOKBACK_DAYS = 120 19 | ENTRY_THRESHOLD = 2.0 20 | EXIT_THRESHOLD = 0.50 21 | 22 | def initialize(context): 23 | # This function runs once when sim starts. 24 | # Put your `schedule_function()`, `set_slippage`, and `set_commission` calls here.
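# Strategy sketch: rebal() regresses asset B's price on asset A's over LOOKBACK_DAYS, z-scores the latest residual, enters the spread when |z| crosses ENTRY_THRESHOLD, and unwinds when |z| falls back inside EXIT_THRESHOLD.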
25 | 26 | # Trade Apple vs Microsoft as the pair. 27 | # (Swap in any two related assets here.) 28 | context.asset_A = symbol('AAPL') 29 | context.asset_B = symbol('MSFT') 30 | 31 | # Keep state if we have crossed the ENTRY_THRESHOLD 32 | context.position_initiated = False 33 | 34 | # Run rebal function every day 15min before 35 | # close. 36 | schedule_function( 37 | func=rebal, 38 | date_rule=date_rules.every_day(), 39 | time_rule=time_rules.market_close( 40 | minutes=15) 41 | ) 42 | 43 | def before_trading_start(context, data): 44 | # This function runs before each daily session 45 | pass 46 | 47 | def rebal(context, data): 48 | # Basic pair trade; price-based regression 49 | 50 | # get historical data for both assets 51 | hist = data.history( 52 | [context.asset_A, context.asset_B], 53 | fields='price', 54 | bar_count=LOOKBACK_DAYS, 55 | frequency='1d' 56 | ) 57 | 58 | # the reshape is a scikit-learn nuance when you have 1-dim data 59 | asset_A_prices = hist[context.asset_A].values.reshape(-1,1) 60 | asset_B_prices = hist[context.asset_B].values.reshape(-1,1) 61 | 62 | # run a price regression 63 | lm = LinearRegression().fit( 64 | asset_A_prices, # X 65 | asset_B_prices # y 66 | ) 67 | 68 | # get residuals 69 | residuals = asset_B_prices - lm.predict(asset_A_prices) 70 | 71 | # the most recent residual is the current spread 72 | current_spread = residuals[-1] 73 | 74 | # Z-Score of current spread 75 | score = (current_spread/np.nanstd(residuals))[-1] 76 | 77 | target_weights = {} 78 | 79 | # trading logic 80 | if score > ENTRY_THRESHOLD and not context.position_initiated: 81 | target_weights[context.asset_A] = -5.0 # i.e., -500% 82 | target_weights[context.asset_B] = 5.0 83 | context.position_initiated = True 84 | elif score < -ENTRY_THRESHOLD and not context.position_initiated: 85 | target_weights[context.asset_A] = 5.0 86 | target_weights[context.asset_B] = -5.0 87 | context.position_initiated = True 88 | elif np.abs(score) < EXIT_THRESHOLD and context.position_initiated: 89 | # unwind 90 | for asset, position in context.portfolio.positions.items(): 91 | target_weights[asset] = 0 92 | context.position_initiated = False 93 | 94 | for asset, target in target_weights.items(): 95 | order_target_percent(asset, target) 96 | 97 | record(A=asset_A_prices[-1][0]) 98 | record(B=asset_B_prices[-1][0]) 99 | record(score=score) 100 | 101 | -------------------------------------------------------------------------------- /alphatools/ics/ics_scheme.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from numpy import where 4 | 5 | from zipline.data import bundles 6 | from zipline.pipeline.classifiers import Classifier 7 | from zipline.utils.numpy_utils import int64_dtype 8 | 9 | from os import path 10 | 11 | 12 | class SICMajorIndustry(Classifier): 13 | 14 | dtype = int64_dtype 15 | window_length = 0 16 | window_safe = True 17 | inputs = () 18 | missing_value = -1 19 | 20 | def __init__(self): 21 | self.data = np.load(path.join(path.dirname(__file__), 'sic_major.npy')) 22 | self.names = None 23 | def _compute(self, arrays, dates, assets, mask): 24 | return where( 25 | mask, 26 | self.data[assets], 27 | self.missing_value, 28 | ) 29 | 30 | class Sector(Classifier): 31 | 32 | dtype = int64_dtype 33 | window_length = 0 34 | window_safe = True 35 | inputs = () 36 | missing_value = -1 37 | 38 | def __init__(self): 39 | self.data = np.load( 40 | path.join(path.dirname(__file__), 'sectors.npy') 41 | ) 42 |
self.names = pd.read_csv( 43 | path.join(path.dirname(__file__), 'sector_names.csv'), 44 | header=None, 45 | index_col=0, 46 | names=['Sector_Name'] 47 | ) 48 | 49 | def _compute(self, arrays, dates, assets, mask): 50 | return where( 51 | mask, 52 | self.data[assets], 53 | self.missing_value, 54 | ) 55 | 56 | 57 | class SubIndustry(Classifier): 58 | 59 | dtype = int64_dtype 60 | window_length = 0 61 | window_safe = True 62 | inputs = () 63 | missing_value = -1 64 | 65 | def __init__(self): 66 | self.data = np.load( 67 | path.join(path.dirname(__file__), 'industries.npy') 68 | ) 69 | self.names = pd.read_csv( 70 | path.join(path.dirname(__file__), 'industry_names.csv'), 71 | header=None, 72 | index_col=0, 73 | names=['Industry_Name'] 74 | ) 75 | 76 | def _compute(self, arrays, dates, assets, mask): 77 | return where( 78 | mask, 79 | self.data[assets], 80 | self.missing_value, 81 | ) 82 | 83 | 84 | def make_sector_classifier( 85 | bundle='quandl', 86 | infile='../data/profiles_20170918.csv'): 87 | """ 88 | For a given bundle, create the .npy Sector and Industry classifier 89 | files. 90 | """ 91 | bundle_data = bundles.load(bundle) 92 | 93 | df_p = pd.read_csv(path.join(path.dirname(__file__), infile)) 94 | 95 | labels_sector, uniques_sector = pd.factorize(df_p['sector']) 96 | labels_industry, uniques_industry = pd.factorize(df_p['industry']) 97 | 98 | tickers = bundle_data.asset_finder.lookup_symbols( 99 | df_p['quandl_sym'], as_of_date=None) 100 | 101 | sids = [asset.sid for asset in tickers] 102 | max_sid = np.max(bundle_data.asset_finder.sids) 103 | 104 | sectors = np.full(np.max(max_sid)+1, -1, np.dtype('int64')) 105 | industries = np.full(np.max(max_sid)+1, -1, np.dtype('int64')) 106 | 107 | sectors[sids] = labels_sector 108 | industries[sids] = labels_industry 109 | 110 | np.save(path.join( 111 | path.dirname(__file__), '../data/sectors' 112 | ), sectors) 113 | np.save(path.join( 114 | path.dirname(__file__), '../data/industries' 115 | ), industries) 116 | 117 | pd.DataFrame(data=uniques_sector.tolist()).to_csv( 118 | path.join(path.dirname(__file__), '../data/sector_names.csv'), 119 | header=False 120 | ) 121 | pd.DataFrame(data=uniques_industry.tolist()).to_csv( 122 | path.join(path.dirname(__file__), '../data/industry_names.csv'), 123 | header=False 124 | ) 125 | 126 | return True 127 | 128 | def make_SIC_classifier( 129 | bundle='quandl', 130 | infile='../data/profiles_20170918.csv'): 131 | 132 | bundle_data = bundles.load(bundle) 133 | 134 | df_p = pd.read_csv(infile) 135 | df_cik = pd.read_csv('../data/cik_ticker_09152017.csv', sep='|') 136 | df_cik['SIC'] = df_cik['SIC'].fillna(-1).astype(np.int64).astype(str) 137 | df_cik['SIC_MajorIndustry'] = df_cik['SIC'].str[:2] 138 | df_cik['SIC_SubClassification'] = df_cik['SIC'].str[:3] 139 | df_cik['SIC_Specialization'] = df_cik['SIC'].str[:4] 140 | 141 | df_cik_select = df_cik.loc[df_cik.Ticker.isin(df_p['quandl_sym'])] 142 | tickers = bundle_data.asset_finder.lookup_symbols(df_cik_select['Ticker'], as_of_date=None) 143 | sids = [asset.sid for asset in tickers] 144 | max_sid = np.max(bundle_data.asset_finder.sids) 145 | 146 | major = np.full(max_sid+1, -1, np.dtype('int64')) 147 | subclass = np.full(max_sid+1, -1, np.dtype('int64')) 148 | specialize = np.full(max_sid+1, -1, np.dtype('int64')) 149 | 150 | major[sids] = df_cik_select['SIC_MajorIndustry'].astype(np.int64) 151 | subclass[sids] = df_cik_select['SIC_SubClassification'].astype(np.int64) 152 | specialize[sids] = df_cik_select['SIC_Specialization'].astype(np.int64) 153 | 154 | 
np.save('sic_major', major) 155 | np.save('sic_subclass', subclass) 156 | np.save('sic_specialize', specialize) 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | pass 162 | # make_sector_classifier() 163 | # make_SIC_classifier() 164 | -------------------------------------------------------------------------------- /notebooks/pipeline-blaze-factory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | ":307: UserWarning: Overwriting bundle with name 'futures'\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "from alphatools.research import run_pipeline, get_symbols\n", 18 | "from alphatools.data import Factory\n", 19 | "import pandas as pd\n", 20 | "from zipline.pipeline import Pipeline\n", 21 | "from zipline.pipeline.data import USEquityPricing\n", 22 | "from zipline.pipeline.filters import StaticAssets\n", 23 | "from zipline.pipeline.factors import CustomFactor" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Example of use in a Custom Factor\n", 33 | "\n", 34 | "class MyFactor(CustomFactor):\n", 35 | " inputs = [Factory['sample'].value]\n", 36 | " window_length = 10\n", 37 | " \n", 38 | " def compute(self, today, assets, out, factory):\n", 39 | " out[:] = factory[-1]\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "assets = get_symbols(['A', 'AAL'], as_of_date=None)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "[Equity(0 [A]), Equity(2 [AAL])]" 60 | ] 61 | }, 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "assets" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 5, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "p = Pipeline(\n", 78 | " columns={\n", 79 | " 'col_A': Factory['sample'].value.latest,\n", 80 | " 'col_B': MyFactor()\n", 81 | " },\n", 82 | " screen=StaticAssets(assets)\n", 83 | ")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "df = run_pipeline(\n", 93 | " p,\n", 94 | " pd.Timestamp('2016-01-05', tz='utc'),\n", 95 | " pd.Timestamp('2018-01-04', tz='utc')\n", 96 | ")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/html": [ 107 | "
\n", 108 | "\n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
col_Acol_B
2016-01-05 00:00:00+00:00Equity(0 [A])0.5434050.543405
Equity(2 [AAL])0.4245180.424518
2016-01-06 00:00:00+00:00Equity(0 [A])0.5434050.543405
Equity(2 [AAL])0.4245180.424518
2016-01-07 00:00:00+00:00Equity(0 [A])0.5434050.543405
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " col_A col_B\n", 152 | "2016-01-05 00:00:00+00:00 Equity(0 [A]) 0.543405 0.543405\n", 153 | " Equity(2 [AAL]) 0.424518 0.424518\n", 154 | "2016-01-06 00:00:00+00:00 Equity(0 [A]) 0.543405 0.543405\n", 155 | " Equity(2 [AAL]) 0.424518 0.424518\n", 156 | "2016-01-07 00:00:00+00:00 Equity(0 [A]) 0.543405 0.543405" 157 | ] 158 | }, 159 | "execution_count": 7, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "df.head()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3.5 (env_alphatools_stable)", 179 | "language": "python", 180 | "name": "env_alphatools_stable" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.5.5" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /alphatools/data/industry_names.csv: -------------------------------------------------------------------------------- 1 | 0,Medical Laboratories & Research 2 | 1,Aluminum 3 | 2,Major Airlines 4 | 3,Asset Management 5 | 4,Rental & Leasing Services 6 | 5,Semiconductor - Integrated Circuits 7 | 6,General Building Materials 8 | 7,Auto Parts Stores 9 | 8,Electronic Equipment 10 | 9,REIT - Retail 11 | 10,"Air Services, Other" 12 | 11,Drug Manufacturers - Major 13 | 12,Drugs Wholesale 14 | 13,Regional - Mid-Atlantic Banks 15 | 14,Business Services 16 | 15,Auto Dealerships 17 | 16,Medical Appliances & Equipment 18 | 17,Biotechnology 19 | 18,REIT - Residential 20 | 19,Office Supplies 21 | 20,Chemicals - Major Diversified 22 | 21,Property & Casualty Insurance 23 | 22,Specialized Health Services 24 | 23,Technical & System Software 25 | 24,Semiconductor Equipment & Materials 26 | 25,Technical Services 27 | 26,Information Technology Services 28 | 27,REIT - Diversified 29 | 28,Application Software 30 | 29,Property Management 31 | 30,Farm Products 32 | 31,Drugs - Generic 33 | 32,Business Software & Services 34 | 33,Communication Equipment 35 | 34,Home Health Care 36 | 35,Oil & Gas Refining & Marketing 37 | 36,Electric Utilities 38 | 37,Heavy Construction 39 | 38,Diversified Electronics 40 | 39,Life Insurance 41 | 40,Apparel Stores 42 | 41,Health Care Plans 43 | 42,Savings & Loans 44 | 43,Accident & Health Insurance 45 | 44,Farm & Construction Machinery 46 | 45,Credit Services 47 | 46,Surety & Title Insurance 48 | 47,Publishing - Newspapers 49 | 48,REIT - Hotel/Motel 50 | 49,Mortgage Investment 51 | 50,Industrial Electrical Equipment 52 | 51,Textile Industrial 53 | 52,Aerospace/Defense Products & Services 54 | 53,Industrial Equipment Wholesale 55 | 54,Insurance Brokers 56 | 55,Internet Information Providers 57 | 56,Steel & Iron 58 | 57,Specialty Chemicals 59 | 58,Diversified Utilities 60 | 59,Real Estate Development 61 | 60,Regional Airlines 62 | 61,Drug Delivery 63 | 62,Security & Protection Services 64 | 63,Auto Parts 65 | 64,Diagnostic Substances 66 | 65,"Movie Production, Theaters" 67 | 66,Entertainment - Diversified 68 | 67,Semiconductor - Broad Line 69 | 68,Diversified Machinery 70 | 69,Investment Brokerage - National 71 
| 70,Home Furnishings & Fixtures 72 | 71,Catalog & Mail Order Houses 73 | 72,Internet Software & Services 74 | 73,Medical Instruments & Supplies 75 | 74,"Tobacco Products, Other" 76 | 75,Semiconductor - Specialized 77 | 76,Independent Oil & Gas 78 | 77,Education & Training Services 79 | 78,Trucking 80 | 79,Metal Fabrication 81 | 80,REIT - Office 82 | 81,Railroads 83 | 82,Regional - Northeast Banks 84 | 83,Water Utilities 85 | 84,Electronics Wholesale 86 | 85,Shipping 87 | 86,Staffing & Outsourcing Services 88 | 87,Healthcare Information Services 89 | 88,Telecom Services - Domestic 90 | 89,Gas Utilities 91 | 90,Packaging & Containers 92 | 91,Air Delivery & Freight Services 93 | 92,Multimedia & Graphics Software 94 | 93,Oil & Gas Drilling & Exploration 95 | 94,Agricultural Chemicals 96 | 95,Residential Construction 97 | 96,Business Equipment 98 | 97,Industrial Equipment & Components 99 | 98,Money Center Banks 100 | 99,Management Services 101 | 100,Regional - Pacific Banks 102 | 101,Regional - Southwest Banks 103 | 102,Oil & Gas Equipment & Services 104 | 103,Broadcasting - Radio 105 | 104,Networking & Communication Devices 106 | 105,Restaurants 107 | 106,Regional - Southeast Banks 108 | 107,Toy & Hobby Stores 109 | 108,Electronics Stores 110 | 109,"Recreational Goods, Other" 111 | 110,"Lumber, Wood Production" 112 | 111,Personal Services 113 | 112,Beverages - Wineries & Distillers 114 | 113,Sporting Goods Stores 115 | 114,Processed & Packaged Goods 116 | 115,Printed Circuit Boards 117 | 116,"Specialty Retail, Other" 118 | 117,"Discount, Variety Stores" 119 | 118,Long-Term Care Facilities 120 | 119,Foreign Money Center Banks 121 | 120,Scientific & Technical Instruments 122 | 121,Department Stores 123 | 122,Major Integrated Oil & Gas 124 | 123,Foreign Regional Banks 125 | 124,Information & Delivery Services 126 | 125,Security Software & Services 127 | 126,Data Storage Devices 128 | 127,Beverages - Brewers 129 | 128,Industrial Metals & Minerals 130 | 129,Regional - Midwest Banks 131 | 130,Building Materials Wholesale 132 | 131,Resorts & Casinos 133 | 132,Food - Major Diversified 134 | 133,Wireless Communications 135 | 134,Grocery Stores 136 | 135,General Contractors 137 | 136,Diversified Investments 138 | 137,Pollution & Treatment Controls 139 | 138,Beverages - Soft Drinks 140 | 139,Diversified Communication Services 141 | 140,Marketing Services 142 | 141,Silver 143 | 142,Broadcasting - TV 144 | 143,Cleaning Products 145 | 144,Gaming Activities 146 | 145,Food Wholesale 147 | 146,Lodging 148 | 147,CATV Systems 149 | 148,Personal Products 150 | 149,Processing Systems & Products 151 | 150,Waste Management 152 | 151,Sporting Activities 153 | 152,Paper & Paper Products 154 | 153,Housewares & Accessories 155 | 154,Textile - Apparel Footwear & Accessories 156 | 155,Textile - Apparel Clothing 157 | 156,Diversified Computer Systems 158 | 157,Rubber & Plastics 159 | 158,REIT - Industrial 160 | 159,Auto Parts Wholesale 161 | 160,Hospitals 162 | 161,Computer Based Systems 163 | 162,Computer Peripherals 164 | 163,Drug Manufacturers - Other 165 | 164,Dairy Products 166 | 165,REIT - Healthcare Facilities 167 | 166,Sporting Goods 168 | 167,Cement 169 | 168,Auto Manufacturers - Major 170 | 169,Copper 171 | 170,Research Services 172 | 171,Specialty Eateries 173 | 172,Investment Brokerage - Regional 174 | 173,Drug Stores 175 | 174,Long Distance Carriers 176 | 175,Gold 177 | 176,Toys & Games 178 | 177,Home Improvement Stores 179 | 178,Machine Tools & Accessories 180 | 179,Nonmetallic Mineral Mining 181 | 
180,Recreational Vehicles 182 | 181,Conglomerates 183 | 182,Medical Equipment Wholesale 184 | 183,Confectioners 185 | 184,Home Furnishing Stores 186 | 185,Trucks & Other Vehicles 187 | 186,Consumer Services 188 | 187,Advertising Agencies 189 | 188,Appliances 190 | 189,Publishing - Books 191 | 190,Oil & Gas Pipelines 192 | 191,Small Tools & Accessories 193 | 192,Internet Service Providers 194 | 193,Publishing - Periodicals 195 | 194,Cigarettes 196 | 195,Semiconductor- Memory Chips 197 | 196,Aerospace/Defense - Major Diversified 198 | 197,Drug Related Products 199 | 198,Jewelry Stores 200 | 199,General Entertainment 201 | 200,Computers Wholesale 202 | 201,Closed-End Fund - Debt 203 | 202,Meat Products 204 | 203,Music & Video Stores 205 | -------------------------------------------------------------------------------- /alphatools/ics/industry_names.csv: -------------------------------------------------------------------------------- 1 | 0,Medical Laboratories & Research 2 | 1,Aluminum 3 | 2,Major Airlines 4 | 3,Asset Management 5 | 4,Rental & Leasing Services 6 | 5,Semiconductor - Integrated Circuits 7 | 6,General Building Materials 8 | 7,Auto Parts Stores 9 | 8,Electronic Equipment 10 | 9,REIT - Retail 11 | 10,"Air Services, Other" 12 | 11,Drug Manufacturers - Major 13 | 12,Drugs Wholesale 14 | 13,Regional - Mid-Atlantic Banks 15 | 14,Business Services 16 | 15,Auto Dealerships 17 | 16,Medical Appliances & Equipment 18 | 17,Biotechnology 19 | 18,REIT - Residential 20 | 19,Office Supplies 21 | 20,Chemicals - Major Diversified 22 | 21,Property & Casualty Insurance 23 | 22,Specialized Health Services 24 | 23,Technical & System Software 25 | 24,Semiconductor Equipment & Materials 26 | 25,Technical Services 27 | 26,Information Technology Services 28 | 27,REIT - Diversified 29 | 28,Application Software 30 | 29,Property Management 31 | 30,Farm Products 32 | 31,Drugs - Generic 33 | 32,Business Software & Services 34 | 33,Communication Equipment 35 | 34,Home Health Care 36 | 35,Oil & Gas Refining & Marketing 37 | 36,Electric Utilities 38 | 37,Heavy Construction 39 | 38,Diversified Electronics 40 | 39,Life Insurance 41 | 40,Apparel Stores 42 | 41,Health Care Plans 43 | 42,Savings & Loans 44 | 43,Accident & Health Insurance 45 | 44,Farm & Construction Machinery 46 | 45,Credit Services 47 | 46,Surety & Title Insurance 48 | 47,Publishing - Newspapers 49 | 48,REIT - Hotel/Motel 50 | 49,Mortgage Investment 51 | 50,Industrial Electrical Equipment 52 | 51,Textile Industrial 53 | 52,Aerospace/Defense Products & Services 54 | 53,Industrial Equipment Wholesale 55 | 54,Insurance Brokers 56 | 55,Internet Information Providers 57 | 56,Steel & Iron 58 | 57,Specialty Chemicals 59 | 58,Diversified Utilities 60 | 59,Real Estate Development 61 | 60,Regional Airlines 62 | 61,Drug Delivery 63 | 62,Security & Protection Services 64 | 63,Auto Parts 65 | 64,Diagnostic Substances 66 | 65,"Movie Production, Theaters" 67 | 66,Entertainment - Diversified 68 | 67,Semiconductor - Broad Line 69 | 68,Diversified Machinery 70 | 69,Investment Brokerage - National 71 | 70,Home Furnishings & Fixtures 72 | 71,Catalog & Mail Order Houses 73 | 72,Internet Software & Services 74 | 73,Medical Instruments & Supplies 75 | 74,"Tobacco Products, Other" 76 | 75,Semiconductor - Specialized 77 | 76,Independent Oil & Gas 78 | 77,Education & Training Services 79 | 78,Trucking 80 | 79,Metal Fabrication 81 | 80,REIT - Office 82 | 81,Railroads 83 | 82,Regional - Northeast Banks 84 | 83,Water Utilities 85 | 84,Electronics Wholesale 86 | 85,Shipping 87 | 
86,Staffing & Outsourcing Services 88 | 87,Healthcare Information Services 89 | 88,Telecom Services - Domestic 90 | 89,Gas Utilities 91 | 90,Packaging & Containers 92 | 91,Air Delivery & Freight Services 93 | 92,Multimedia & Graphics Software 94 | 93,Oil & Gas Drilling & Exploration 95 | 94,Agricultural Chemicals 96 | 95,Residential Construction 97 | 96,Business Equipment 98 | 97,Industrial Equipment & Components 99 | 98,Money Center Banks 100 | 99,Management Services 101 | 100,Regional - Pacific Banks 102 | 101,Regional - Southwest Banks 103 | 102,Oil & Gas Equipment & Services 104 | 103,Broadcasting - Radio 105 | 104,Networking & Communication Devices 106 | 105,Restaurants 107 | 106,Regional - Southeast Banks 108 | 107,Toy & Hobby Stores 109 | 108,Electronics Stores 110 | 109,"Recreational Goods, Other" 111 | 110,"Lumber, Wood Production" 112 | 111,Personal Services 113 | 112,Beverages - Wineries & Distillers 114 | 113,Sporting Goods Stores 115 | 114,Processed & Packaged Goods 116 | 115,Printed Circuit Boards 117 | 116,"Specialty Retail, Other" 118 | 117,"Discount, Variety Stores" 119 | 118,Long-Term Care Facilities 120 | 119,Foreign Money Center Banks 121 | 120,Scientific & Technical Instruments 122 | 121,Department Stores 123 | 122,Major Integrated Oil & Gas 124 | 123,Foreign Regional Banks 125 | 124,Information & Delivery Services 126 | 125,Security Software & Services 127 | 126,Data Storage Devices 128 | 127,Beverages - Brewers 129 | 128,Industrial Metals & Minerals 130 | 129,Regional - Midwest Banks 131 | 130,Building Materials Wholesale 132 | 131,Resorts & Casinos 133 | 132,Food - Major Diversified 134 | 133,Wireless Communications 135 | 134,Grocery Stores 136 | 135,General Contractors 137 | 136,Diversified Investments 138 | 137,Pollution & Treatment Controls 139 | 138,Beverages - Soft Drinks 140 | 139,Diversified Communication Services 141 | 140,Marketing Services 142 | 141,Silver 143 | 142,Broadcasting - TV 144 | 143,Cleaning Products 145 | 144,Gaming Activities 146 | 145,Food Wholesale 147 | 146,Lodging 148 | 147,CATV Systems 149 | 148,Personal Products 150 | 149,Processing Systems & Products 151 | 150,Waste Management 152 | 151,Sporting Activities 153 | 152,Paper & Paper Products 154 | 153,Housewares & Accessories 155 | 154,Textile - Apparel Footwear & Accessories 156 | 155,Textile - Apparel Clothing 157 | 156,Diversified Computer Systems 158 | 157,Rubber & Plastics 159 | 158,REIT - Industrial 160 | 159,Auto Parts Wholesale 161 | 160,Hospitals 162 | 161,Computer Based Systems 163 | 162,Computer Peripherals 164 | 163,Drug Manufacturers - Other 165 | 164,Dairy Products 166 | 165,REIT - Healthcare Facilities 167 | 166,Sporting Goods 168 | 167,Cement 169 | 168,Auto Manufacturers - Major 170 | 169,Copper 171 | 170,Research Services 172 | 171,Specialty Eateries 173 | 172,Investment Brokerage - Regional 174 | 173,Drug Stores 175 | 174,Long Distance Carriers 176 | 175,Gold 177 | 176,Toys & Games 178 | 177,Home Improvement Stores 179 | 178,Machine Tools & Accessories 180 | 179,Nonmetallic Mineral Mining 181 | 180,Recreational Vehicles 182 | 181,Conglomerates 183 | 182,Medical Equipment Wholesale 184 | 183,Confectioners 185 | 184,Home Furnishing Stores 186 | 185,Trucks & Other Vehicles 187 | 186,Consumer Services 188 | 187,Advertising Agencies 189 | 188,Appliances 190 | 189,Publishing - Books 191 | 190,Oil & Gas Pipelines 192 | 191,Small Tools & Accessories 193 | 192,Internet Service Providers 194 | 193,Publishing - Periodicals 195 | 194,Cigarettes 196 | 195,Semiconductor- Memory Chips 
197 | 196,Aerospace/Defense - Major Diversified 198 | 197,Drug Related Products 199 | 198,Jewelry Stores 200 | 199,General Entertainment 201 | 200,Computers Wholesale 202 | 201,Closed-End Fund - Debt 203 | 202,Meat Products 204 | 203,Music & Video Stores 205 | -------------------------------------------------------------------------------- /alphatools/research/research.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import time 4 | from zipline.data import bundles 5 | from zipline.data.data_portal import DataPortal 6 | from zipline.pipeline.data import USEquityPricing 7 | from zipline.pipeline.engine import SimplePipelineEngine 8 | from zipline.pipeline.loaders import USEquityPricingLoader 9 | from zipline.utils.calendars import get_calendar 10 | from zipline.assets._assets import Equity 11 | from zipline.pipeline.loaders.blaze import BlazeLoader, from_blaze 12 | from zipline.utils.run_algo import load_extensions 13 | 14 | 15 | # Load extensions.py; this allows you access to custom bundles 16 | load_extensions( 17 | default=True, 18 | extensions=[], 19 | strict=True, 20 | environ=os.environ, 21 | ) 22 | 23 | 24 | # Set-Up Pricing Data Access 25 | trading_calendar = get_calendar('NYSE') 26 | bundle = 'quandl' 27 | bundle_data = bundles.load(bundle) 28 | 29 | 30 | loaders = {} 31 | 32 | # create an empty BlazeLoader 33 | blaze_loader = BlazeLoader() 34 | 35 | def my_dispatcher(column): 36 | return loaders[column] 37 | 38 | pipeline_loader = USEquityPricingLoader( 39 | bundle_data.equity_daily_bar_reader, 40 | bundle_data.adjustment_reader, 41 | ) 42 | 43 | def choose_loader(column): 44 | if column in USEquityPricing.columns: 45 | return pipeline_loader 46 | try: 47 | return my_dispatcher(column) 48 | except KeyError: 49 | pass 50 | return blaze_loader 51 | 52 | # Set-Up Pipeline Engine 53 | engine = SimplePipelineEngine( 54 | get_loader=choose_loader, 55 | calendar=trading_calendar.all_sessions, 56 | asset_finder=bundle_data.asset_finder, 57 | ) 58 | 59 | def run_pipeline(pipeline, start_date, end_date): 60 | return engine.run_pipeline( 61 | pipeline, 62 | pd.Timestamp(start_date, tz='utc'), 63 | pd.Timestamp(end_date, tz='utc') 64 | ) 65 | 66 | 67 | 68 | data = DataPortal( 69 | bundle_data.asset_finder, 70 | trading_calendar=trading_calendar, 71 | first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day, 72 | equity_minute_reader=None, 73 | equity_daily_reader=bundle_data.equity_daily_bar_reader, 74 | adjustment_reader=bundle_data.adjustment_reader, 75 | ) 76 | 77 | def set_bundle(name, calendar='NYSE'): 78 | global trading_calendar 79 | global bundle 80 | global bundle_data 81 | global engine 82 | global choose_loader 83 | global data 84 | 85 | bundle = name 86 | trading_calendar = get_calendar(calendar) 87 | bundle_data = bundles.load(bundle) 88 | engine = SimplePipelineEngine( 89 | get_loader=choose_loader, 90 | calendar=trading_calendar.all_sessions, 91 | asset_finder=bundle_data.asset_finder, 92 | ) 93 | 94 | data = DataPortal( 95 | bundle_data.asset_finder, 96 | trading_calendar=trading_calendar, 97 | first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day, 98 | equity_minute_reader=None, 99 | equity_daily_reader=bundle_data.equity_daily_bar_reader, 100 | adjustment_reader=bundle_data.adjustment_reader, 101 | ) 102 | 103 | 104 | 105 | 106 | def get_symbols(tickers, as_of_date=None): 107 | if (type(tickers)==str): 108 | return bundle_data.asset_finder.lookup_symbols( 109 | [tickers], 
as_of_date=as_of_date) 110 | else: 111 | if(type(tickers[0])==Equity): 112 | return tickers 113 | else: 114 | return bundle_data.asset_finder.lookup_symbols( 115 | tickers, as_of_date=as_of_date) 116 | 117 | def get_pricing(tickers, start_date, end_date, field='close'): 118 | 119 | end_dt = pd.Timestamp(end_date, tz='UTC', offset='C') 120 | start_dt = pd.Timestamp(start_date, tz='UTC', offset='C') 121 | 122 | symbols = get_symbols(tickers, as_of_date=end_dt) 123 | 124 | end_loc = trading_calendar.closes.index.get_loc(end_dt) 125 | start_loc = trading_calendar.closes.index.get_loc(start_dt) 126 | 127 | dat = data.get_history_window( 128 | assets=symbols, 129 | end_dt=end_dt, 130 | bar_count=end_loc - start_loc, 131 | frequency='1d', 132 | field=field, 133 | data_frequency='daily' 134 | ) 135 | 136 | return dat 137 | 138 | import alphalens as al 139 | 140 | def make_quantile_plot(df, start_date, end_date): 141 | assets = df.index.levels[1].values.tolist() 142 | df = df.dropna() 143 | pricing = get_pricing( 144 | assets, 145 | start_date, 146 | end_date, 147 | 'close' 148 | ) 149 | 150 | factor_names = df.columns 151 | factor_data = {} 152 | 153 | start_time = time.clock() 154 | for factor in factor_names: 155 | print("Formatting factor data for: " + factor) 156 | factor_data[factor] = al.utils.get_clean_factor_and_forward_returns( 157 | factor=df[factor], 158 | prices=pricing, 159 | periods=[1] 160 | ) 161 | end_time = time.clock() 162 | print("Time to get arrange factor data: %.2f secs" % (end_time - start_time)) 163 | 164 | qr_factor_returns = [] 165 | 166 | for i, factor in enumerate(factor_names): 167 | mean_ret, _ = al.performance.mean_return_by_quantile(factor_data[factor]) 168 | mean_ret.columns = [factor] 169 | qr_factor_returns.append(mean_ret) 170 | 171 | df_qr_factor_returns = pd.concat(qr_factor_returns, axis=1) 172 | 173 | (10000*df_qr_factor_returns).plot.bar( 174 | subplots=True, 175 | sharey=True, 176 | layout=(4,2), 177 | figsize=(14, 14), 178 | legend=False, 179 | title='Alphas Comparison: Basis Points Per Day per Quantile' 180 | ) 181 | 182 | return df_qr_factor_returns 183 | 184 | 185 | def make_factor_plot(df, start_date, end_date): 186 | assets = df.index.levels[1].values.tolist() 187 | df = df.dropna() 188 | pricing = get_pricing( 189 | assets, 190 | start_date, 191 | end_date, 192 | 'close' 193 | ) 194 | 195 | factor_names = df.columns 196 | factor_data = {} 197 | 198 | start_time = time.clock() 199 | for factor in factor_names: 200 | print("Formatting factor data for: " + factor) 201 | factor_data[factor] = al.utils.get_clean_factor_and_forward_returns( 202 | factor=df[factor], 203 | prices=pricing, 204 | periods=[1], 205 | quantiles=1 206 | ) 207 | end_time = time.clock() 208 | print("Time to get arrange factor data: %.2f secs" % (end_time - start_time)) 209 | 210 | ls_factor_returns = [] 211 | 212 | start_time = time.clock() 213 | for i, factor in enumerate(factor_names): 214 | ls = al.performance.factor_returns(factor_data[factor]) 215 | ls.columns = [factor] 216 | ls_factor_returns.append(ls) 217 | end_time = time.clock() 218 | print("Time to generate long/short returns: %.2f secs" % (end_time - start_time)) 219 | 220 | df_ls_factor_returns = pd.concat(ls_factor_returns, axis=1) 221 | (1+df_ls_factor_returns).cumprod().plot(title='Cumulative Factor Returns'); 222 | return df_ls_factor_returns 223 | 224 | -------------------------------------------------------------------------------- /notebooks/pipeline-minimal.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | " col_A col_B price\n", 13 | "2016-01-05 00:00:00+00:00 Equity(0 [A]) 1468.0 True 40.69\n", 14 | " Equity(2 [AAL]) 1469.0 False 40.91\n", 15 | "2016-01-06 00:00:00+00:00 Equity(0 [A]) 1470.0 True 40.55\n", 16 | " Equity(2 [AAL]) 1471.0 False 40.52\n", 17 | "2016-01-07 00:00:00+00:00 Equity(0 [A]) 1472.0 True 40.73\n", 18 | " Equity(2 [AAL]) 1473.0 False 41.23\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "from zipline.data import bundles\n", 24 | "from zipline.pipeline import Pipeline\n", 25 | "from zipline.pipeline.data import USEquityPricing\n", 26 | "from zipline.pipeline.data import Column \n", 27 | "from zipline.pipeline.data import DataSet\n", 28 | "from zipline.pipeline.engine import SimplePipelineEngine\n", 29 | "from zipline.pipeline.filters import StaticAssets\n", 30 | "from zipline.pipeline.loaders import USEquityPricingLoader\n", 31 | "from zipline.pipeline.loaders.frame import DataFrameLoader\n", 32 | "from zipline.utils.calendars import get_calendar\n", 33 | "\n", 34 | "import numpy as np\n", 35 | "import pandas as pd\n", 36 | "\n", 37 | "trading_calendar = get_calendar('NYSE')\n", 38 | "bundle_data = bundles.load('quandl')\n", 39 | "\n", 40 | "\n", 41 | "# Set up Custom Data Source for two sids for DataFrameLoader\n", 42 | "class MyDataSet(DataSet): \n", 43 | " column_A = Column(dtype=float)\n", 44 | " column_B = Column(dtype=bool) \n", 45 | "\n", 46 | "dates = pd.date_range('2014-01-01', '2017-01-01', tz='UTC')\n", 47 | "assets = bundle_data.asset_finder.lookup_symbols(['A', 'AAL'], as_of_date=None)\n", 48 | "sids = pd.Int64Index([asset.sid for asset in assets])\n", 49 | "\n", 50 | "# The values for Column A will just be a 2D array of numbers ranging from 1 -> N. \n", 51 | "column_A_frame = pd.DataFrame( \n", 52 | " data=np.arange(len(dates)*len(assets), dtype=float).reshape(len(dates), len(assets)), \n", 53 | " index=dates,\n", 54 | " columns=sids,\n", 55 | ")\n", 56 | "\n", 57 | "# Column B will always provide True for 0 and False for 1. 
\n", 58 | "column_B_frame = pd.DataFrame(data={sids[0]: True, sids[1]: False}, index=dates)\n", 59 | "\n", 60 | "loaders = { \n", 61 | " MyDataSet.column_A: DataFrameLoader(MyDataSet.column_A, column_A_frame), \n", 62 | " MyDataSet.column_B: DataFrameLoader(MyDataSet.column_B, column_B_frame), \n", 63 | "}\n", 64 | "\n", 65 | "def my_dispatcher(column):\n", 66 | " return loaders[column]\n", 67 | "\n", 68 | "\n", 69 | "# Set up pipeline engine\n", 70 | "\n", 71 | "# Loader for pricing\n", 72 | "pipeline_loader = USEquityPricingLoader(\n", 73 | " bundle_data.equity_daily_bar_reader,\n", 74 | " bundle_data.adjustment_reader,\n", 75 | ")\n", 76 | "\n", 77 | "def choose_loader(column):\n", 78 | " if column in USEquityPricing.columns:\n", 79 | " return pipeline_loader\n", 80 | " return my_dispatcher(column)\n", 81 | "\n", 82 | "engine = SimplePipelineEngine(\n", 83 | " get_loader=choose_loader,\n", 84 | " calendar=trading_calendar.all_sessions,\n", 85 | " asset_finder=bundle_data.asset_finder,\n", 86 | ")\n", 87 | "\n", 88 | "p = Pipeline(\n", 89 | " columns={\n", 90 | " 'price': USEquityPricing.close.latest,\n", 91 | " 'col_A': MyDataSet.column_A.latest,\n", 92 | " 'col_B': MyDataSet.column_B.latest\n", 93 | " },\n", 94 | " screen=StaticAssets(assets)\n", 95 | ")\n", 96 | "\n", 97 | "df = engine.run_pipeline(\n", 98 | " p,\n", 99 | " pd.Timestamp('2016-01-05', tz='utc'),\n", 100 | " pd.Timestamp('2016-01-07', tz='utc')\n", 101 | ")\n", 102 | "\n", 103 | "print(df)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 2, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "1.11.3\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "print(np.__version__)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 3, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "loader = my_dispatcher(MyDataSet.column_A)\n", 130 | "adj_array = loader.load_adjusted_array(\n", 131 | " [MyDataSet.column_A],\n", 132 | " dates,\n", 133 | " sids,\n", 134 | " np.ones((len(dates), len(sids)), dtype=bool)\n", 135 | ")\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 4, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "Adjusted Array (float64):\n", 148 | "\n", 149 | "Data:\n", 150 | "array([[ 0.00000000e+00, 1.00000000e+00],\n", 151 | " [ 2.00000000e+00, 3.00000000e+00],\n", 152 | " [ 4.00000000e+00, 5.00000000e+00],\n", 153 | " ..., \n", 154 | " [ 2.18800000e+03, 2.18900000e+03],\n", 155 | " [ 2.19000000e+03, 2.19100000e+03],\n", 156 | " [ 2.19200000e+03, 2.19300000e+03]])\n", 157 | "\n", 158 | "Adjustments:\n", 159 | "{}\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "print(list(adj_array.values())[0].inspect())" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 5, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "zipline.pipeline.data.dataset.BoundColumn" 177 | ] 178 | }, 179 | "execution_count": 5, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "type(MyDataSet.column_A)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "Int64Index([0, 2], dtype='int64')" 197 | ] 198 | }, 199 | "execution_count": 6, 200 | "metadata": 
{}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "column_A_frame.columns" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3.5 (env_alphatools)", 226 | "language": "python", 227 | "name": "env_alphatools" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.5.5" 240 | }, 241 | "varInspector": { 242 | "cols": { 243 | "lenName": 16, 244 | "lenType": 16, 245 | "lenVar": 40 246 | }, 247 | "kernels_config": { 248 | "python": { 249 | "delete_cmd_postfix": "", 250 | "delete_cmd_prefix": "del ", 251 | "library": "var_list.py", 252 | "varRefreshCmd": "print(var_dic_list())" 253 | }, 254 | "r": { 255 | "delete_cmd_postfix": ") ", 256 | "delete_cmd_prefix": "rm(", 257 | "library": "var_list.r", 258 | "varRefreshCmd": "cat(var_dic_list()) " 259 | } 260 | }, 261 | "types_to_exclude": [ 262 | "module", 263 | "function", 264 | "builtin_function_or_method", 265 | "instance", 266 | "_Feature" 267 | ], 268 | "window_display": false 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 2 273 | } 274 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Jonathan Larkin 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /notebooks/one_o_one_alphas.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": "(rank(ts_argmax(signedpower(((returns < 0) ? 
stddev(returns, 20) : close), 2.), 5)) - 0.5)", 3 | "2": "(-1 * correlation(rank(delta(log(volume), 2)), rank(((close - opens) / opens)), 6))", 4 | "3": "(-1 * correlation(rank(opens), rank(volume), 10))", 5 | "4": "(-1 * ts_rank(rank(low), 9))", 6 | "5": "(rank((opens - (sum(vwap, 10) / 10))) * (-1 * abs(rank((close - vwap)))))", 7 | "6": "(-1 * correlation(opens, volume, 10))", 8 | "7": "((adv20 < volume) ? ((-1 * ts_rank(abs(delta(close, 7)), 60)) * sign(delta(close, 7))) : (-1* 1))", 9 | "8": "-1*rank(((sum(opens, 5)*sum(returns, 5))-delay((sum(opens, 5)*sum(returns, 5)),10)))", 10 | "9": "((0 < ts_min(delta(close, 1), 5)) ? delta(close, 1) : ((ts_max(delta(close, 1), 5) < 0) ? delta(close, 1) : (-1 * delta(close, 1))))", 11 | "10": "rank(((0 < ts_min(delta(close, 1), 4)) ? delta(close, 1) : ((ts_max(delta(close, 1), 4) < 0) ? delta(close, 1) : (-1 * delta(close, 1)))))", 12 | "11": "((rank(ts_max((vwap - close), 3)) + rank(ts_min((vwap - close), 3))) * rank(delta(volume, 3)))", 13 | "12": "(sign(delta(volume, 1)) * (-1 * delta(close, 1)))", 14 | "13": "(-1 * rank(covariance(rank(close), rank(volume), 5)))", 15 | "14": "((-1 * rank(delta(returns, 3))) * correlation(opens, volume, 10))", 16 | "15": "(-1 * sum(rank(correlation(rank(high), rank(volume), 3)), 3))", 17 | "16": "(-1 * rank(covariance(rank(high), rank(volume), 5)))", 18 | "17": "(((-1 * rank(ts_rank(close, 10))) * rank(delta(delta(close, 1), 1))) *rank(ts_rank((volume / adv20), 5)))", 19 | "18": "(-1 * rank(((stddev(abs((close - opens)), 5) + (close - opens)) + correlation(close, opens,10))))", 20 | "19": "((-1 * sign(((close - delay(close, 7)) + delta(close, 7)))) * (1 + rank((1 + sum(returns, 250)))))", 21 | "20": "(((-1 * rank((opens - delay(high, 1)))) * rank((opens - delay(close, 1)))) * rank((opens - delay(low, 1))))", 22 | "21": "((((sum(close, 8) / 8) + stddev(close, 8)) < (sum(close, 2) / 2)) ? (-1 * 1) : (((sum(close, 2) / 2) < ((sum(close, 8) / 8) - stddev(close, 8))) ? 1 : (((1 < (volume / adv20)) || ((volume/adv20) == 1)) ? 1 : (-1 * 1))))", 23 | "22": "(-1 * (delta(correlation(high, volume, 5), 5) * rank(stddev(close, 20))))", 24 | "23": "(((sum(high, 20) / 20) < high) ? (-1 * delta(high, 2)) : 0)", 25 | "24": "((((delta((sum(close, 100) / 100), 100) / delay(close, 100)) < 0.05) || ((delta((sum(close, 100) / 100), 100) / delay(close, 100)) == 0.05)) ? (-1 * (close - ts_min(close, 100))) : (-1 * delta(close, 3)))", 26 | "25": "rank(((((-1 * returns) * adv20) * vwap) * (high - close)))", 27 | "26": "(-1 * ts_max(correlation(ts_rank(volume, 5), ts_rank(high, 5), 5), 3))", 28 | "27": "((0.5 < rank((sum(correlation(rank(volume), rank(vwap), 6), 2) / 2.0))) ? 
(-1 * 1) : 1)", 29 | "28": "scale(((correlation(adv20, low, 5) + ((high + low) / 2)) - close))", 30 | "29": "(min(product(rank(rank(scale(log(sum(ts_min(rank(rank((-1 * rank(delta((close - 1),5))))), 2), 1))))), 1), 5) + ts_rank(delay((-1 * returns), 6), 5))", 31 | "30": "(((1.0 - rank(((sign((close - delay(close, 1))) + sign((delay(close, 1) - delay(close, 2)))) + sign((delay(close, 2) - delay(close, 3)))))) * sum(volume, 5)) / sum(volume, 20))", 32 | "31": "((rank(rank(rank(decay_linear((-1 * rank(rank(delta(close, 10)))), 10)))) + rank((-1 * delta(close, 3)))) + sign(scale(correlation(adv20, low, 12))))", 33 | "32": "(scale(((sum(close, 7) / 7) - close)) + (20 * scale(correlation(vwap, delay(close, 5), 230))))", 34 | "33": "rank((-1 * ((1 - (opens / close))^1)))", 35 | "34": "rank(((1 - rank((stddev(returns, 2) / stddev(returns, 5)))) + (1 - rank(delta(close, 1)))))", 36 | "35": "((ts_rank(volume, 32) * (1 - ts_rank(((close + high) - low), 16))) * (1 - ts_rank(returns, 32))) ", 37 | "36": "(((((2.21 * rank(correlation((close - opens), delay(volume, 1), 15))) + (0.7 * rank((opens - close)))) + (0.73 * rank(ts_rank(delay((-1 * returns), 6), 5)))) + rank(abs(correlation(vwap, adv20, 6)))) + (0.6 * rank((((sum(close, 200) / 200) - opens) * (close - opens))))) ", 38 | "37": "(rank(correlation(delay((opens - close), 1), close, 200)) + rank((opens - close))) ", 39 | "38": "((-1 * rank(ts_rank(close, 10))) * rank((close / opens)))", 40 | "39": "((-1 * rank((delta(close, 7) * (1 - rank(decay_linear((volume / adv20), 9)))))) * (1 + rank(sum(returns, 250))))", 41 | "40": "((-1 * rank(stddev(high, 10))) * correlation(high, volume, 10))", 42 | "41": "(((high * low)^0.5) - vwap) ", 43 | "42": "(rank((vwap - close)) / rank((vwap + close)))", 44 | "43": "(ts_rank((volume / adv20), 20) * ts_rank((-1 * delta(close, 7)), 8)) ", 45 | "44": "(-1 * correlation(high, rank(volume), 5)) ", 46 | "45": "(-1 * ((rank((sum(delay(close, 5), 20) / 20)) * correlation(close, volume, 2)) *rank(correlation(sum(close, 5), sum(close, 20), 2)))) ", 47 | "46": "((0.25 < (((delay(close, 20) - delay(close, 10)) / 10) - ((delay(close, 10) - close) / 10))) ? (-1 * 1) : (((((delay(close, 20) - delay(close, 10)) / 10) - ((delay(close, 10) - close) / 10)) < 0) ? 1 : ((-1 * 1) * (close - delay(close, 1))))) ", 48 | "47": "((((rank((1 / close)) * volume) / adv20) * ((high * rank((high - close))) / (sum(high, 5) / 5))) - rank((vwap - delay(vwap, 5)))) ", 49 | "48": "(indneutralize(((correlation(delta(close, 1), delta(delay(close, 1), 1), 250) * delta(close, 1)) / close), IndClass.subindustry) / sum(((delta(close, 1) / delay(close, 1))^2), 250)) ", 50 | "49": "(((((delay(close, 20) - delay(close, 10)) / 10) - ((delay(close, 10) - close) / 10)) < (-1 *0.1)) ? 1 : ((-1 * 1) * (close - delay(close, 1))))", 51 | "50": "(-1 * ts_max(rank(correlation(rank(volume), rank(vwap), 5)), 5)) ", 52 | "51": "(((((delay(close, 20) - delay(close, 10)) / 10) - ((delay(close, 10) - close) / 10)) < (-1 *0.05)) ? 
1 : ((-1 * 1) * (close - delay(close, 1)))) ", 53 | "52": "((((-1 * ts_min(low, 5)) + delay(ts_min(low, 5), 5)) * rank(((sum(returns, 240) - sum(returns, 20)) / 220))) * ts_rank(volume, 5)) ", 54 | "53": "(-1*delta((((close-low) - (high-close)) / (close-low)), 9))", 55 | "54": "((-1 * ((low - close) * (opens^5))) / ((low - high) * (close^5)))", 56 | "55": "(-1 * correlation(rank(((close - ts_min(low, 12)) / (ts_max(high, 12) - ts_min(low,12)))), rank(volume), 6))", 57 | "56": "(0 - (1 * (rank((sum(returns, 10) / sum(sum(returns, 2), 3))) * rank((returns * cap)))))", 58 | "57": "(0 - (1*((close - vwap)/decay_linear(rank(ts_argmax(close, 30)),2))))", 59 | "58": "(-1 * ts_rank(decay_linear(correlation(indneutralize(vwap, IndClass.sector), volume,3.92795), 7.89291), 5.50322)) ", 60 | "59": "(-1 * ts_rank(decay_linear(correlation(indneutralize(((vwap * 0.728317) + (vwap *(1 - 0.728317))), IndClass.industry), volume, 4.25197), 16.2289), 8.19648)) ", 61 | "60": "(0 - (1 * ((2 * scale(rank(((((close - low) - (high - close)) / (high - low)) * volume)))) -scale(rank(ts_argmax(close, 10)))))) ", 62 | "61": "(rank((vwap - ts_min(vwap, 16.1219))) < rank(correlation(vwap, adv180, 17.9282))) ", 63 | "62": "((rank(correlation(vwap, sum(adv20, 22.4101), 9.91009)) < rank(((rank(opens) + rank(opens)) < (rank(((high + low) / 2)) + rank(high))))) * -1) ", 64 | "63": "((rank(decay_linear(delta(indneutralize(close, IndClass.industry), 2.25164), 8.22237)) - rank(decay_linear(correlation(((vwap * 0.318108) + (opens * (1 - 0.318108))), sum(adv180,37.2467), 13.557), 12.2883))) * -1) ", 65 | "64": "((rank(correlation(sum(((opens * 0.178404) + (low * (1 - 0.178404))), 12.7054),sum(adv120, 12.7054), 16.6208)) < rank(delta(((((high + low) / 2) * 0.178404) + (vwap * (1 -0.178404))), 3.69741))) * -1) ", 66 | "65": "((rank(correlation(((opens * 0.00817205) + (vwap * (1 - 0.00817205))), sum(adv60,8.6911), 6.40374)) < rank((opens - ts_min(opens, 13.635)))) * -1) ", 67 | "66": "((rank(decay_linear(delta(vwap, 3.51013), 7.23052)) + ts_rank(decay_linear(((((low* 0.96633) + (low * (1 - 0.96633))) - vwap) / (opens - ((high + low) / 2))), 11.4157), 6.72611)) * -1) ", 68 | "67": "((rank((high - ts_min(high, 2.14593)))^rank(correlation(indneutralize(vwap,IndClass.sector), indneutralize(adv20, IndClass.subindustry), 6.02936))) * -1) ", 69 | "68": "((ts_rank(correlation(rank(high), rank(adv15), 8.91644), 13.9333) < rank(delta(((close * 0.518371) + (low * (1 - 0.518371))), 1.06157))) * -1) ", 70 | "69": "((rank(ts_max(delta(indneutralize(vwap, IndClass.industry), 2.72412),4.79344))^ts_rank(correlation(((close * 0.490655) + (vwap * (1 - 0.490655))), adv20, 4.92416),9.0615)) * -1) ", 71 | "70": "((rank(delta(vwap, 1.29456))^ts_rank(correlation(indneutralize(close,IndClass.industry), adv50, 17.8256), 17.9171)) * -1) ", 72 | "71": "max(ts_rank(decay_linear(correlation(ts_rank(close, 3.43976), ts_rank(adv180,12.0647), 18.0175), 4.20501), 15.6948), ts_rank(decay_linear((rank(((low + opens) - (vwap + vwap)))^2), 16.4662), 4.4388))", 73 | "72": "(rank(decay_linear(correlation(((high + low) / 2), adv40, 8.93345), 10.1519)) / rank(decay_linear(correlation(ts_rank(vwap, 3.72469), ts_rank(volume, 18.5188), 6.86671),2.95011))) ", 74 | "73": "(max(rank(decay_linear(delta(vwap, 4.72775), 2.91864)), ts_rank(decay_linear(((delta(((opens * 0.147155) + (low * (1 - 0.147155))), 2.03608) / ((opens * 0.147155) + (low * (1 - 0.147155)))) * -1), 3.33829), 16.7411)) * -1) ", 75 | "74": "((rank(correlation(close, sum(adv30, 37.4843), 15.1365)) 2 | 3 | 
[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://opensource.org/licenses/Apache-2.0) 4 | [![Python](https://img.shields.io/badge/Python-3.5|3.6-blue.svg)](https://opensource.org/licenses/Apache-2.0) 5 | [![Build Status](https://travis-ci.org/marketneutral/alphatools.svg?branch=master)](https://travis-ci.org/marketneutral/alphatools) 6 | 7 | This package aims to provide environments within which best-in-class open source tools across **both** financial research (e.g., `zipline`, `alphalens`, and `pyfolio`) and machine learning (e.g., `scikit-learn`, `LightGBM`, `PyMC3`, `pytorch`, and `fastai`) operate together. The "stable" environment is on Python 3.5 and does not include `fastai`. The "latest" environment is on Python 3.6 and relies on the backwards compatibility PEP for packages which state only 3.5 support (e.g., `zipline`). The latest environment includes the pre-release of PyTorch 1.0 and fastai 1.0.x. The PyTorch version in both environments is currently "CPU" only (i.e., no GPU/CUDA for now). For now, the "tests" only verify that the environments build without conflict. 8 | 9 | Additionally, this package provides functions to make the equity alpha factor research process more accessible and productive. Convenience functions sit on top of [zipline](https://github.com/quantopian/zipline) and, specifically, the [`Pipeline`](https://www.quantopian.com/help#pipeline-api) cross-sectional classes and functions in that package. `alphatools` allows you to 10 | 11 | - `run_pipeline` in a Jupyter notebook (or from any arbitrary Python code) **in your local environment**, 12 | - create `Pipeline` factors **at runtime** on **arbitrary data sources** (just expose the endpoint for data sitting somewhere, specify the schema, and...it's available for use in `Pipeline`!), 13 | - parse and compile **"expression" style alphas** as described in the paper ["101 Formulaic Alphas"](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2701346) into `Pipeline` factors, and 14 | - work with and plot ingested **pricing data from an arbitrary bundle** with a `get_pricing(...)` function call. 15 | 16 | For example, with `alphatools`, you can, within a Jupyter notebook, 17 | 18 | ```python 19 | from alphatools.research import run_pipeline 20 | from alphatools.ics import Sector 21 | from alphatools.data import Factory 22 | from alphatools.expression import ExpressionAlpha 23 | from zipline.pipeline.data import USEquityPricing as USEP 24 | from zipline.pipeline.factors import Returns, AverageDollarVolume 25 | from zipline.pipeline import Pipeline 26 | 27 | universe = AverageDollarVolume(window_length=120).top(500) 28 | 29 | my_factor = ( 30 |     -Returns(mask=universe, window_length=5). 31 |     demean(groupby=Sector()). 32 |     rank() 33 | ) 34 | 35 | expr_factor = ( 36 |     ExpressionAlpha( 37 |         'rank(indneutralize(-log(close/delay(close, 4)), IndClass.sector))' 38 |     ).make_pipeline_factor().pipeline_factor(mask=universe) 39 | ) 40 | 41 | p = Pipeline(screen=universe) 42 | 43 | p.add(my_factor, '5d_MR_Sector_Neutral_Rank') 44 | p.add(expr_factor, '5d_MR_Expression_Alpha') 45 | 46 | p.add(Factory['my_special_data'].value.latest.zscore(), 'Special_Factor') 47 | 48 | start_date = '2017-01-04' 49 | end_date = '2017-12-28' 50 | 51 | df = run_pipeline(p, start_date, end_date) 52 | ``` 53 | 
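Once `run_pipeline` returns a DataFrame of factor values, the plotting helpers defined in `alphatools/research/research.py` can summarize each factor column via `alphalens`. A minimal sketch, assuming `make_factor_plot` and `make_quantile_plot` are importable from `alphatools.research` the same way `run_pipeline` is:

```python
from alphatools.research import make_factor_plot, make_quantile_plot

# Cumulative long/short returns for each factor column in df
ls_returns = make_factor_plot(df, start_date, end_date)

# Mean daily return per factor quantile, in basis points per day
quantile_returns = make_quantile_plot(df, start_date, end_date)
```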
54 | ## Bring Your Own Data 55 | 56 | To "Bring Your Own Data", you simply point the Factory object to an endpoint and specify the schema. This is done by adding an entry to the `json` file `data_sources.json`. For example, if you have a `csv` file on disk, `data.csv`, and a PostgreSQL table somewhere else, you would create `data_sources.json` as 57 | 58 | ```json 59 | { 60 |     "my_special_data": { 61 |         "url": "/full/path/to/data/data.csv", 62 |         "schema": "var * {asof_date: datetime, sid: int64, value: float64}" 63 |     }, 64 |     "my_database_data": { 65 |         "url": "postgresql://$USER:$PASS@hostname::my-table-name", 66 |         "schema": "var * {asof_date: datetime, sid: int64, price_to_book: float64}" 67 |     } 68 | } 69 | ``` 70 | 71 | In the case of the example PostgreSQL `url`, note that the text `$USER` will be substituted with the text in the environment variable `USER` and the text `$PASS` will be substituted with the text in the environment variable `PASS`. In general, any text token in the `url` which is preceded by `$` will be substituted by the text in the environment variable of that name, so you do not need to expose actual credentials in this file (a minimal sketch of this substitution appears at the end of this section). 72 | 73 | The `schema` is specified as a `dshape` from the package `datashape` (see that package's documentation). The magic happens via the `blaze/datashape/odo` stack. You can specify the `url` to a huge variety of source formats including `json`, `csv`, PostgreSQL tables, MongoDB collections, `bcolz`, Microsoft Excel(!?), `.gz` compressed files, collections of files (e.g., `myfiles_*.csv`), and remote locations like Amazon S3 and a Hadoop Distributed File System. To me, the [`odo`](https://en.wikipedia.org/wiki/Odo_(Star_Trek)) [documentation on URI strings](http://odo.pydata.org/en/latest/uri.html) is the clearest explanation of this. 74 | 75 | Note that this data must be mapped to the `sid` as mapped by `zipline ingest`. Also, the per-row dates must be in a column titled `asof_date`. You can then access this data like 76 | 77 | ```python 78 | from alphatools.data import Factory 79 | : 80 | : 81 | : 82 | 83 | my_factor = Factory['my_database_data'].price_to_book.latest.rank() 84 | p.add(my_factor, 'Price_to_Book_Rank') 85 | ``` 86 | 87 | This functionality should allow you to use new data in research very quickly with an absolute minimum of data engineering and/or munging. For example, commercial risk model providers often deliver a single file per day of factor loadings (e.g., `data_yyyymmdd_fac.csv`). After `sid` mapping and converting the date column name to `asof_date`, this data can be immediately available in `Pipeline` by putting a `url` in `data_sources.json` like `"url": "/path/to/dir/data_*_fac.csv"`, and `schema` like `"var * {asof_date: datetime, sid: int64, MKT_BETA: float64, VALUE: float64, MOMENTUM: float64, ST_REVERSAL: float64 ..."`. 88 | 
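The credential substitution described above amounts to expanding `$TOKEN` markers in the `url` from the environment. The sketch below illustrates that behavior; it is not the actual `alphatools` implementation:

```python
import os
import re

def expand_env_tokens(url):
    """Replace each $TOKEN in a data-source URL with os.environ['TOKEN'].

    Unknown tokens are left as-is. Illustrative only; alphatools performs
    an equivalent substitution when it reads data_sources.json.
    """
    return re.sub(
        r'\$([A-Za-z_][A-Za-z0-9_]*)',
        lambda m: os.environ.get(m.group(1), m.group(0)),
        url,
    )

# With USER=alice and PASS=s3cret in the environment:
# expand_env_tokens('postgresql://$USER:$PASS@hostname::my-table-name')
# -> 'postgresql://alice:s3cret@hostname::my-table-name'
```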
88 | 89 | ## Expression Alphas 90 | 91 | The ability to parse "expression" alphas is meant to help speed the research process and/or allow financial professionals with minimal Python experience to test alpha ideas. See ["101 Formulaic Alphas"](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2701346) for details on this DSL. The (EBNF) grammar is fully specified [here](https://github.com/marketneutral/alphatools/blob/master/alphatools/expression/expression.lark). We use the `Lark` Python [parsing library](https://github.com/lark-parser/lark) (great name, no relation). Currently, the data for `open`, `high`, `low`, `close`, and `volume` are accessible; the following calculations and operators are implemented 92 | 93 | * `vwap`: the daily vwap (as a default, this is approximated with `(close + (opens + high + low)/3)/2`). 94 | * `returns`: daily close-to-close returns. 95 | * `+`, `-`, `*`, `/`, `^`: as expected, though only for two terms at a time (i.e., only `term op term`); `^` is exponentiation, not bitwise XOR. 96 | * `-x`: unary minus on x (i.e., negation). 97 | * `abs(x)`, `log(x)`, `sign(x)`: elementwise standard math operations. 98 | * `>`, `<`, `==`, `||`: elementwise comparison operations returning 1 or 0. 99 | * `x ? y : z`: C-style ternary operator; `if x: y; else: z`. 100 | * `rank(x)`: scaled ranks, per day, across all assets (i.e., the cross-sectional rank); ranks are ascending, such that the rank of the maximum raw value in the vector is 1.0 and the smallest rank is 1/N. The re-scaling of the ranks to the interval [1/N, 1] is implied by Alpha #1, in which 0.50 is subtracted from the final ranked value. The ordinal method is used to match the `Pipeline` method `.rank()`. 101 | * `delay(x, days)`: *x* lagged by *days*. Note that the *days* parameter in `delay` and `delta` differs from the `window_length` parameter you may be familiar with in `Pipeline`. The `window_length` refers to the number of data points in the (row axis of the) data matrix, *not* the number of days of lag. For example, in `Pipeline`, if you want daily returns, you specify a `window_length` of `2`, since you need 2 data points--today and the day prior--to get a daily return. In an expression alpha, *days* is the lag *from today*. Concretely, the `Pipeline` factor `Returns(window_length=2)` is precisely equal to the expression alpha `delta(close, 1)/delay(close, 1)`. 102 | * `correlation(x, y, days)`: the Pearson correlation of the values for assets in *x* to the corresponding values for the same assets in *y* over *days*; note this is very slow in the current implementation. 103 | * `covariance(x, y, days)`: the covariance of the values for assets in *x* to the corresponding values for the same assets in *y* over *days*; note this is also very slow currently. 104 | * `delta(x, days)`: the difference between *x* today and *x* *days* ago (i.e., `x - delay(x, days)`). 105 | * `signedpower(x, a)`: elementwise `sign(x)*(abs(x)^a)`. 106 | * `decay_linear(x, days)`: weighted sum of *x* over the past *days* with linearly decaying weights (the weights sum to 1; the largest weight is on the most recent day); see the short weight sketch just after this list. 107 | * `indneutralize(x, g)`: `x`, cross-sectionally "neutralized" (i.e., demeaned) against the group membership classifier `g`. `g` must be in the set {`IndClass.sector`, `IndClass.industry`, `IndClass.subindustry`}. The set `g` maps to the `Pipeline` classifiers `Sector()` and `SubIndustry()` in `alphatools.ics`. Concretely, the `Pipeline` factor `Returns().demean(groupby=Sector())` is equivalent (save a corner case in NaN treatment) to the expression `indneutralize(returns, IndClass.sector)`. If you do not specifically pass a token for `g`, the default of `IndClass.industry` is applied. 108 | * `ts_max(x, days)`: the per asset time series max on *x* over the trailing *days* (also `ts_min(...)`). 109 | * `max(a, b)`: The paper says that `max` is an alias for `ts_max(a, b)`; I think this is an error. Alphas 71, 73, 76, 87, and 96 do not parse with `max` as an alias for `ts_max`. Rather, I believe that `max` means the elementwise maximum of two arrays (i.e., like `pmax(...)` in R and `np.maximum(...)` in NumPy) and have implemented it as such; the same goes for `min(a, b)`. 110 | * `ts_argmax(x, days)`: on which day `ts_max(x, days)` occurred (also `ts_argmin(...)`), scaled to the interval [1/days, 1]. For example, if the window (*days*) is 10 days and the max is in the most recent day, it will return 1.0; if the max is in the earliest day, it will return 0.10. 111 | * `ts_rank(x, days)`: the time series rank per asset on *x* over the trailing *days*. Currently this is in the range [0,1], but it should be [1/days, 1]. 112 | * `sum(x, days)`: the sum per asset on *x* over the trailing *days*. 113 | * `product(x, days)`: the product per asset on *x* over the trailing *days*. 114 | * `stddev(x, days)`: the standard deviation per asset on *x* over the trailing *days*. 115 | * `adv{days}`: the average daily **dollar** volume per asset over the trailing *days* (e.g., `adv20` gives the 20-day trailing average daily dollar volume).
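To make the `decay_linear` weighting concrete, here is a small sketch of how the linearly decaying weights can be constructed (this mirrors the code the parser generates; the most recent observation is the last element of the window):

```python
import numpy as np

days = 5
weights = (np.arange(days) + 1.0) / np.sum(np.arange(days) + 1.0)
# array([0.0667, 0.1333, 0.2, 0.2667, 0.3333]) -- sums to 1.0, with the
# largest weight on the most recent (last) observation in the window
```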
116 | 117 | The expression alpha parser produces `zipline`-compatible `Pipeline` factor code. This implementation makes use of the `bottleneck` package, which provides many `numpy`-style rolling aggregations implemented in highly optimized compiled C code. The `bottleneck` package is distributed in binary form in the Anaconda Python distribution (see Installation below). 118 | 119 | For example, the expression alpha "#9" from the paper 120 | 121 | ``` 122 | ((0 < ts_min(delta(close, 1), 5)) ? delta(close, 1) : ((ts_max(delta(close, 1), 5) < 0) ? delta(close, 1) : (-1 * delta(close, 1)))) 123 | ``` 124 | 125 | is compiled into a usable `Pipeline` factor, `e`, as 126 | 127 | ```python 128 | e = ( 129 |     ExpressionAlpha('((0 < ts_min(delta(close, 1), 5)) ? delta(close, 1) : ((ts_max(delta(close, 1), 5) < 0) ? delta(close, 1) : (-1 * delta(close, 1))))'). 130 |     make_pipeline_factor(). 131 |     pipeline_factor(mask=universe) 132 | ) 133 | ``` 134 | 135 | 136 | The abstract syntax tree ("AST") can be visualized with `from lark.tree import pydot__tree_to_png; pydot__tree_to_png(e.tree, "alpha9.png")`: 137 | 138 | ![The AST for expression alpha #9](notebooks/alpha9.png) 139 | 140 | This is quite helpful, in my opinion, for understanding a third-party alpha like this. So what's happening? Looking top to bottom at each level, left to right: if zero is less than the minimum of the daily price change over the trailing five days (i.e., if the stock has gone **up** *every day* for the last five days), then the factor value is simply the price change over the *most recent* day, which is a positive number by definition; the factor thus bets that positive momentum will continue. That branch should be pretty rare (meaning it would be rare for a stock to go up every day for five days in a row). Otherwise, we check if the max price change in the last 5 days is less than zero (i.e., the stock has gone **down** *every day* for the last 5 days); if so, the factor value again is just the price change over the *most recent* day, which is a negative number by definition. Thus, if the stock has gone straight down for 5 days, the factor bets that it will continue down. This should also be rare. Lastly, if neither of these two states exists, the factor value is just -1 times the last day's price change, i.e., a bet on mean reversion. Hence, by inspecting the parse tree like this, we can understand that this alpha is a momentum/mean-reversion switching factor: it assumes momentum will persist if the prior five days have moved in the same direction; otherwise it assumes mean-reversion will occur.
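For a single asset, the switching rule just described can be sketched in plain Python (illustrative only; `d` stands in for the trailing five daily price changes, newest last):

```python
import numpy as np

d = np.array([0.5, -0.2, 0.1, 0.3, -0.4])  # five trailing daily price changes

if d.min() > 0:      # up every day for five days: bet that momentum continues
    alpha = d[-1]    # positive by definition
elif d.max() < 0:    # down every day for five days: bet that momentum continues
    alpha = d[-1]    # negative by definition
else:                # mixed days: bet on mean reversion
    alpha = -d[-1]
```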
141 | 142 | You can see the resulting `Pipeline` code (though this is not necessary to use the alpha in `run_pipeline`) with `print(e.pipeline_code)`: 143 | 144 | ```python 145 | class ExprAlpha_1(CustomFactor): 146 |     inputs = [USEP.close] 147 |     window_length = 17 148 | 149 |     def compute(self, today, assets, out, close): 150 |         v0 = close - np.roll(close, 1, axis=0) 151 |         v1 = bn.move_min(v0, window=5, min_count=1, axis=0) 152 |         v2 = np.less(0, v1) 153 |         v3 = close - np.roll(close, 1, axis=0) 154 |         v4 = close - np.roll(close, 1, axis=0) 155 |         v5 = bn.move_max(v4, window=5, min_count=1, axis=0) 156 |         v6 = np.less(v5, 0) 157 |         v7 = close - np.roll(close, 1, axis=0) 158 |         v8 = close - np.roll(close, 1, axis=0) 159 |         v9 = 1*v8 160 |         v10 = -v9 161 |         v11 = np.where(v6, v7, v10) 162 |         v12 = np.where(v2, v3, v11) 163 |         out[:] = v12[-1] 164 | ``` 165 | 166 | There is no compile-time optimization of the AST at all! What happens is that the compiler walks down the AST and converts each node into an equivalent Python (`numpy`, `bottleneck`, and/or `pandas`) expression, keeping track of the call stack so that future references to prior calculations are correct. The resulting Python code is in the style of "three-address code". There is, of course, plenty of optimization which could be done. 167 | 168 | Note that there is no reference implementation of the expression-style alpha syntax to test against, and that many specific details are lacking in the paper. As such, this implementation makes some assumptions where necessary (as a simple example, the paper does not specify whether `rank` is ascending or descending; however, it obviously should be ascending, as a larger raw value should produce a larger numerical rank to keep the alpha vector *directly* proportional). This is experimental and I have created only a handful of tests. 169 | 170 | ### Using Your Own Data in Expression Alphas 171 | 172 | It is also possible to use the "bring your own data" functionality provided by the `Factory` object in an expression alpha. This is done with one or more `factory` expressions. The syntax is 173 | 174 | * `factory("<name>")`: where `"<name>"` is the name you would pass into the `Factory` object (for now, the data is assumed to be in a column called "value"). Concretely, if you have a dataset, "sample", defined in the `data_sources.json` file, you can access it in an expression as: 175 | 176 | ``` 177 | (returns > 0) ? factory("sample") : -sum(returns, 5) 178 | ``` 179 | 180 | This compiles to the `Pipeline` factor: 181 | 182 | ```python 183 | class ExprAlpha_1(CustomFactor): 184 |     inputs = [Returns(window_length=2), Factory["sample"].value] 185 |     window_length = 7 186 | 187 |     def compute(self, today, assets, out, returns, factory0): 188 |         v0 = np.greater(returns, 0) 189 |         v1 = pd.DataFrame(data=returns).rolling( 190 |             window=5, center=False, min_periods=1).sum().values 191 |         v2 = -v1 192 |         v3 = np.where(v0, factory0, v2) 193 |         out[:] = v3[-1] 194 | ``` 195 | 196 | 197 | ## Installation 198 | 199 | Run the following, in order: 200 | 201 | ``` 202 | git clone https://github.com/marketneutral/alphatools 203 | cd alphatools 204 | ./install_stable.sh 205 | zipline ingest 206 | ``` 207 | 208 | Note that when you run `zipline ingest`, the security master is built from scratch and each `sid` is assigned at that time. You must re-map the `Sector` and `Industry` classifiers in this package **and all your own data** after every `zipline ingest`. You can map the `Sector` and `Industry` classifiers with 209 | 210 | ``` 211 | alphatools ingest 212 | ``` 213 |
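For your own data, the re-mapping step amounts to looking up the newly assigned `sid` for each ticker. A minimal sketch using the bundle's asset finder (the same calls appear in the included notebooks; the ticker list here is hypothetical):

```python
from zipline.data import bundles

bundle_data = bundles.load('quandl')
assets = bundle_data.asset_finder.lookup_symbols(['AAPL', 'MSFT'], as_of_date=None)
ticker_to_sid = {asset.symbol: asset.sid for asset in assets}
# use ticker_to_sid to rewrite the sid column of your own data files
```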
214 | 215 | ## A Word on Sector and Industry Classifiers Included 216 | 217 | Sector and industry data were scraped from Yahoo Finance on September 18, 2017 for the full Quandl WIKI universe at that time. The SIC and CIK codes were scraped from [Rank and Filed](http://rankandfiled.com/) on September 15, 2017. The classifiers built from this data assume that the codes have never changed and never will change (i.e., there is no concept of an asset being reclassified over time). **Be aware that there is lookahead bias in this.** A good example of why: Corning, Inc. is classified as a Technology/Electronic Components company in this dataset, but from 1851 until the 2000s(?) it was actually classified as a boring industrial glass company; the economic make-up of the company changed sometime in the early 1990s, when optical fiber production became an important revenue driver, and later with iPhone glass. At some point the ICS providers changed the classification from "boring" to "high tech", but this surely lagged the actual transformation of the company; hence...lookahead bias. 218 | 219 | ## A Word on Fundamental Data 220 | 221 | Although there is a `Fundamentals` factor included, there is no fundamental data included in the package. This factor was built on top of the `DataFrameLoader` to get a `pandas.DataFrame` into a factor. I think I will deprecate this in favor of using the `Factory` object as described above. In the meantime, the `Fundamentals` pipeline factors can be built from `make_fundamentals.py` with your own data. Note that these factors use the `DataFrameLoader`, which means the data must fit in memory. 222 | 223 | ## Disclaimer 224 | 225 | Though this is in the `LICENSE` file, it bears noting that this software is provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 226 | 227 | Additionally, nothing in this package constitutes investment advice. This package is a personal project and nothing in its functionality or examples is reflective of any past or current employer. 228 | 229 | Lastly, there are no automated tests (or any significant tests, for that matter), no automated nightly build, no docstrings, nor any of the other features associated with what you might consider a well-supported open source package. 230 | 231 | ## Contributing 232 | 233 | I hope you enjoy this package. Please leave feedback, or better, contribute. If you are planning to make a PR, please get in touch with me before you do any work, as I have a project plan.
I am figuring this out as I go and could use help, especially with (in order) 234 | 235 | - Incorporating `six` so that the package works with Python 3.x and Python 2.7 236 | - Creating tests and using Travis CI on this repo 237 | - Python packaging 238 | - Dockerizing this thing so we can avoid the painful install process 239 | -------------------------------------------------------------------------------- /alphatools/expression/expression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import autopep8 3 | import itertools 4 | from lark import Lark, Transformer 5 | from os import path 6 | from scipy.stats import rankdata 7 | from six import iteritems, next 8 | 9 | 10 | class MyTransformer(Transformer): 11 | 12 |     def __init__(self): 13 |         self.cmdlist = [] 14 |         self.window = 2 15 |         self.vcounter = itertools.count() 16 |         self.stack = [] 17 | 18 |         self.imports = set() 19 | 20 |         self.factory_counter = itertools.count() 21 |         self.factories = dict() 22 | 23 |         self.inputs = dict() 24 | 25 | 26 |     def factory(self, items): 27 |         self.imports.add('from alphatools.data import Factory') 28 |         this_factory = next(self.factory_counter) 29 |         self.stack.append('factory' + str(this_factory)) 30 |         self.factories[this_factory] = items[0] 31 |         self.inputs['factory'+str(this_factory)] = 'Factory['+items[0]+'].value' 32 | 33 |     def neg(self, items): 34 |         term1 = self.stack.pop() 35 |         thisv = next(self.vcounter) 36 |         self.stack.append('v' + str(thisv)) 37 |         self.cmdlist.append( 38 |             'v' + str(thisv) + ' = -' + term1 39 |         ) 40 | 41 |     def rank(self, items): 42 |         self.imports.add("from scipy.stats import rankdata") 43 |         term1 = self.stack.pop() 44 |         v1 = next(self.vcounter) 45 |         self.cmdlist.append( 46 |             'v' + str(v1) + ' = np.apply_along_axis(rankdata, 1, ' + term1 +', method="ordinal")' 47 |         ) 48 |         v2 = next(self.vcounter) 49 |         self.stack.append('v' + str(v2)) 50 |         self.cmdlist.append( 51 |             'v' + str(v2) + ' = np.divide(v'+str(v1)+'.astype(float), np.sum(~np.isnan(v'+str(v1)+'), axis=1).reshape(v'+str(v1)+'.shape[0], 1))' 52 |         ) 53 | 54 | 55 |     # def close(self, items): 56 |     #     thisv = self.vcounter.next() 57 |     #     self.stack.append('v' + str(thisv)) 58 |     #     self.cmdlist.append( 59 |     #         'v' + str(thisv) + ' = close' 60 |     #     ) 61 | 62 |     def cap(self, items): 63 |         thisv = next(self.vcounter) 64 |         self.stack.append('v' + str(thisv)) 65 |         self.cmdlist.append( 66 |             'v' + str(thisv) + ' = 1.0' 67 |         ) 68 | 69 |     def number(self, items): 70 |         #import pdb; pdb.set_trace() 71 |         self.stack.append(str(items[0].value)) 72 |         pass 73 | 74 |     def close(self, items): 75 |         self.imports.add('from zipline.pipeline.data import USEquityPricing as USEP') 76 |         self.inputs['close'] = 'USEP.close' 77 |         self.stack.append('close') 78 | 79 |     def high(self, items): 80 |         self.imports.add('from zipline.pipeline.data import USEquityPricing as USEP') 81 |         self.inputs['high'] = 'USEP.high' 82 |         self.stack.append('high') 83 | 84 |     def low(self, items): 85 |         self.imports.add('from zipline.pipeline.data import USEquityPricing as USEP') 86 |         self.inputs['low'] = 'USEP.low' 87 |         self.stack.append('low') 88 | 89 |     def volume(self, items): 90 |         self.imports.add('from zipline.pipeline.data import USEquityPricing as USEP') 91 |         self.inputs['volume'] = 'USEP.volume' 92 |         self.stack.append('volume') 93 |
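    # Illustrative walk-through (comments only; nothing here is executed):
    # for the expression '-close', the `close` handler above pushes the input
    # name 'close' onto the stack; `neg` then pops it, emits the line
    # 'v0 = -close' into cmdlist, and pushes 'v0' so that an enclosing
    # expression can refer to the intermediate result.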
94 |     def vwap(self, items): 95 |         self.imports.add('from zipline.pipeline.data import USEquityPricing as USEP') 96 |         self.inputs['close'] = 'USEP.close' 97 |         self.inputs['opens'] = 'USEP.open' 98 |         self.inputs['high'] = 'USEP.high' 99 |         self.inputs['low'] = 'USEP.low' 100 | 101 |         thisv = next(self.vcounter) 102 |         self.stack.append('v' + str(thisv)) 103 |         self.cmdlist.append( 104 |             'v' + str(thisv) + ' = (close + (opens + high + low)/3)/2' 105 |         ) 106 | 107 |     def adv(self, items): 108 |         self.imports.add('from zipline.pipeline.data import USEquityPricing as USEP') 109 |         self.inputs['close'] = 'USEP.close' 110 |         self.inputs['volume'] = 'USEP.volume' 111 |         thisv = next(self.vcounter) 112 |         self.stack.append('v' + str(thisv)) 113 |         self.window = max([self.window, int(items[0])+2]) 114 |         self.cmdlist.append( 115 |             'v' + str(thisv) + ' = bn.move_mean(np.multiply(close, volume), window=' + items[0] + ', min_count=1, axis=0)' 116 |         ) 117 |     # def opens(self, items): 118 |     #     thisv = self.vcounter.next() 119 |     #     self.stack.append('v' + str(thisv)) 120 |     #     self.cmdlist.append( 121 |     #         'v' + str(thisv) + ' = opens' 122 |     #     ) 123 | 124 |     def opens(self, items): 125 |         self.imports.add('from zipline.pipeline.data import USEquityPricing as USEP') 126 |         self.inputs['opens'] = 'USEP.open' 127 |         self.stack.append('opens') 128 | 129 |     def div(self, items): 130 |         term2 = self.stack.pop() 131 |         term1 = self.stack.pop() 132 |         thisv = next(self.vcounter) 133 |         self.stack.append('v' + str(thisv)) 134 |         self.cmdlist.append( 135 |             'v' + str(thisv) + ' = ' + term1 + ' / ' + term2 136 |         ) 137 | 138 |     def min(self, items): 139 |         # TODO: check that this is parallel min 140 |         term2 = self.stack.pop() 141 |         term1 = self.stack.pop() 142 |         thisv = next(self.vcounter) 143 |         self.stack.append('v' + str(thisv)) 144 |         self.cmdlist.append( 145 |             'v' + str(thisv) + ' = np.minimum('+term1 + ', ' + term2+')' 146 |         ) 147 | 148 |     def max(self, items): 149 |         # TODO: check that this is parallel max 150 |         # paper says this is == ts_max, but that doesn't parse for alpha 71 151 |         term2 = self.stack.pop() 152 |         term1 = self.stack.pop() 153 |         thisv = next(self.vcounter) 154 |         self.stack.append('v' + str(thisv)) 155 |         self.cmdlist.append( 156 |             'v' + str(thisv) + ' = np.maximum('+term1 + ', ' + term2+')' 157 |         ) 158 | 159 |     def powerof(self, items): 160 |         """ Element-wise power """ 161 | 162 |         term2 = self.stack.pop() 163 |         term1 = self.stack.pop() 164 |         thisv = next(self.vcounter) 165 |         self.stack.append('v' + str(thisv)) 166 |         self.cmdlist.append( 167 |             'v' + str(thisv) + ' = np.power(' + term1 + ', ' + term2 + ')' 168 |         ) 169 | 170 |     def signedpower(self, items): 171 |         """ np.sign(term1)*np.power(np.abs(term1), term2) """ 172 | 173 |         term2 = self.stack.pop() 174 |         term1 = self.stack.pop() 175 |         thisv = next(self.vcounter) 176 |         self.stack.append('v' + str(thisv)) 177 |         self.cmdlist.append( 178 |             'v' + str(thisv) + ' = np.sign('+term1+')*np.power(np.abs(' + term1 + '), ' + term2 + ')' 179 |         ) 180 | 181 | 182 |     def minus(self, items): 183 |         term2 = self.stack.pop() 184 |         term1 = self.stack.pop() 185 |         thisv = next(self.vcounter) 186 |         self.stack.append('v' + str(thisv)) 187 |         self.cmdlist.append( 188 |             'v' + str(thisv) + ' = ' + term1 + ' - ' + term2 189 |         ) 190 | 191 |     def plus(self, items): 192 |         term2 = self.stack.pop() 193 |         term1 = self.stack.pop() 194 |         thisv = next(self.vcounter) 195 |         self.stack.append('v' + str(thisv)) 196 |         self.cmdlist.append( 197 |             'v' + str(thisv) + ' = ' + term1 + ' + ' + term2 198 |         ) 199 |
200 |     def mult(self, items): 201 |         term2 = self.stack.pop() 202 |         term1 = self.stack.pop() 203 |         thisv = next(self.vcounter) 204 |         self.stack.append('v' + str(thisv)) 205 |         self.cmdlist.append( 206 |             'v' + str(thisv) + ' = ' + term1 + '*' + term2 207 |         ) 208 | 209 |     def log(self, items): 210 |         term1 = self.stack.pop() 211 |         thisv = next(self.vcounter) 212 |         self.stack.append('v' + str(thisv)) 213 |         self.cmdlist.append( 214 |             'v' + str(thisv) + ' = np.log(' + term1 + ')' 215 |         ) 216 | 217 |     def abs(self, items): 218 |         term1 = self.stack.pop() 219 |         thisv = next(self.vcounter) 220 |         self.stack.append('v' + str(thisv)) 221 |         self.cmdlist.append( 222 |             'v' + str(thisv) + ' = np.abs(' + term1 + ')' 223 |         ) 224 | 225 |     def sign(self, items): 226 |         term1 = self.stack.pop() 227 |         thisv = next(self.vcounter) 228 |         self.stack.append('v' + str(thisv)) 229 |         self.cmdlist.append( 230 |             'v' + str(thisv) + ' = np.sign(' + term1 + ')' 231 |         ) 232 | 233 |     def scale(self, items): 234 |         # TODO: 101 paper says scaled sum(abs)==a; silent on mean 235 |         term1 = self.stack.pop() 236 |         thisv = next(self.vcounter) 237 |         self.stack.append('v' + str(thisv)) 238 |         self.cmdlist.append( 239 |             'v' + str(thisv) + ' = np.apply_along_axis(lambda x: (x - np.nanmean(x))/np.nansum(np.abs(x - np.nanmean(x))), 1, ' + term1 +')' 240 |         ) 241 | 251 |     def greaterthan(self, items): 252 |         term2 = self.stack.pop() 253 |         term1 = self.stack.pop() 254 |         thisv = next(self.vcounter) 255 |         self.stack.append('v' + str(thisv)) 256 |         self.cmdlist.append( 257 |             'v' + str(thisv) + ' = np.greater(' + term1 + ', ' + term2 + ')' 258 |         ) 259 | 260 |     def lessthan(self, items): 261 |         term2 = self.stack.pop() 262 |         term1 = self.stack.pop() 263 |         thisv = next(self.vcounter) 264 |         self.stack.append('v' + str(thisv)) 265 |         self.cmdlist.append( 266 |             'v' + str(thisv) + ' = np.less(' + term1 + ', ' + term2 + ')' 267 |         )
268 | 269 |     def equals(self, items): 270 |         # TODO: do we want np.isclose or np.allclose? 271 |         term2 = self.stack.pop() 272 |         term1 = self.stack.pop() 273 |         thisv = next(self.vcounter) 274 |         self.stack.append('v' + str(thisv)) 275 |         self.cmdlist.append( 276 |             'v' + str(thisv) + ' = np.isclose(' + term1 + ', ' + term2 + ')' 277 |         ) 278 | 279 |     def logicalor(self, items): 280 |         term2 = self.stack.pop() 281 |         term1 = self.stack.pop() 282 |         thisv = next(self.vcounter) 283 |         self.stack.append('v' + str(thisv)) 284 |         self.cmdlist.append( 285 |             'v' + str(thisv) + ' = np.logical_or(' + term1 + ', ' + term2 + ')' 286 |         ) 287 | 288 |     def ternary(self, items): 289 |         term3 = self.stack.pop() 290 |         term2 = self.stack.pop() 291 |         term1 = self.stack.pop() 292 |         thisv = next(self.vcounter) 293 |         self.stack.append('v' + str(thisv)) 294 |         self.cmdlist.append( 295 |             'v' + str(thisv) + ' = np.where(' + term1 + ', ' + term2 + ', ' + term3 + ')' 296 |         ) 297 | 298 |     def returns(self, items): 299 |         self.imports.add("from zipline.pipeline.factors import Returns") 300 |         self.inputs['returns'] = 'Returns(window_length=2)' 301 |         self.stack.append('returns') 302 |         #thisv = self.vcounter.next() 303 |         #self.window = self.window+1 304 |         #self.stack.append('v' + str(thisv)) 305 |         #self.cmdlist.append( 306 |         #    'v' + str(thisv) + ' = np.log(close/np.roll(close, 1, axis=0))' 307 |         #) 308 | 309 | 310 |     def delta(self, items): 311 |         term1 = self.stack.pop() 312 |         thisv = next(self.vcounter) 313 |         self.window = self.window+int(items[1]) 314 |         self.stack.append('v' + str(thisv)) 315 |         self.cmdlist.append( 316 |             'v' + str(thisv) + ' = '+term1+' - np.roll(' + term1 + ', ' + items[1] + ', axis=0)' 317 |         ) 318 | 319 |     def delay(self, items): 320 |         term1 = self.stack.pop() 321 |         thisv = next(self.vcounter) 322 |         self.window = self.window+int(items[1]) 323 |         self.stack.append('v' + str(thisv)) 324 |         self.cmdlist.append( 325 |             'v' + str(thisv) + ' = np.roll(' + term1 + ', ' + items[1] + ', axis=0)' 326 |         ) 327 | 328 |     def ts_max(self, items): 329 |         v1 = self.stack.pop() 330 |         thisv = next(self.vcounter) 331 |         self.window = self.window + int(items[1]) 332 |         self.stack.append('v' + str(thisv)) 333 |         self.cmdlist.append( 334 |             'v' + str(thisv) + ' = bn.move_max(' + v1 + ', window=' + items[1] + ', min_count=1, axis=0)' 335 |         ) 336 | 337 |     def ts_min(self, items): 338 |         v1 = self.stack.pop() 339 |         thisv = next(self.vcounter) 340 |         self.window = self.window + int(items[1]) 341 |         self.stack.append('v' + str(thisv)) 342 |         self.cmdlist.append( 343 |             'v' + str(thisv) + ' = bn.move_min(' + v1 + ', window=' + items[1] + ', min_count=1, axis=0)' 344 |         ) 345 | 346 |     def ts_argmax(self, items): 347 |         """ 348 |         The behavior of `move_argmax` and associated functions in Numpy 349 |         and Bottleneck is that they index based on the shape of the array. 350 |         In this case the time increases along the 0 axis so, if the window is 351 |         10 days, and the max is in the most recent day, it will return 9; 352 |         if the max is in the earliest day it will return zero. I add "1" to 353 |         this imagining a multiplier, and do not want zero to kill values. 354 |         It is then rescaled to the interval [1/days, 1] to match the `rank` style. 355 |         """ 356 |         v1 = self.stack.pop() 357 |         thisv = next(self.vcounter) 358 |         self.window = self.window + int(items[1]) 359 |         self.stack.append('v' + str(thisv)) 360 |         self.cmdlist.append( 361 |             'v' + str(thisv) + ' = (1.
+ bn.move_argmax(' + v1 + ', window=' + items[1] + ', min_count=1, axis=0))/' + items[1] 362 | ) 363 | 364 | def ts_argmin(self, items): 365 | v1 = self.stack.pop() 366 | thisv = next(self.vcounter) 367 | self.window = self.window + int(items[1]) 368 | self.stack.append('v' + str(thisv)) 369 | self.cmdlist.append( 370 | 'v' + str(thisv) + ' = (1. + bn.move_argmin(' + v1 + ', window=' + items[1] + ', min_count=1, axis=0))/' + items[1] 371 | ) 372 | 373 | def ts_rank(self, items): 374 | # Returns ranks 1-N; largest value is rank N 375 | # `bn.move_rank` returns values in the range -1 to 1.0, so we add 1 376 | # to get 0-2 and then divide by 2.0 to get [0,1] 377 | # note that we want [1/N, 1] 378 | v1 = self.stack.pop() 379 | thisv = next(self.vcounter) 380 | self.window = self.window + int(items[1]) 381 | self.stack.append('v' + str(thisv)) 382 | self.cmdlist.append( 383 | 'v' + str(thisv) + ' = (1. + bn.move_rank(' + v1 + ', window=' + items[1] + ', min_count=1, axis=0))/2.0' 384 | ) 385 | 386 | def stddev(self, items): 387 | # check that the day is what we want 388 | v1 = self.stack.pop() 389 | thisv = next(self.vcounter) 390 | self.window = self.window + int(items[1]) 391 | self.stack.append('v' + str(thisv)) 392 | self.cmdlist.append( 393 | 'v' + str(thisv) + ' = bn.move_std(' + v1 + ', window=' + items[1] + ', min_count=1, axis=0)' 394 | ) 395 | 396 | def sum(self, items): 397 | v1 = self.stack.pop() 398 | thisv = next(self.vcounter) 399 | self.window = self.window + int(items[1]) 400 | self.stack.append('v' + str(thisv)) 401 | self.cmdlist.append( 402 | 'v' + str(thisv) + ' = pd.DataFrame(data='+v1+').rolling(window='+items[1]+', center=False, min_periods=1).sum().values' 403 | ) 404 | 405 | def product(self, items): 406 | v1 = self.stack.pop() 407 | thisv = next(self.vcounter) 408 | self.window = self.window + int(items[1]) 409 | self.stack.append('v' + str(thisv)) 410 | self.cmdlist.append( 411 | 'v' + str(thisv) + ' = pd.DataFrame(data='+v1+').rolling(window='+items[1]+', center=False, min_periods=1).apply(lambda x: np.prod(x)).values' 412 | ) 413 | 414 | def correlation(self, items): 415 | v2 = self.stack.pop() 416 | v1 = self.stack.pop() 417 | thisv = next(self.vcounter) 418 | self.window = self.window + int(items[2]) 419 | self.stack.append('v' + str(thisv)) 420 | self.cmdlist.append( 421 | 'v' + str(thisv) + ' = pd.DataFrame('+v1+').rolling(window='+items[2]+', min_periods='+items[2]+').corr(other=pd.DataFrame('+v2+')).values' 422 | ) 423 | 424 | def covariance(self, items): 425 | v2 = self.stack.pop() 426 | v1 = self.stack.pop() 427 | thisv = next(self.vcounter) 428 | self.window = self.window + int(items[2]) 429 | self.stack.append('v' + str(thisv)) 430 | self.cmdlist.append( 431 | 'v' + str(thisv) + ' = pd.DataFrame('+v1+').rolling(window='+items[2]+', min_periods='+items[2]+').cov(other=pd.DataFrame('+v2+')).values' 432 | ) 433 | 434 | def decay_linear(self, items): 435 | v1 = self.stack.pop() 436 | thisv = next(self.vcounter) 437 | days = int(items[1]) 438 | self.window = self.window + days 439 | v2 = 'v'+str(thisv) 440 | self.cmdlist.append( 441 | v2 + ' = (np.arange(' + items[1] + ')+1.)/np.sum(np.arange(' + items[1]+ ')+1.)' 442 | ) 443 | thisv = next(self.vcounter) 444 | self.stack.append('v' + str(thisv)) 445 | 446 | self.cmdlist.append( 447 | 'v' + str(thisv) + ' = pd.DataFrame(data='+v1+').rolling(window='+items[1]+', center=False, min_periods='+items[1]+').apply(lambda x: (x*'+v2+').sum()).values' 448 | ) 449 | 450 | def indneutralize(self, items): 451 | """ 452 | 
De-means a data matrix, data, DxN, D days in rows x N stocks in 453 |         columns, by group means. 454 | 455 |         The group means come from Pipeline Classifiers: Sector() and 456 |         SubIndustry(). These are integer values per stock; -1 for missing. 457 | 458 |         The Classifier produces a matrix of window_length x N. We need the last 459 |         slice of this, assuming that the data is constant per day. 460 | 461 |         We set up a factor indicator matrix, OHE, like a one-hot-encoded 462 |         matrix. 463 | 464 |         # set up OHE matrix; add 1 so that missing now == 0 465 |         OHE = np.zeros((N, classifier.max()+2)) 466 |         OHE[np.arange(N), classifier[-1] + 1] = 1 467 | 468 |         # The per day (rows) by per industry (columns) mean is 469 |         per_day_per_ind_mean = data.dot(OHE)/OHE.sum(axis=0) 470 | 471 |         # The per day (rows) per *asset* (column) mean then is 472 |         per_day_per_asset_ind_mean = per_day_per_ind_mean.dot(OHE.T) 473 | 474 |         Finally, the de-meaned data matrix is simply calculated as 475 | 476 |         data = data - per_day_per_asset_ind_mean 477 |         """ 478 |         self.imports.add("from alphatools.ics import Sector, SubIndustry") 479 |         self.inputs['sector'] = 'Sector()' 480 |         self.inputs['subindustry'] = 'SubIndustry()' 481 | 482 |         groupmap = { 483 |             'IndClass.subindustry': 'subindustry', 484 |             'IndClass.sector': 'sector', 485 |             'IndClass.industry': 'subindustry', 486 |         } 487 | 488 |         v1 = self.stack.pop() 489 |         if len(items)<2: 490 |             groupby = 'IndClass.subindustry' 491 |         else: 492 |             groupby = str(items[1]) 493 | 494 |         group_label = groupmap[groupby] 495 | 496 |         # set up ICS matrix (like one-hot-encoded matrix); we add 1 to the 497 |         # ics scheme bc -1 is a missing, so increment all by 1 498 |         ohe = 'v' + str(next(self.vcounter)) 499 |         self.cmdlist.append( 500 |             ohe + ' = np.zeros(('+group_label+'.shape[1], '+group_label+'.max()+2))' 501 |         ) 502 |         self.cmdlist.append( 503 |             ohe + '[np.arange('+group_label+'.shape[1]), '+group_label+'[-1] + 1] = 1' 504 |         ) 505 | 506 |         # get industry mean, per industry on columns, per day on rows 507 |         # and the dot(ohe.T) gives per stock industry mean 508 |         ind_mean = 'v' + str(next(self.vcounter)) 509 |         self.cmdlist.append( 510 |             ind_mean + ' = (np.nan_to_num('+v1+'.dot('+ohe+')/'+ohe+'.sum(axis=0))).dot('+ohe+'.T)' 511 |         ) 512 | 513 |         thisv = next(self.vcounter) 514 |         self.stack.append('v' + str(thisv)) 515 |         # subtract the per stock industry mean 516 |         self.cmdlist.append( 517 |             'v' + str(thisv) + ' = '+v1+' - '+ind_mean 518 |         ) 519 | 520 | 521 |     def transform(self, tree): 522 |         self._transform_tree(tree) 523 |         v1 = self.stack.pop() 524 |         self.cmdlist.append( 525 |             'out[:] = ' + v1 + '[-1]' 526 |         ) 527 |         return self 528 |         #return ["window_length = "+str(self.window)] + self.cmdlist 529 | 530 |
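# Illustrative example (comments only): for the expression
# 'close/delay(opens, 1)', MyTransformer above emits roughly
#     v0 = np.roll(opens, 1, axis=0)
#     v1 = close / v0
#     out[:] = v1[-1]
# which ExpressionAlpha below wraps into a CustomFactor subclass.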
531 | class ExpressionAlpha(): 532 | 533 |     def __init__(self, expr_string): 534 |         self.expr_string = expr_string 535 |         self.code = "" 536 |         fname = path.join(path.dirname(__file__), 'expression.lark') 537 |         with open(fname, 'r') as grammar_file: 538 |             self.grammar = grammar_file.read() 539 | 540 |     def make_pipeline_factor(self): 541 |         self.parse() 542 |         self.transform() 543 |         self.generate_pipeline_code() 544 |         exec(self.imports, globals(), globals()) 545 |         exec(self.pipeline_code, globals(), globals()) 546 |         self.pipeline_factor = ExprAlpha_1 547 |         return self 548 | 549 |     def parse(self): 550 |         my_parser = Lark(self.grammar, start='value') 551 |         self.tree = my_parser.parse(self.expr_string) 552 |         return self 553 | 554 |     def transform(self): 555 |         self.transformed = MyTransformer().transform(self.tree) 556 |         return self 557 | 558 |     def generate_pipeline_code(self): 559 |         raw_np_list = \ 560 |             ["window_length = "+str(self.transformed.window)] + \ 561 |             self.transformed.cmdlist 562 |         raw_imports = \ 563 |             self.transformed.imports 564 | 565 |         (data_names, factor_names) = zip(*iteritems(self.transformed.inputs)) 566 | 567 |         self.imports = ['{0}\n'.format(imp) for imp in raw_imports] 568 |         self.imports.append("from zipline.pipeline.factors import CustomFactor\n") 569 |         self.imports.append("import numpy as np\n") 570 |         self.imports.append("import bottleneck as bn\n") 571 |         self.imports.append("import pandas as pd\n") 572 |         self.imports = ["from __future__ import division\n"] + \ 573 |             self.imports 574 | 575 |         self.code = ["class ExprAlpha_1(CustomFactor):"] 576 | 577 |         self.code.append("    inputs = [" + ', '.join(factor_names) + "]") 578 |         self.code.append('    {0}'.format(raw_np_list[0])) 579 |         self.code.append("    def compute(self, today, assets, out, " + ', '.join(data_names) + "):") 580 |         lst = ['        {0}'.format(elem) for elem in raw_np_list] 581 | 582 |         self.code = self.code + lst[1:] 583 | 584 |         self.imports = ''.join(self.imports) 585 | 586 |         self.code_string = '\n'.join(self.code) 587 |         self.pipeline_code = autopep8.fix_code(self.code_string) 588 |         return self 589 | 590 | if __name__ == '__main__': 591 |     e = ExpressionAlpha('close/delay(opens,1)') 592 |     e.make_pipeline_factor() 593 |     print(e.pipeline_code) 594 | -------------------------------------------------------------------------------- /notebooks/pipeline-blaze-minimal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "code", 5 |    "execution_count": 1, 6 |    "metadata": {}, 7 |    "outputs": [], 8 |    "source": [ 9 |     "import blaze as bz\n", 10 |     "import numpy as np\n", 11 |     "import pandas as pd\n", 12 |     "import sqlite3\n", 13 |     "import itertools\n", 14 |     "\n", 15 |     "from zipline.data import bundles\n", 16 |     "from zipline.utils.calendars import get_calendar\n", 17 |     "\n", 18 |     "from zipline.pipeline import Pipeline\n", 19 |     "from zipline.pipeline.data import USEquityPricing\n", 20 |     "from zipline.pipeline.data import DataSet\n", 21 |     "from zipline.pipeline.engine import SimplePipelineEngine\n", 22 |     "from zipline.pipeline.filters import StaticAssets\n", 23 |     "from zipline.pipeline.loaders import USEquityPricingLoader\n", 24 |     "from zipline.pipeline.loaders.blaze import BlazeLoader, from_blaze\n", 25 |     "\n", 26 |     "\n", 27 |     "trading_calendar = get_calendar('NYSE')\n", 28 |     "bundle_data = bundles.load('quandl')\n", 29 |     "\n", 30 |     "# spoof some data\n", 31 |     "\n", 32 |     "np.random.seed(100)\n", 33 |     "\n", 34 |     "start = trading_calendar.closes.index.get_loc('2016-01-04 00:00:00+00:00')\n", 35 |     "end = trading_calendar.closes.index.get_loc('2018-08-06 00:00:00+00:00')\n", 36 |     "\n", 37 |     "#dates = list(trading_calendar.closes.index)[start:end]\n", 38 |     "dates = trading_calendar.closes.index[start:end]\n", 39 |     "sids = bundle_data.asset_finder.sids\n", 40 |     "\n", 41 |     "df = pd.DataFrame(\n", 42 |     "    data={'value': np.random.random(size=len(dates)*len(sids))},\n", 43 |     "    index = pd.MultiIndex.from_tuples(list(itertools.product(dates,sids)), names=('asof_date', 'sid'))\n", 44 |     ")" 45 |    ] 46 |   }, 47 |   { 48 |    "cell_type": "code", 49 |    "execution_count": 2, 50 |    "metadata": {}, 51 |    "outputs": [], 52 |    "source": [ 53 |     "df = df.reset_index()" 54 |    ] 55 |   }, 56 |   { 57 |    "cell_type": "code", 58 |    "execution_count": 3, 59 |    "metadata": {}, 60 |    "outputs": [], 61 |    "source": [ 62 |     "# this is
necessary because sqlite3 doesn't like it if we have the time\n", 63 | "df.asof_date = df.asof_date.dt.date" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/html": [ 74 | "
\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
asof_datesidvalue
02016-01-0400.543405
12016-01-0410.278369
22016-01-0420.424518
32016-01-0430.844776
42016-01-0440.004719
\n", 117 | "
" 118 | ], 119 | "text/plain": [ 120 | " asof_date sid value\n", 121 | "0 2016-01-04 0 0.543405\n", 122 | "1 2016-01-04 1 0.278369\n", 123 | "2 2016-01-04 2 0.424518\n", 124 | "3 2016-01-04 3 0.844776\n", 125 | "4 2016-01-04 4 0.004719" 126 | ] 127 | }, 128 | "execution_count": 4, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "df.head()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# put into sqlite db\n", 144 | "df.to_sql('ds_table', con=sqlite3.connect('temp.db'), if_exists='replace', index=False)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "[('2016-01-04', 0, 0.5434049417909654), ('2016-01-04', 1, 0.27836938509379616), ('2016-01-04', 2, 0.4245175907491331), ('2016-01-04', 3, 0.8447761323199037), ('2016-01-04', 4, 0.004718856190972565), ('2016-01-04', 5, 0.12156912078311422), ('2016-01-04', 6, 0.6707490847267786), ('2016-01-04', 7, 0.8258527551050476), ('2016-01-04', 8, 0.13670658968495297), ('2016-01-04', 9, 0.57509332942725)]\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "con = sqlite3.connect('temp.db')\n", 162 | "cursor = con.cursor()\n", 163 | "cursor.execute(\"SELECT * FROM ds_table LIMIT 10\")\n", 164 | "print(cursor.fetchall())" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "from datashape import dshape" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 8, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "ds_dshape = dshape(\"var*{asof_date: datetime, sid: int64, value: float64}\")" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 9, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "dshape(\"var * {asof_date: datetime, sid: int64, value: float64}\")" 194 | ] 195 | }, 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "ds_dshape" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 10, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# create the blaze expr\n", 212 | "\n", 213 | "expr = bz.Data(\n", 214 | " 'sqlite:///temp.db::ds_table',\n", 215 | " dshape=ds_dshape\n", 216 | ")" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 11, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "dshape(\"var * {asof_date: datetime, sid: int64, value: float64}\")" 228 | ] 229 | }, 230 | "execution_count": 11, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "expr.dshape" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 12, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/html": [ 247 | "
\n", 248 | "\n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | "
asof_datesidvalue
02016-01-0400.543405
12016-01-0410.278369
22016-01-0420.424518
32016-01-0430.844776
42016-01-0440.004719
52016-01-0450.121569
62016-01-0460.670749
72016-01-0470.825853
82016-01-0480.136707
92016-01-0490.575093
102016-01-04100.891322
\n", 326 | "
" 327 | ], 328 | "text/plain": [ 329 | " asof_date sid value\n", 330 | "0 2016-01-04 0 0.543405\n", 331 | "1 2016-01-04 1 0.278369\n", 332 | "2 2016-01-04 2 0.424518\n", 333 | "3 2016-01-04 3 0.844776\n", 334 | "4 2016-01-04 4 0.004719\n", 335 | "5 2016-01-04 5 0.121569\n", 336 | "6 2016-01-04 6 0.670749\n", 337 | "7 2016-01-04 7 0.825853\n", 338 | "8 2016-01-04 8 0.136707\n", 339 | "9 2016-01-04 9 0.575093\n", 340 | "10 2016-01-04 10 0.891322" 341 | ] 342 | }, 343 | "execution_count": 12, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "expr.peek()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 13, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "dshape(\"{asof_date: datetime, sid: int64, value: float64}\")" 361 | ] 362 | }, 363 | "execution_count": 13, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "expr.schema" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 14, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# create and empty BlazeLoader\n", 379 | "my_blaze_loader = BlazeLoader()\n", 380 | "\n", 381 | "# create the DataSet\n", 382 | "ds = from_blaze(\n", 383 | " expr,\n", 384 | " no_deltas_rule='ignore',\n", 385 | " no_checkpoints_rule='ignore',\n", 386 | " loader=my_blaze_loader,\n", 387 | " missing_values={'index':-1}\n", 388 | ")" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 15, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "True" 400 | ] 401 | }, 402 | "execution_count": 15, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "issubclass(ds, DataSet)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 16, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "" 420 | ] 421 | }, 422 | "execution_count": 16, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "my_blaze_loader" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 17, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "pipeline_loader = USEquityPricingLoader(\n", 438 | " bundle_data.equity_daily_bar_reader,\n", 439 | " bundle_data.adjustment_reader,\n", 440 | ")" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 18, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "def choose_loader(column):\n", 450 | " if column in USEquityPricing.columns:\n", 451 | " return pipeline_loader\n", 452 | " else:\n", 453 | " return my_blaze_loader" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 19, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "engine = SimplePipelineEngine(\n", 463 | " get_loader=choose_loader,\n", 464 | " calendar=trading_calendar.all_sessions,\n", 465 | " asset_finder=bundle_data.asset_finder,\n", 466 | ")" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 20, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "assets = bundle_data.asset_finder.lookup_symbols(['A', 'AAL'], as_of_date=None)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 21, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "p = Pipeline(\n", 485 | " columns={\n", 
486 | " 'price': USEquityPricing.close.latest,\n", 487 | " 'col_A': ds.value.latest,\n", 488 | " },\n", 489 | " screen=StaticAssets(assets)\n", 490 | ")" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 22, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "image/svg+xml": [ 501 | "\n", 502 | "\n", 503 | "G\n", 504 | "\n", 505 | "\n", 506 | "cluster_Output\n", 507 | "\n", 508 | "Output\n", 509 | "\n", 510 | "\n", 511 | "cluster_Input\n", 512 | "\n", 513 | "Input\n", 514 | "\n", 515 | "\n", 516 | "\n", 517 | "5004224496\n", 518 | "\n", 519 | "StaticAssets\n", 520 | "\n", 521 | "\n", 522 | "\n", 523 | "5004224048\n", 524 | "\n", 525 | "Latest\n", 526 | "\n", 527 | "\n", 528 | "\n", 529 | "5004223264\n", 530 | "\n", 531 | "Latest\n", 532 | "\n", 533 | "\n", 534 | "\n", 535 | "4623743016\n", 536 | "\n", 537 | "BoundColumn:\n", 538 | "  Dataset: USEquityPricing\n", 539 | "  Column: close\n", 540 | "\n", 541 | "\n", 542 | "\n", 543 | "4623743016->5004224048\n", 544 | "\n", 545 | "\n", 546 | "\n", 547 | "\n", 548 | "\n", 549 | "4626226368\n", 550 | "\n", 551 | "BoundColumn:\n", 552 | "  Dataset: BlazeDataSet_0\n", 553 | "  Column: value\n", 554 | "\n", 555 | "\n", 556 | "\n", 557 | "4626226368->5004223264\n", 558 | "\n", 559 | "\n", 560 | "\n", 561 | "\n", 562 | "" 563 | ], 564 | "text/plain": [ 565 | "" 566 | ] 567 | }, 568 | "execution_count": 22, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "p.show_graph()" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 23, 580 | "metadata": {}, 581 | "outputs": [ 582 | { 583 | "ename": "TypeError", 584 | "evalue": "Cannot change data-type for object array.", 585 | "output_type": "error", 586 | "traceback": [ 587 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 588 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 589 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTimestamp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'2016-01-05'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtz\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'utc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTimestamp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'2018-01-04'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtz\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'utc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m )\n", 590 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/engine.py\u001b[0m in \u001b[0;36mrun_pipeline\u001b[0;34m(self, pipeline, start_date, end_date)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0mdates\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 310\u001b[0m \u001b[0massets\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 311\u001b[0;31m \u001b[0minitial_workspace\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 312\u001b[0m )\n\u001b[1;32m 313\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 591 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/engine.py\u001b[0m in 
\u001b[0;36mcompute_chunk\u001b[0;34m(self, graph, dates, assets, initial_workspace)\u001b[0m\n\u001b[1;32m 522\u001b[0m \u001b[0mloader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mterm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m loaded = loader.load_adjusted_array(\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mto_load\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask_dates\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0massets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m )\n\u001b[1;32m 526\u001b[0m assert set(loaded) == set(to_load), (\n", 592 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/loaders/blaze/core.py\u001b[0m in \u001b[0;36mload_adjusted_array\u001b[0;34m(self, columns, dates, assets, mask)\u001b[0m\n\u001b[1;32m 891\u001b[0m self.pool.imap_unordered(\n\u001b[1;32m 892\u001b[0m \u001b[0mpartial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_load_dataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdates\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0massets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 893\u001b[0;31m \u001b[0mitervalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgetitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_table_expressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 894\u001b[0m ),\n\u001b[1;32m 895\u001b[0m )\n", 593 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/toolz/dicttoolz.py\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(*dicts, **kwargs)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mrv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfactory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdicts\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0mrv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 594 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/loaders/blaze/core.py\u001b[0m in \u001b[0;36m_load_dataset\u001b[0;34m(self, dates, assets, mask, columns)\u001b[0m\n\u001b[1;32m 985\u001b[0m \u001b[0massets\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 986\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 987\u001b[0;31m \u001b[0mall_rows\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 988\u001b[0m )\n\u001b[1;32m 989\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 595 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/loaders/blaze/_core.pyx\u001b[0m in 
\u001b[0;36mzipline.pipeline.loaders.blaze._core.adjusted_arrays_from_rows_with_assets\u001b[0;34m()\u001b[0m\n", 596 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/loaders/blaze/_core.pyx\u001b[0m in \u001b[0;36mzipline.pipeline.loaders.blaze._core.adjusted_arrays_from_rows_with_assets\u001b[0;34m()\u001b[0m\n", 597 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/loaders/blaze/_core.pyx\u001b[0m in \u001b[0;36mzipline.pipeline.loaders.blaze._core.arrays_from_rows_with_assets\u001b[0;34m()\u001b[0m\n", 598 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/zipline/pipeline/loaders/blaze/_core.pyx\u001b[0m in \u001b[0;36mzipline.pipeline.loaders.blaze._core.arrays_from_rows\u001b[0;34m()\u001b[0m\n", 599 | "\u001b[0;32m/anaconda3/envs/testenv/lib/python3.5/site-packages/numpy/core/_internal.py\u001b[0m in \u001b[0;36m_view_is_safe\u001b[0;34m(oldtype, newtype)\u001b[0m\n\u001b[1;32m 365\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhasobject\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0moldtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhasobject\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 367\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot change data-type for object array.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 368\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 600 | "\u001b[0;31mTypeError\u001b[0m: Cannot change data-type for object array." 601 | ] 602 | } 603 | ], 604 | "source": [ 605 | "df = engine.run_pipeline(\n", 606 | " p,\n", 607 | " pd.Timestamp('2016-01-05', tz='utc'),\n", 608 | " pd.Timestamp('2018-01-04', tz='utc')\n", 609 | ")" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "df.head()" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [] 641 | } 642 | ], 643 | "metadata": { 644 | "kernelspec": { 645 | "display_name": "Python 3.5 (env_alphatools)", 646 | "language": "python", 647 | "name": "env_alphatools" 648 | }, 649 | "language_info": { 650 | "codemirror_mode": { 651 | "name": "ipython", 652 | "version": 3 653 | }, 654 | "file_extension": ".py", 655 | "mimetype": "text/x-python", 656 | "name": "python", 657 | "nbconvert_exporter": "python", 658 | "pygments_lexer": "ipython3", 659 | "version": "3.5.5" 660 | } 661 | }, 662 | "nbformat": 4, 663 | "nbformat_minor": 2 664 | } 665 | -------------------------------------------------------------------------------- /notebooks/model.txt: -------------------------------------------------------------------------------- 1 | tree 2 | version=v2 3 | num_class=1 4 | num_tree_per_iteration=1 5 | label_index=0 6 | max_feature_idx=27 7 | objective=binary sigmoid:1 8 | feature_names=feature_0 feature_1 feature_2 feature_3 feature_4 feature_5 feature_6 feature_7 feature_8 feature_9 
feature_10 feature_11 feature_12 feature_13 feature_14 feature_15 feature_16 feature_17 feature_18 feature_19 feature_20 feature_21 feature_22 feature_23 feature_24 feature_25 feature_26 feature_27 9 | feature_infos=[0.27500000000000002:6.6950000000000003] [-2.4169999999999998:2.4300000000000002] [-1.7430000000000001:1.7430000000000001] [0.019:5.7000000000000002] [-1.7430000000000001:1.7430000000000001] [0.159:4.1900000000000004] [-2.9410000000000003:2.9700000000000002] [-1.7409999999999999:1.7409999999999999] [0:2.173] [0.19:5.1929999999999996] [-2.9039999999999999:2.9089999999999998] [-1.742:1.7430000000000001] [0:2.2149999999999999] [0.26400000000000001:6.5229999999999997] [-2.7280000000000002:2.7269999999999999] [-1.742:1.742] [0:2.548] [0.36499999999999999:6.0679999999999996] [-2.4950000000000001:2.496] [-1.74:1.7430000000000001] [0:3.1019999999999999] 1:0:2:3:4:5:6:7 [0.41899999999999998:7.3920000000000003] [0.46100000000000002:3.6819999999999999] [0.38400000000000001:6.5829999999999993] [0.093000000000000013:7.8600000000000003] [0.38900000000000001:4.5430000000000001] [0.48899999999999999:4.3159999999999998] 10 | tree_sizes=2473 2514 2494 2493 2501 2521 2510 2503 2510 2516 11 | 12 | Tree=0 13 | num_leaves=31 14 | num_cat=0 15 | split_feature=25 26 27 25 25 24 22 5 5 27 26 27 25 26 27 24 3 25 13 3 22 19 5 24 4 4 25 24 13 24 16 | split_gain=258.038 165.11 154.742 63.219 70.8123 52.7563 84.2546 36.0199 33.6019 27.9057 27.8139 26.5869 26.0761 22.1221 21.9403 33.7759 17.9554 17.3962 16.275 15.194 14.467 15.3902 14.2355 13.4469 12.7354 13.7208 12.6664 12.7077 12.328 14.4412 17 | threshold=1.2815000000000001 0.78050000000000008 0.90450000000000019 0.98450000000000015 0.6785000000000001 1.0265000000000002 1.0445000000000002 0.82550000000000001 1.0195000000000003 0.9225000000000001 0.86950000000000016 0.78850000000000009 0.62350000000000005 0.88850000000000018 0.88950000000000007 1.1225000000000003 1.9205000000000003 0.98950000000000016 1.3835000000000004 1.8335000000000004 0.94750000000000012 -0.96449999999999991 1.3365000000000002 0.71850000000000014 1.2535000000000001 -0.27149999999999996 0.58650000000000013 0.74550000000000016 0.96150000000000013 0.75150000000000006 18 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 19 | left_child=1 7 3 4 28 6 -4 23 13 22 11 -6 -7 -5 20 -16 17 -14 -19 -17 -2 -22 26 -1 25 -8 27 -9 29 -3 20 | right_child=14 2 5 8 10 12 24 9 -10 -11 -12 -13 16 -15 15 19 -18 18 -20 -21 21 -23 -24 -25 -26 -27 -28 -29 -30 -31 21 | leaf_value=0.10284758317169626 0.081717548115204597 0.16920083703122518 0.061477323984852811 0.079260885438702716 0.18080904101407977 0.099309317207388309 0.12110846920445478 0.12408276028202063 0.17475193328198171 0.017008281489650057 0.1969212081555923 0.1235008337668447 0.16066263555873586 0.13517987794167297 0.05173771107426077 0.10127986202068245 0.10275151646010305 0.11491813432626971 0.17550668759677757 0.045185775908694936 0.063338584813271967 0.15085137998011328 0.17902887439915816 0.074439733286902621 0.085450775035909657 0.16644963581922742 0.14411108005613313 0.055249166772639538 0.16659762686134424 0.11481669961729626 22 | leaf_count=283 121 62 307 129 277 161 135 75 112 22 751 115 457 158 422 324 96 203 57 57 26 90 57 409 44 133 139 42 102 234 23 | internal_value=0 0.217752 0.42333 0.803342 0.987405 -0.0394609 -0.496888 -0.48946 0.0924624 0.0498958 1.24088 0.808221 0.251245 -0.270801 -0.88835 -1.04647 0.397149 0.506114 0.0923076 -0.613956 -0.35262 0.153016 0.203224 -0.750565 0.236446 0.400485 0.00157742 
-0.484247 0.259469 0.0524468 24 | internal_count=5600 4560 3533 1940 1541 1593 619 1027 399 335 1143 392 974 287 1040 803 813 717 260 381 237 116 313 692 312 268 256 117 398 296 25 | shrinkage=1 26 | 27 | 28 | Tree=1 29 | num_leaves=31 30 | num_cat=0 31 | split_feature=25 26 25 25 3 5 22 5 26 22 22 26 22 3 26 24 9 25 3 5 24 26 19 13 9 19 24 5 23 3 32 | split_gain=232.567 149.461 79.7137 118.312 38.3157 33.9073 19.959 19.4367 21.805 18.5965 19.0683 17.7145 20.2141 15.8739 14.2707 13.2396 12.7568 12.7083 12.4665 12.2694 11.595 11.5527 11.8451 11.5454 11.2504 10.1043 9.87802 9.37985 9.34468 10.6218 33 | threshold=1.2815000000000001 0.77650000000000008 0.62350000000000005 0.98950000000000016 1.6175000000000004 0.82550000000000001 1.0445000000000002 0.88150000000000006 1.0375000000000003 1.0025000000000002 0.84950000000000014 0.97650000000000003 0.78450000000000009 0.97650000000000003 0.94950000000000012 0.71850000000000014 0.89250000000000018 2.0390000000000006 1.1435000000000002 1.5755000000000001 1.7450000000000003 1.1205000000000003 0.62900000000000011 0.73250000000000004 0.81850000000000012 1.0425000000000002 1.1005000000000003 0.98050000000000004 0.98050000000000004 0.33650000000000008 34 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 35 | left_child=1 5 19 4 7 15 11 18 -9 10 -5 24 -13 -7 27 25 -15 -8 -4 -3 23 -16 -23 -6 -2 28 -14 -11 -1 -30 36 | right_child=6 2 3 9 20 13 17 8 -10 14 -12 12 26 16 21 -17 -18 -19 -20 -21 -22 22 -24 -25 -26 -27 -28 -29 29 -31 37 | leaf_value=-0.070541453406693988 -0.063694696339596971 -0.018268219940631856 0.040536711276372622 0.0050468732160738579 0.041073763562594706 0.020651352864842279 -0.011240886118261467 0.065302003278970011 0.032851254148682729 -0.048367080735581769 -0.034450629006536941 -0.0053636137701500485 -0.077843794406284653 -0.049414992210901412 0.046233340678247803 -0.048147811369761845 0.016529617086431816 -0.054394845292876338 0.012504728853908589 0.032712218632545885 -0.056054581121086335 -0.014206068541919782 0.048014347247125215 -0.0035552662953019934 0.00058016734174086166 -0.05701248666597733 -0.049041183946400627 0.032225489254395945 0.056549891419987078 -0.012066252716263076 38 | leaf_count=23 39 534 688 216 89 198 258 636 310 53 284 61 300 79 123 399 47 93 208 52 28 102 44 169 91 58 198 20 26 174 39 | internal_value=0 0.207189 0.400353 0.5339 0.786129 -0.474962 -0.842165 0.892352 1.09303 -0.101684 -0.347898 -1.04008 -1.19477 0.0589292 0.258551 -0.728684 -0.497018 -0.453583 0.680513 -0.274894 0.103404 0.471952 0.0907344 0.236521 -0.374138 -0.395616 -1.32798 -0.527041 -0.201945 -0.0629231 40 | internal_count=5600 4560 3556 2970 2128 1004 1040 1842 946 842 500 689 559 324 342 680 126 351 896 586 286 269 146 258 130 281 498 73 223 200 41 | shrinkage=0.05 42 | 43 | 44 | Tree=2 45 | num_leaves=31 46 | num_cat=0 47 | split_feature=27 26 24 22 5 3 5 22 27 26 26 22 22 22 24 26 26 27 5 18 18 27 24 22 1 24 4 26 13 19 48 | split_gain=148.117 184.435 89.3492 94.5529 40.4413 27.3787 27.2311 19.0733 22.2821 18.7714 16.1379 13.5399 13.5162 18.567 13.3548 13.0963 13.0848 12.6295 12.4813 11.7266 11.1861 10.885 10.6951 12.1689 10.6911 10.6851 10.5366 15.4816 10.235 10.0934 49 | threshold=0.90250000000000019 0.81150000000000011 1.0265000000000002 1.0445000000000002 0.82050000000000012 1.9205000000000003 0.89650000000000007 1.0355000000000001 0.83850000000000013 1.3665 1.2195000000000003 1.5530000000000002 1.0465000000000002 0.74450000000000016 0.78250000000000008 0.89850000000000019 0.77250000000000008 1.0885000000000002 
1.3665 -0.80349999999999988 0.037500000000000012 0.73350000000000015 0.73550000000000015 0.8015000000000001 -0.50249999999999984 0.71850000000000014 -1.2844999999999998 0.86050000000000015 0.78050000000000008 0.8035000000000001 50 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 51 | left_child=1 4 3 17 16 12 7 8 26 14 24 -11 13 -4 28 -10 25 22 -6 -13 21 -18 -2 -24 -7 29 27 -3 -5 -1 52 | right_child=2 6 5 9 18 10 -8 -9 15 11 -12 19 -14 -15 -16 -17 20 -19 -20 -21 -22 -23 23 -25 -26 -27 -28 -29 -30 -31 53 | leaf_value=-0.0073029196712544242 -0.016483738266246096 -0.056226662966493993 0.037243245973663699 -0.010522150734305409 0.0026461442993008974 0.042245029951626607 0.046440568748069885 0.04858761207873491 -0.04773094810436717 -0.084102271262559564 -0.065655930649870728 -0.063081051195993595 0.015762579862828231 -0.011458033620908884 -0.0058759188038779945 0.0046124893907736877 0.0083286333310169571 -0.086782249315688301 0.050838468825565433 0.0097202305841271319 0.025447545611885669 -0.058886351658876616 -0.020895762195453083 -0.072301035629168051 -0.033948366204301507 -0.046187366594338752 0.031795753626302992 0.0324895757065886 0.060645280643654702 -0.049344488016781651 54 | leaf_count=189 58 36 90 29 327 30 934 224 68 41 98 34 526 630 203 164 47 207 65 64 74 50 56 261 48 370 483 44 68 82 55 | internal_value=0 0.298664 -0.358922 -0.810713 -0.317348 -0.0338865 0.680672 0.453822 0.308105 -0.107649 -0.772754 -0.71612 0.070829 -0.107589 0.174807 -0.215088 -0.571915 -1.33898 0.212204 -0.311304 -0.0786672 -0.526581 -1.11994 -1.26446 -0.0928599 -0.702852 0.524037 -0.150543 0.787251 -0.400718 56 | internal_count=5600 3157 2443 1021 1204 1422 1953 1019 795 439 176 139 1246 720 300 232 812 582 392 98 171 97 375 317 78 641 563 80 97 271 57 | shrinkage=0.05 58 | 59 | 60 | Tree=3 61 | num_leaves=31 62 | num_cat=0 63 | split_feature=25 26 27 25 25 25 26 24 22 27 5 27 22 27 3 5 25 3 5 22 17 6 5 4 25 13 24 27 24 3 64 | split_gain=205.442 146.989 120.56 48.7787 42.9311 39.2895 31.1871 29.0385 55.8713 27.4702 25.904 24.2132 24.1528 24.5602 22.82 21.5809 17.496 15.6602 14.9168 14.0717 13.142 12.1895 11.6379 11.5599 13.7375 10.9054 12.5602 10.4004 10.2465 10.172 65 | threshold=1.0565000000000002 0.78050000000000008 0.90450000000000019 0.6785000000000001 1.4915 0.62350000000000005 0.86950000000000016 1.0065000000000002 0.99950000000000017 0.78250000000000008 0.82550000000000001 0.75650000000000006 1.0025000000000002 1.0445000000000002 1.6740000000000002 0.91550000000000009 0.94850000000000012 1.8335000000000004 1.2445000000000002 0.73650000000000004 2.1615000000000006 -1.2374999999999998 1.5755000000000001 -0.039499999999999993 0.81350000000000011 0.96150000000000013 0.75150000000000006 0.9225000000000001 1.2035000000000002 0.45550000000000007 66 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 67 | left_child=1 10 3 25 12 22 9 8 -7 16 29 21 13 15 18 -2 -5 20 -9 -17 -6 -12 -4 24 -11 26 -3 -13 -21 -1 68 | right_child=4 2 5 6 17 7 -8 14 -10 23 11 27 -14 -15 -16 19 -18 -19 -20 28 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 69 | leaf_value=-0.0082033028811009587 -0.03599909263529838 0.041746635640685148 -0.039580301392174347 0.058125446833881649 -0.047005132938931764 -0.057212826088312366 0.062685920491323349 0.023242950454383147 0.021379314442893989 0.03458350647962026 -0.0472128223059173 -0.020425259197421095 0.0050919136779366547 -0.064092400066437114 -0.013299083017745733 0.071779011675729365 -0.011395179058987818 -0.096962996070842092 0.062990739000158127 
-0.017861215058779942 0.015256644821202901 0.029991875111881828 0.019987201999956301 0.02216413692018528 -0.061614855385568858 0.039569321922273241 -0.0091237530592508161 -0.097929980712076425 0.037995357051269753 -0.038162993370534026 70 | leaf_count=145 247 62 248 246 599 156 854 358 217 21 23 82 361 142 131 30 43 60 130 123 36 196 38 75 52 102 234 22 45 522 71 | internal_value=0 0.259168 0.483726 0.837842 -0.58416 0.0197737 1.02885 0.209041 -0.232428 0.591867 -0.408289 0.0584268 -0.30659 -0.557134 0.476091 -0.325475 0.954461 -0.961879 0.676464 0.167993 -0.869502 0.43756 -0.633747 -0.1107 -0.679159 0.224196 0.0295668 -0.737286 -0.0574093 -0.633103 72 | internal_count=5600 3957 2967 1689 1643 1278 1291 992 373 437 990 323 948 587 619 445 289 695 488 198 635 219 286 148 73 398 296 104 168 667 73 | shrinkage=0.05 74 | 75 | 76 | Tree=4 77 | num_leaves=31 78 | num_cat=0 79 | split_feature=25 26 27 25 25 25 26 24 22 27 5 27 22 27 3 5 25 3 5 22 24 17 0 25 6 5 11 5 14 26 80 | split_gain=185.695 133.172 109.85 44.7552 38.7872 35.4665 28.7556 26.2881 48.8607 25.0437 23.3658 21.8754 21.7989 22.1947 19.956 19.4566 16.0033 14.3464 12.8726 12.8084 12.7319 16.2341 14.2999 12.7886 11.0086 10.8517 10.7611 10.4892 10.4273 10.1204 81 | threshold=1.0565000000000002 0.78050000000000008 0.90450000000000019 0.6785000000000001 1.4915 0.62350000000000005 0.86950000000000016 1.0185000000000002 0.99950000000000017 0.78250000000000008 0.82550000000000001 0.75650000000000006 1.0025000000000002 1.0445000000000002 1.6740000000000002 0.91550000000000009 0.94850000000000012 1.8335000000000004 1.2445000000000002 0.73650000000000004 1.3545000000000003 1.5825000000000002 1.1075000000000002 2.0390000000000006 -1.2374999999999998 1.2105000000000001 -0.39449999999999991 1.5755000000000001 0.88850000000000018 0.84050000000000014 82 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 83 | left_child=1 10 3 -3 12 27 9 8 -7 16 -1 24 13 15 18 -2 -5 20 -9 -17 22 -22 23 25 -12 -6 -16 -4 -11 -14 84 | right_child=4 2 5 6 17 7 -8 14 -10 28 11 -13 29 -15 26 19 -18 -19 -20 -21 21 -23 -24 -25 -26 -27 -28 -29 -30 -31 85 | leaf_value=-0.03007466406260563 -0.034171014563527391 0.010667027800453588 -0.037567210194892987 0.055687579969788049 -0.041920232142950868 -0.053424147836170721 0.060126088198485833 0.02248874373378508 0.019554118609331331 -0.020358898894108236 -0.044708771572039417 -0.035026924163452405 -0.064419733632072249 -0.06092732837760189 0.023364535810334042 0.068723767475656228 -0.010814481443820212 -0.092627105268016957 0.059791420812017451 -0.0027257930963349077 -0.03310238735004007 0.047694654553158029 -0.077769747350913215 -0.072840744508540084 0.028568707780727565 0.00059459190294235478 -0.035450464047234551 0.019014778632459699 0.041777501926684335 0.0089110851022509169 86 | leaf_count=667 247 398 248 246 190 157 854 354 223 112 23 104 20 142 52 30 43 60 128 168 112 32 128 85 196 88 78 38 36 341 87 | internal_value=0 0.247111 0.461601 0.801135 -0.555152 0.0188508 0.98524 0.199058 -0.214866 0.565015 -0.388064 0.0556022 -0.291313 -0.529343 0.457455 -0.309174 0.913171 -0.914301 0.64701 0.15981 -0.826095 -0.303144 -0.979741 -0.776913 0.41656 -0.569222 -0.23828 -0.601691 -0.105129 0.0967787 88 | internal_count=5600 3957 2967 1689 1643 1278 1291 992 380 437 990 323 948 587 612 445 289 695 482 198 635 144 491 363 219 278 130 286 148 361 89 | shrinkage=0.05 90 | 91 | 92 | Tree=5 93 | num_leaves=31 94 | num_cat=0 95 | split_feature=25 25 26 26 25 22 9 5 5 25 9 0 24 0 22 22 4 10 23 24 14 3 24 25 10 24 26 15 
15 1 96 | split_gain=190.967 123.643 75.9346 34.8826 32.1702 30.5379 28.9557 28.4996 20.4006 23.2065 16.3872 14.5294 12.6395 12.3923 12.0457 15.7034 11.4611 12.1532 11.4754 10.8305 10.4034 10.2247 9.77361 9.75748 12.5245 9.34506 9.1128 8.67953 8.61195 8.55797 97 | threshold=1.0675000000000001 0.66550000000000009 0.77650000000000008 1.0825000000000002 1.5695000000000003 1.0265000000000002 1.1375000000000002 0.85250000000000015 0.88150000000000006 0.96550000000000014 1.6925000000000001 2.2790000000000004 0.97750000000000015 2.4065000000000007 0.81050000000000011 1.0885000000000002 -0.26349999999999996 -0.60149999999999981 1.1125000000000003 0.6705000000000001 1.2275000000000003 1.8335000000000004 2.1485000000000007 0.96950000000000014 -0.39949999999999991 0.74850000000000017 0.98850000000000016 0.24750000000000003 1.0415000000000003 1.1045000000000003 98 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 99 | left_child=1 6 7 8 5 12 14 20 9 29 13 -10 19 23 -1 16 17 -16 -18 -2 -3 22 26 -5 -25 -17 -6 -9 -7 -4 100 | right_child=4 2 3 10 21 28 -8 27 11 -11 -12 -13 -14 -15 15 25 18 -19 -20 -21 -22 -23 -24 24 -26 -27 -28 -29 -30 -31 101 | leaf_value=-0.0030615752742305952 0.0034703107205822795 -0.043631014576037838 0.039189339840581178 0.014774265448395028 -0.0018557887687825536 0.012316974642225694 0.017687564096601518 0.050194629936530677 0.052134220065784366 -0.010866723863100608 0.053746456132888858 -0.0095913829579568795 -0.017332207422173129 -0.066294385968032582 -0.053649680045280951 0.032692040483127113 -0.053259950735346007 -0.0076413436957386881 -9.6079161295919247e-05 -0.050577461676237591 0.01644601971111637 -0.088988727124621914 0.013100356705756284 -0.060969485605520762 0.0057211937142106865 -0.01879091794552833 -0.05166032479513516 -0.0048558551988264016 -0.027640113500363734 0.01009254748359883 102 | leaf_count=272 42 231 602 354 40 284 234 60 798 138 73 41 329 25 78 60 309 221 47 325 33 58 28 44 80 87 459 56 67 125 103 | internal_value=0 0.216732 0.468216 0.608941 -0.599264 -0.388867 -0.286679 -0.360292 0.755607 0.537959 0.181737 0.981772 -0.63183 0.0527939 -0.425642 -0.549283 -0.68199 -0.392767 -0.924946 -0.887641 -0.722441 -0.975834 -0.88845 0.125086 -0.359609 0.0435006 -0.953164 0.471924 0.0936351 0.683439 104 | internal_count=5600 3968 2660 2280 1632 1047 1308 380 1704 865 576 839 696 503 1074 802 655 299 356 367 264 585 527 478 124 147 499 116 351 727 105 | shrinkage=0.05 106 | 107 | 108 | Tree=6 109 | num_leaves=31 110 | num_cat=0 111 | split_feature=27 26 22 24 5 5 27 22 22 27 27 26 0 10 24 27 22 1 6 22 22 24 20 24 27 9 0 10 0 19 112 | split_gain=112.686 117.357 56.5122 59.0278 30.7779 26.4615 19.5527 18.104 17.3051 12.3645 12.2385 17.4938 11.7335 11.6296 11.5104 11.0966 13.7449 11.0212 11.4986 10.8982 11.2905 10.6987 10.5748 14.768 10.3464 10.1697 10.0578 9.70072 9.67154 9.01716 113 | threshold=0.90650000000000008 0.81150000000000011 1.0325000000000002 1.0935000000000004 0.88150000000000006 0.82050000000000012 1.0105000000000002 0.75750000000000017 1.0605000000000002 1.0885000000000002 0.82250000000000012 0.88850000000000018 1.5645000000000002 -1.0664999999999998 1.0475000000000001 0.99050000000000005 0.87450000000000017 1.7720000000000002 1.7735000000000003 0.70650000000000013 0.95150000000000012 0.73550000000000015 1.0000000180025095e-35 1.2565000000000002 0.72650000000000003 1.2125000000000001 1.6975000000000002 0.65350000000000008 1.1775000000000002 0.31150000000000005 114 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 115 | left_child=1 5 3 9 8 12 25 -5 10 21 13 -12 -1 29 -13 16 -9 18 -8 -6 22 -2 23 -21 -7 -4 -23 28 -26 -3 116 | right_child=2 4 6 7 19 24 17 15 -10 -11 11 14 -14 -15 -16 -17 -18 -19 -20 20 -22 26 -24 -25 27 -27 -28 -29 -30 -31 117 | leaf_value=-0.026892568273982145 -0.0018198074835837985 -0.049139701467658732 0.011803174586376282 0.0285833166038028 0.070372584328135215 0.021527319118560748 -0.0084518415853693414 0.040500661923382759 0.044452525144778793 -0.071079050871489899 -0.051427159681273815 -0.014110093022953716 0.01629471827039097 0.023981904242587733 0.031692113600085565 -0.026750228641287285 -0.017655985938328837 -0.063000598166036328 0.04230629182849218 0.006788641613653338 0.048012287280551444 -0.042508680384827385 0.047412398470806555 0.07803322289540672 -0.0051024750963037562 0.057311061946845036 -0.095086012578126558 0.026179293974610798 -0.066506156715099615 0.020603987183684228 118 | leaf_count=749 56 43 184 100 107 240 759 64 163 247 71 129 69 442 99 412 114 35 48 279 416 289 148 34 70 68 42 48 41 34 119 | internal_value=0 0.228708 -0.348017 -0.627004 0.53712 -0.259092 -0.0097917 -0.220541 0.284205 -1.0695 0.164877 -0.159769 -0.464984 0.353124 0.114191 -0.354136 0.0650035 -0.156679 -0.108757 0.791752 0.717427 -0.846001 0.498612 0.288197 0.166245 0.48193 -0.983491 -0.230945 -0.557426 -0.372065 120 | internal_count=5600 3182 2418 1324 1965 1217 1094 690 981 634 818 299 818 519 228 590 178 842 807 984 877 387 461 313 399 252 331 159 111 77 121 | shrinkage=0.05 122 | 123 | 124 | Tree=7 125 | num_leaves=31 126 | num_cat=0 127 | split_feature=25 25 26 27 25 9 5 26 25 24 0 5 24 27 25 3 1 13 10 5 3 13 27 3 4 4 5 3 24 14 128 | split_gain=169.33 108.548 65.9297 60.5817 27.6196 25.1516 24.2582 23.9892 26.9228 18.3185 17.1149 18.0553 15.8765 14.0891 13.8709 13.3075 12.1236 11.7248 11.4836 11.2864 10.9738 10.7819 10.7279 10.3987 11.2564 10.3869 9.80083 9.64208 9.577 9.40305 129 | threshold=1.0675000000000001 0.66550000000000009 0.77650000000000008 0.90650000000000008 1.5585000000000002 1.1375000000000002 0.85250000000000015 0.86350000000000016 0.89550000000000007 0.97750000000000015 1.6975000000000002 1.2635000000000003 0.94450000000000012 0.93850000000000011 0.96950000000000014 1.9805000000000004 0.5495000000000001 1.3135000000000003 -0.47449999999999992 0.88450000000000017 1.7525000000000002 0.9405 0.79950000000000021 0.34850000000000003 -0.56549999999999989 -0.13099999999999998 1.3365000000000002 1.8335000000000004 0.6755000000000001 1.2275000000000003 130 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 131 | left_child=1 5 6 7 9 20 29 8 22 13 11 14 -12 19 -5 -13 -14 18 -11 28 21 23 -4 24 -1 -24 -25 -6 -2 -3 132 | right_child=4 2 3 10 27 -7 -8 -9 -10 17 12 15 16 -15 -16 -17 -18 -19 -20 -21 -22 -23 25 26 -26 -27 -28 -29 -30 -31 133 | leaf_value=0.058279249497736801 0.023186751297265787 -0.040371287309234005 0.050883792970153677 0.015113831009609113 -0.041218658389127381 0.016607192668503433 0.021994432249266838 0.052529550218962932 -0.013156167059396942 0.0097867198262740086 -0.073979166268815427 0.053138057825276211 0.022809967003351726 -0.05424363551507503 -0.019995989396257314 -0.016616039035163285 -0.057112668477426055 0.019149606947993163 -0.027483268760758541 0.0024848841740828008 -0.05007105193334449 -0.00021978833304006642 -0.040202580922408963 -0.030471645743298467 -0.021400565852111707 0.043639884042823346 0.030380864132665965 -0.084790534118077465 -0.043273434223432126 0.016763322247873042 134 | leaf_count=31 25 231 234 459 545 234 116 901 
139 128 43 166 67 181 151 33 27 147 237 139 108 291 29 574 42 31 28 58 172 33 135 | internal_value=0 0.205756 0.442814 0.574823 -0.56431 -0.265809 -0.330094 0.858235 0.466921 -0.364634 0.187518 0.299026 -0.470626 -0.63153 0.127762 0.827668 -0.00490865 -0.0958748 -0.288294 -0.388983 -0.395298 -0.327493 0.818035 -0.466451 0.248553 0.06036 -0.553058 -0.906268 -0.697205 -0.664315 136 | internal_count=5600 3968 2660 2280 1632 1308 380 1334 433 1029 946 809 137 517 610 199 94 512 365 336 1074 966 294 675 73 60 602 603 197 264 137 | shrinkage=0.05 138 | 139 | 140 | Tree=8 141 | num_leaves=31 142 | num_cat=0 143 | split_feature=25 25 25 26 27 5 9 5 0 27 22 22 22 27 25 26 25 27 9 5 14 22 13 3 15 22 27 25 26 8 144 | split_gain=153.747 74.6785 62.9554 53.3031 48.4209 26.7661 22.7111 20.2448 20.1534 14.0381 14.3734 13.9208 16.103 16.0004 13.4998 12.3294 15.3413 13.8927 12.0543 11.43 10.8403 10.8267 11.9293 10.6424 10.4214 10.3729 10.0297 9.99064 9.8811 10.7027 145 | threshold=1.2450000000000003 0.66550000000000009 0.96550000000000014 0.77650000000000008 0.90250000000000019 0.87650000000000006 1.1375000000000002 0.80850000000000011 1.6975000000000002 0.91150000000000009 1.0215000000000003 0.71850000000000014 1.0445000000000002 1.0705000000000002 1.9405000000000003 0.87150000000000005 0.7955000000000001 0.8015000000000001 1.0415000000000003 1.8805000000000003 0.7360000000000001 0.69950000000000012 0.9405 0.34850000000000003 -1.4779999999999998 1.1035000000000001 0.89550000000000007 0.78850000000000009 0.85650000000000015 1.0000000180025095e-35 146 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 147 | left_child=1 6 3 7 15 18 21 20 19 28 -11 -2 13 24 -14 17 -17 -5 -4 -6 -3 -1 23 -23 -13 -25 -24 -19 29 -7 148 | right_child=11 2 5 4 8 9 -8 -9 -10 10 -12 12 14 -15 -16 16 -18 27 -20 -21 -22 22 26 25 -26 -27 -28 -29 -30 -31 149 | leaf_value=0.012288530788624094 0.024068018420805623 -0.042495991684968995 -0.031699583604026603 0.04227609406127486 0.016060699872508057 0.050742157044992658 0.015797939411350791 0.023336084066936356 -0.026825587643730393 -0.024191766445828797 0.018702016278240637 0.027778958345555649 -0.0072158239821684851 -0.065730680401898708 -0.047522718741986707 0.030669379657376147 0.064641592298539191 0.034839393857520072 0.0065463020192192637 0.069661647436787738 0.0080894106407500364 0.0089267893574030473 0.0079606632241629085 -0.038247232417906674 -0.037833702821507414 0.002275745388539007 -0.031349276129300843 -0.034790931573335911 0.042827659107599371 -0.040262816106403178 150 | leaf_count=102 40 160 351 322 576 21 234 114 111 160 156 26 265 255 123 199 499 38 109 44 58 66 196 540 418 72 98 47 165 35 151 | internal_value=0 0.149398 0.319459 0.515955 0.654136 -0.0979627 -0.25266 -0.223205 0.254663 0.209094 -0.0612112 -0.683145 -0.726092 -0.909178 -0.398452 0.929816 1.09835 0.648613 -0.452792 0.396546 -0.580203 -0.375685 -0.440957 -0.587179 -0.680015 -0.669632 -0.103189 -0.0750968 0.603433 -0.125406 152 | internal_count=5600 4473 3165 2168 1836 997 1308 332 731 537 316 1127 1087 699 388 1105 698 407 460 620 218 1074 972 678 444 612 294 85 221 56 153 | shrinkage=0.05 154 | 155 | 156 | Tree=9 157 | num_leaves=31 158 | num_cat=0 159 | split_feature=25 25 26 27 25 22 9 5 25 9 27 5 5 8 14 5 25 27 5 22 22 23 4 25 13 26 27 5 25 3 160 | split_gain=139.484 89.6879 56.1802 58.8554 23.4395 24.7608 20.5153 19.0123 16.3977 15.3931 12.4962 14.7831 12.3695 12.163 11.2701 11.1618 10.7229 17.1675 11.4517 10.2112 12.5602 10.0368 11.3407 9.88219 9.4946 9.44594 13.0539 16.2791 
9.17487 9.09626 161 | threshold=1.0675000000000001 0.66550000000000009 0.8055000000000001 0.90450000000000019 1.5585000000000002 1.0265000000000002 1.1375000000000002 0.85250000000000015 0.94550000000000012 1.2975000000000001 1.0025000000000002 0.98050000000000004 0.88150000000000006 1.6300000000000001 0.7360000000000001 0.63850000000000018 0.90950000000000009 1.0585000000000002 0.59450000000000014 0.81050000000000011 1.0885000000000002 1.1355000000000002 0.013500000000000002 0.98950000000000016 0.90850000000000009 0.89850000000000019 0.8015000000000001 0.44950000000000007 0.86450000000000016 1.8335000000000004 162 | decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 163 | left_child=1 6 7 12 5 10 19 8 14 13 11 -2 23 16 -3 -15 18 -18 -5 -1 21 22 -21 25 -23 26 27 -4 -20 -6 164 | right_child=4 2 3 9 29 -7 -8 -9 -10 -11 -12 -13 -14 15 -16 -17 17 -19 28 20 -22 24 -24 -25 -26 -27 -28 -29 -30 -31 165 | leaf_value=-0.0010723790862061031 -0.032065579686550126 -0.025817848130788558 -0.041706760072809143 0.05101366335179694 -0.037546866648071817 0.0057917137227283559 0.015029066991679822 0.019216551694225318 -0.073573436435715484 0.032167953525359685 -0.048250477157948071 0.0042246291995435663 0.053620629139660175 -0.036128219855601378 0.019153854461955454 0.023439751205089562 0.007818585977361691 -0.063999798430316848 -0.017197927927741483 -0.020886475178484278 0.0029597269639691642 -0.021174769179526431 -0.049082723913883491 -0.0037188796176172616 0.052422302980316859 0.052421343073495733 -0.0095669862153275036 0.047955591301407002 0.032057230856236921 -0.08044440319907023 166 | leaf_count=272 305 210 24 41 545 338 234 156 55 223 206 180 595 36 77 280 60 77 182 307 147 51 270 67 27 248 81 199 49 58 167 | internal_value=0 0.18794 0.405503 0.550009 -0.513113 -0.329303 -0.240248 -0.203493 -0.467674 0.177547 -0.547702 -0.372446 0.856269 0.035607 -0.274277 0.332745 -0.19379 -0.652625 0.0382857 -0.357194 -0.471231 -0.590177 -0.681546 0.65415 0.0853431 0.746856 0.505542 0.765334 -0.136097 -0.82998 168 | internal_count=5600 3968 2660 2162 1632 1029 1308 498 342 948 691 485 1214 725 287 316 409 137 272 1074 802 655 577 619 78 552 304 223 231 603 169 | shrinkage=0.05 170 | 171 | 172 | end of trees 173 | 174 | feature importances: 175 | feature_25=45 176 | feature_5=36 177 | feature_27=35 178 | feature_22=35 179 | feature_26=31 180 | feature_24=28 181 | feature_3=19 182 | feature_9=10 183 | feature_13=9 184 | feature_0=8 185 | feature_4=8 186 | feature_14=5 187 | feature_10=5 188 | feature_19=5 189 | feature_1=4 190 | feature_15=3 191 | feature_6=3 192 | feature_23=3 193 | feature_8=2 194 | feature_18=2 195 | feature_17=2 196 | feature_11=1 197 | feature_20=1 198 | 199 | parameters: 200 | [boosting: gbdt] 201 | [objective: binary] 202 | [metric: binary_logloss] 203 | [tree_learner: serial] 204 | [device_type: cpu] 205 | [data: ] 206 | [valid: ] 207 | [num_iterations: 100] 208 | [learning_rate: 0.05] 209 | [num_leaves: 31] 210 | [num_threads: 0] 211 | [max_depth: -1] 212 | [min_data_in_leaf: 20] 213 | [min_sum_hessian_in_leaf: 0.001] 214 | [bagging_fraction: 0.8] 215 | [bagging_freq: 5] 216 | [bagging_seed: 3] 217 | [feature_fraction: 0.9] 218 | [feature_fraction_seed: 2] 219 | [early_stopping_round: 0] 220 | [max_delta_step: 0] 221 | [lambda_l1: 0] 222 | [lambda_l2: 0] 223 | [min_gain_to_split: 0] 224 | [drop_rate: 0.1] 225 | [max_drop: 50] 226 | [skip_drop: 0.5] 227 | [xgboost_dart_mode: 0] 228 | [uniform_drop: 0] 229 | [drop_seed: 4] 230 | [top_rate: 0.2] 231 | [other_rate: 0.1] 232 
| [min_data_per_group: 100] 233 | [max_cat_threshold: 32] 234 | [cat_l2: 10] 235 | [cat_smooth: 10] 236 | [max_cat_to_onehot: 4] 237 | [top_k: 20] 238 | [monotone_constraints: ] 239 | [feature_contri: ] 240 | [forcedsplits_filename: ] 241 | [refit_decay_rate: 0.9] 242 | [verbosity: 0] 243 | [max_bin: 255] 244 | [min_data_in_bin: 3] 245 | [bin_construct_sample_cnt: 200000] 246 | [histogram_pool_size: -1] 247 | [data_random_seed: 1] 248 | [output_model: LightGBM_model.txt] 249 | [snapshot_freq: -1] 250 | [input_model: ] 251 | [output_result: LightGBM_predict_result.txt] 252 | [initscore_filename: ] 253 | [valid_data_initscores: ] 254 | [pre_partition: 0] 255 | [enable_bundle: 1] 256 | [max_conflict_rate: 0] 257 | [is_enable_sparse: 1] 258 | [sparse_threshold: 0.8] 259 | [use_missing: 1] 260 | [zero_as_missing: 0] 261 | [two_round: 0] 262 | [save_binary: 0] 263 | [enable_load_from_binary_file: 1] 264 | [header: 0] 265 | [label_column: ] 266 | [weight_column: ] 267 | [group_column: ] 268 | [ignore_column: ] 269 | [categorical_feature: ] 270 | [predict_raw_score: 0] 271 | [predict_leaf_index: 0] 272 | [predict_contrib: 0] 273 | [num_iteration_predict: -1] 274 | [pred_early_stop: 0] 275 | [pred_early_stop_freq: 10] 276 | [pred_early_stop_margin: 10] 277 | [convert_model_language: ] 278 | [convert_model: gbdt_prediction.cpp] 279 | [num_class: 1] 280 | [is_unbalance: 0] 281 | [scale_pos_weight: 1] 282 | [sigmoid: 1] 283 | [boost_from_average: 1] 284 | [reg_sqrt: 0] 285 | [alpha: 0.9] 286 | [fair_c: 1] 287 | [poisson_max_delta_step: 0.7] 288 | [tweedie_variance_power: 1.5] 289 | [max_position: 20] 290 | [label_gain: ] 291 | [metric_freq: 1] 292 | [is_provide_training_metric: 0] 293 | [eval_at: ] 294 | [num_machines: 1] 295 | [local_listen_port: 12400] 296 | [time_out: 120] 297 | [machine_list_filename: ] 298 | [machines: ] 299 | [gpu_platform_id: -1] 300 | [gpu_device_id: -1] 301 | [gpu_use_dp: 0] 302 | 303 | end of parameters 304 | 305 | pandas_categorical:null 306 | --------------------------------------------------------------------------------
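The model.txt file above is a LightGBM text dump: each Tree=N block lists parallel arrays (split_feature, threshold, left_child, right_child, leaf_value) that encode one decision tree, the leaf scores (shrinkage already applied) are summed across all ten trees, and, because the header records objective=binary sigmoid:1, that sum is passed through a sigmoid to produce a probability. A minimal sketch of reloading and querying such a dump follows; it is not part of the repository, it assumes the lightgbm Python package is installed, and the names booster and X are illustrative only.

# A minimal sketch, not part of the repository: reload the LightGBM
# text dump above and query it. Assumes the `lightgbm` package is
# installed; `booster` and `X` are illustrative names.
import numpy as np
import lightgbm as lgb

# Booster(model_file=...) parses the plain-text format shown above,
# from the "tree" header through "end of parameters".
booster = lgb.Booster(model_file='notebooks/model.txt')

print(booster.num_trees())    # 10 trees appear above (Tree=0 .. Tree=9)
print(booster.num_feature())  # 28, matching max_feature_idx=27

# importance_type='split' counts how often each feature is split on,
# which is what the "feature importances" block at the end reports
# (e.g. feature_25=45).
for name, count in zip(booster.feature_name(),
                       booster.feature_importance(importance_type='split')):
    if count > 0:
        print(name, count)

# With a binary objective, predict() returns sigmoid probabilities.
X = np.random.rand(5, booster.num_feature())
print(booster.predict(X))

Note that the dump contains only ten trees even though the parameters block records num_iterations: 100, so it appears to have been written from a short training run; predictions from it will differ from those of a fully trained model.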