├── .gitignore ├── LICENSE ├── README.md ├── classifytrades.py ├── setup.py └── tradeclassification_c.pyx /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.c 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 
93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 jktis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trade-Classification-Algorithms 2 | 3 | Module to classify financial markets transactions data into 4 | buyer- and seller-initiated trades. 5 | 6 | Available methods are the Lee-Ready algorithm (Lee and Ready, 1991), 7 | the Bulk-Volume classification algorithm (Easley et al., 2012), the 8 | algorithm of Ellis et al. (2000), the algorithm of Chakrabarty et al. 9 | (2007) and the Full-Information algorithm of Jurkatis (2020). 10 | 11 | Also allows the estimation of order imbalances and transaction costs. 12 | 13 | # Dependencies 14 | - numpy 15 | - pandas 16 | - cython 17 | - scipy 18 | - warnings 19 | - statsmodels 20 | 21 | # Installation 22 | To compile the `tradeclassification_c.pyx` file into the required C file 23 | ``` 24 | $ python setup.py build_ext -i 25 | ``` 26 | 27 | # Usage 28 | ## Trade Classification 29 | ```python 30 | from classifytrades import TradeClassification 31 | 32 | tc = TradeClassification(df,Ask=Ask,Bid=Bid) 33 | tc.classify(method='lee_ready', freq=0, reduce_precision=True) 34 | 35 | print(tc.df_tr.head()) 36 | ``` 37 | Other method arguments are `'clnv'`, `'emo'`, `'bvc'`, `'ds_1'`, `'ds_2'`, `'ds_3'`. 38 | 39 | - `df` : pandas.DataFrame with transaction data. 40 | Assumed to be deduplicated, ie. only one record per trade between two counterparties (only relevant for FI and BVC). 41 | The dataframe must contain at least a `time` column containing the transaction times measured in seconds (i.e. timestamps of precision higher than seconds are expressed as floats) and a `price` column containing the transaction prices. For the FI algorithm the dataframe must also contain a `vol` column with the number of shares exchanged in the transaction. 42 | - `Ask` : pandas.DataFrame (optional; default None). 
43 | For the FI, LR, EMO and CLNV algorithms order book data are required, as well as for computing transaction costs. The dataframe must contain a `time` column indicating the time of the quote change expressed in seconds and a `price` column with the best ask. For the FI algorithm the dataframe must also contain the volume available at the best ask. 44 | - `Bid` : analogous to `Ask`. 45 | 46 | 47 | ### The FI algorithm 48 | The FI algorithm comes in three different versions, depending on the data structure (see Jurkatis, 2020). 49 | 50 | #### Data Structure 1 51 | For data where each transaction at the ask or bid must have a corresponding reduction in the volume available at the respective quote and where trades and quotes can be assumed to be recorded in the same order in which they were executed use `method = 'ds_1'`. 52 | 53 | #### Data Structure 2 54 | For data where, contrary to DS1, quote changes that are due to the same trade are aggregated, use `method='ds_2'`. Aggregated quote changes mean that, for example, a buy order for 100 shares that is executed against two 55 | standing sell limit-orders for 50 shares each will be reflected in a single change at the ask of a total change in volume of 100 shares, instead of two separate changes of 50 shares. 56 | 57 | #### Data Structure 3 58 | If in addition to DS2 one cannot assume that trades and quotes are in the correct order, use `method='ds_3'`. 59 | 60 | ## Order Imbalances 61 | The module also allows to compute the order imbalance, defined as the buyer-initiated volume minus seller-initiated volume over total volume over a given measurement interval. 62 | 63 | ```python 64 | oi = tc.get_orderimbalance(10,bin_type='vol') 65 | ``` 66 | splits the data into 10 equal volume intervals (individual trades are not broken up between intervals so differences in total volume between the intervals may remain) and computes the order imbalance for each. 
67 | 68 | To control the length of the intervals rather than the number use 69 | ```python 70 | Vb = tc.buyvolume(window=10,window_type='time') 71 | ``` 72 | The call returns the buyer-initiated volume and total volume for the trading data split into intervals of 10 seconds. 73 | 74 | ## Transaction Costs 75 | The classification result can also be used to compute the execution costs of each group of consecutive buyer- and seller-initiated trades. 76 | 77 | ```python 78 | execost = tc.impl_sf() 79 | ``` 80 | which can subsequently be used in a price impact regression. 81 | 82 | ```python 83 | propcost = tc.estimate_execost(execost) 84 | ``` 85 | 86 | # References 87 | 88 | Chakrabarty, B., Li, B., Nguyen, V., Van Ness, R.A., 2007. [Trade classification algorithms for electronic communications network trades](https://doi.org/10.1016/j.jbankfin.2007.03.003). Journal of Banking & 89 | Finance 31, 3806–3821. 90 | 91 | Easley, D., de Prado, M.M.L., O’Hara, M., 2012. [Flow toxicity and liquidity in a high-frequency world](https://doi.org/10.1093/rfs/hhs053). Review of Financial Studies 25, 1457–1493. 92 | 93 | Ellis, K., Michaely, R., O’Hara, M., 2000. [The accuracy of trade classification rules: Evidence from Nasdaq](https://doi.org/10.2307/2676254). Journal of Financial and Quantitative Analysis 35, 529–551. 94 | 95 | Jurkatis, S., 2020. [Inferring trade directions in fast markets](https://doi.org/10.1016/j.finmar.2021.100635). Journal of Financial Markets, Vol 58, 100635 96 | 97 | Lee, C., Ready, M.J., 1991. [Inferring trade direction from intraday data](https://doi.org/10.1111/j.1540-6261.1991.tb02683.x). The Journal 98 | of Finance 46, 733–746. 
-------------------------------------------------------------------------------- /classifytrades.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tradeclassification_c import get_ind, sign_trades_ds1, sign_trades_ds2, sign_trades_ds3, tick_rule, vol_bin, concat_runs 4 | from scipy import stats 5 | import warnings 6 | import statsmodels.api as sm 7 | 8 | 9 | def get_lastquote(quotes,as_of): 10 | 11 | ind = np.searchsorted(quotes.time.values,as_of, side='left')-1 12 | last_quote = pd.Series(np.zeros(len(as_of)), index=as_of) 13 | 14 | mask = ind>=0 15 | last_quote.loc[mask] = quotes.loc[ind[mask],'price'].values 16 | last_quote.loc[~mask] = np.nan 17 | return last_quote 18 | 19 | 20 | def get_midpoint(Ask,Bid,as_of): 21 | 22 | ask = get_lastquote(Ask,as_of) 23 | bid = get_lastquote(Bid,as_of) 24 | 25 | midpoint = (ask + bid)/2 26 | midpoint.loc[askself.df_tr.midpoint, 'Initiator'] = 1 499 | self.df_tr.loc[self.df_tr.price0.5,'buy_vol'] = buy_frac.loc[buy_frac.f_b>0.5,'vol'] 574 | buy_frac.loc[buy_frac.f_b==0.5,'buy_vol'] = -1 575 | else: 576 | buy_frac = buy_frac.join( self.df_tr[['group','vol']].groupby('group').sum() ) 577 | buy_frac['buy_vol'] = buy_frac.f_b*buy_frac.vol 578 | 579 | return buy_frac 580 | 581 | 582 | def buyvolume(self,window=1,window_type='time',start=None,drop_firsttrade=True): 583 | """ 584 | Returns buyer-initiated volume and total volume over intervals from the 585 | individually classified trades. The result can be compared to the output of 586 | the BVC algorithm. 587 | 588 | Parameters 589 | ---------- 590 | window : float or int (default 1) 591 | Length of the intervals over which to compute the fraction of 592 | buyer-inititated volumes. 593 | window_type : str (default 'time') 594 | Type of the interval. Must be either 'time', 'vol' or 'per_trade'. 
595 | Specifies which unit the `window` refers to: seconds if 'time' and 596 | trading volume if 'vol'. If 'per_trade', `window` is ignored and the 597 | buyer-initiated volume is computed for each individual trade. 598 | start: float or int (default None) 599 | Starting point of the first interval. Only relevant for `window_type` 'time'. 600 | If None, starting point for the interval construction is the first mentioned 601 | timestamp at the given timestamp precision. 602 | drop_firsttrades : bool (default True) 603 | If True, first trade is not considered in constructing the interval. This 604 | choice makes the result comparable to the result from the BVC algorithm 605 | which uses the first price as the reference starting point to compute the 606 | between-interval price changes. 607 | 608 | Returns 609 | ------- 610 | pandas.DataFrame index by the interval number, containing buyer-initiated volume 611 | and total volume. 612 | 613 | """ 614 | 615 | 616 | if not 'Initiator' in self.df_tr.columns: 617 | raise KeyError("Data do not contain trade initiator label; classify trades first") 618 | 619 | i = 1 if drop_firsttrade else 0 620 | group = self.create_window(self.df_tr.iloc[i:],window=window,window_type=window_type,start=start) 621 | 622 | self.df_tr['group'] = -1 623 | self.df_tr.iloc[i:,self.df_tr.columns.get_indexer(['group'])] = group 624 | 625 | vol = self.df_tr[['group','vol']].groupby('group').sum() 626 | buyfrac = self.df_tr.loc[self.df_tr.Initiator==1,['group','vol']].groupby('group').sum().rename(columns={'vol': 'buy_vol'}) 627 | buyfrac = buyfrac.join(vol,how='outer').fillna(0) 628 | 629 | if drop_firsttrade: 630 | buyfrac = buyfrac.iloc[i:] 631 | 632 | return buyfrac 633 | 634 | 635 | def create_window(self,df_tr,window=1,window_type='time',start=None): 636 | 637 | if not window_type in ['time','vol']: 638 | raise ValueError("window type to create intervals must be either 'time' or 'vol'.") 639 | 640 | group = df_tr[window_type].values.astype(int) 
641 | if window_type == 'time': 642 | if start is not None: 643 | group = group - start 644 | elif self.freq is None: 645 | group = group - group[0] 646 | else: 647 | group = group - np.floor(group[0]*10**self.freq)/10**self.freq 648 | 649 | group = group // window 650 | else: 651 | group = vol_bin(group, window) 652 | 653 | return group 654 | 655 | 656 | def emo(self,interpolate=False,**kwargs): 657 | """ 658 | Classify trades using the algorithm of Ellis et al. (2000). 659 | 660 | Parameters 661 | ---------- 662 | interpolate : bool (default False) 663 | If True, interpolate timestamp precision prior 664 | to applying the algorithm according to 665 | Holden and Jacobsen (2014) 666 | 667 | Returns 668 | ------- 669 | None. Result is appended to the provided dataframe. 670 | 671 | References 672 | ---------- 673 | Holden, C.W., Jacobsen, S., 2014. Liquidity measurement problems in fast, com- 674 | petitive markets: expensive and cheap solutions. The Journal of Finance 69, 675 | 1747–1785. 676 | 677 | Ellis, K., Michaely, R., O’Hara, M., 2000. The accuracy of trade classification rules: 678 | Evidence from Nasdaq. Journal of Financial and Quantitative Analysis 35, 529–551. 
679 | 680 | """ 681 | 682 | # cleanup 683 | self.df_tr.drop(columns=['ask','bid'], errors='ignore',inplace=True) 684 | 685 | if interpolate: 686 | self.tcol_interpolation() 687 | timecol = 'time_inter' 688 | else: 689 | timecol = 'time' 690 | 691 | lastask = get_lastquote(self.Ask[[timecol,'price']].rename(columns={'time_inter': 'time'}), 692 | self.df_tr[timecol].unique() 693 | ) 694 | 695 | lastbid = get_lastquote(self.Bid[[timecol,'price']].rename(columns={'time_inter': 'time'}), 696 | self.df_tr[timecol].unique() 697 | ) 698 | 699 | self.df_tr = self.df_tr.merge(lastask.to_frame(name='ask').join(lastbid.to_frame(name='bid'), how='outer'), 700 | left_on=timecol, right_index=True, how='left') 701 | 702 | mask = self.df_tr.ask<=self.df_tr.bid 703 | self.df_tr.loc[mask,'ask'] = np.nan 704 | self.df_tr.loc[mask,'bid'] = np.nan 705 | 706 | self.df_tr['Initiator'] = 0 707 | self.df_tr['Step'] = 0 708 | 709 | self.df_tr.loc[self.df_tr.price==self.df_tr.ask,'Initiator'] = 1 710 | self.df_tr.loc[self.df_tr.price==self.df_tr.bid,'Initiator'] = -1 711 | 712 | self.df_tr.loc[(self.df_tr.Initiator==1) | (self.df_tr.Initiator==-1), 'Step'] = 1 713 | 714 | # tick rule 715 | self.apply_tick() 716 | self.df_tr.loc[self.df_tr.Step==0,'Step'] = 2 717 | 718 | return 719 | 720 | 721 | def clnv(self,interpolate=False,**kwargs): 722 | """ 723 | Classify trades using the algorithm of Chakrabarty et al. (2007). 724 | 725 | Parameters 726 | ---------- 727 | interpolate : bool (default False) 728 | If True, interpolate timestamp precision prior 729 | to applying the algorithm according to 730 | Holden and Jacobsen (2014) 731 | 732 | Returns 733 | ------- 734 | None. Result is appended to the provided dataframe. 735 | 736 | References 737 | ---------- 738 | Chakrabarty, B., Li, B., Nguyen, V., Van Ness, R.A., 2007. Trade classification 739 | algorithms for electronic communications network trades. Journal of Banking & 740 | Finance 31, 3806–3821. 
741 | 742 | Holden, C.W., Jacobsen, S., 2014. Liquidity measurement problems in fast, com- 743 | petitive markets: expensive and cheap solutions. The Journal of Finance 69, 744 | 1747–1785. 745 | 746 | """ 747 | 748 | # cleanup 749 | self.df_tr.drop(columns=['ask','bid'], errors='ignore',inplace=True) 750 | 751 | if interpolate: 752 | self.tcol_interpolation() 753 | timecol = 'time_inter' 754 | else: 755 | timecol = 'time' 756 | 757 | lastask = get_lastquote(self.Ask[[timecol,'price']].rename(columns={'time_inter': 'time'}), 758 | self.df_tr[timecol].unique() 759 | ) 760 | 761 | lastbid = get_lastquote(self.Bid[[timecol,'price']].rename(columns={'time_inter': 'time'}), 762 | self.df_tr[timecol].unique() 763 | ) 764 | 765 | self.df_tr = self.df_tr.merge(lastask.to_frame(name='ask').join(lastbid.to_frame(name='bid'), how='outer'), 766 | left_on=timecol, right_index=True, how='left') 767 | 768 | mask = self.df_tr.ask<=self.df_tr.bid 769 | self.df_tr.loc[mask,'ask'] = np.nan 770 | self.df_tr.loc[mask,'bid'] = np.nan 771 | 772 | self.df_tr['Initiator'] = 0 773 | self.df_tr['Step'] = 0 774 | 775 | self.df_tr.loc[(self.df_tr.price>0.7*self.df_tr.ask + 0.3*self.df_tr.bid) & (self.df_tr.price<=self.df_tr.ask),'Initiator'] = 1 776 | self.df_tr.loc[(self.df_tr.price<0.3*self.df_tr.ask + 0.7*self.df_tr.bid) & (self.df_tr.price>=self.df_tr.bid),'Initiator'] = -1 777 | 778 | self.df_tr.loc[(self.df_tr.Initiator==1) | (self.df_tr.Initiator==-1), 'Step'] = 1 779 | 780 | # tick rule 781 | self.apply_tick() 782 | self.df_tr.loc[self.df_tr.Step==0,'Step'] = 2 783 | 784 | return 785 | 786 | 787 | def apply_tick(self): 788 | """Classify trades using the tick-test. 
Used in conjunction with one of 789 | the other algorithms, but can be used standalone if a `Step` column 790 | containing only zeros is given in the transaction dataframe.""" 791 | 792 | # tick rule 793 | mask = self.df_tr.Step==0 794 | 795 | trrest = self.df_tr.loc[mask,['price']].reset_index(drop=False).values.astype(int) 796 | index_p, prices = trrest[:,0], trrest[:,1] 797 | 798 | s = tick_rule(self.df_tr.price.values.astype(int), prices, index_p) 799 | 800 | self.df_tr.loc[mask,'Initiator'] = s 801 | 802 | return 803 | 804 | def true_initiator(self): 805 | self.df_tr['Initiator'] = self.df_tr.direction*-1 806 | return 807 | 808 | 809 | def evaluate_bulkclass(self,buyvol,target): 810 | """ 811 | Evaluate classification result when estimated as a fraction of 812 | trading volume over time or volume intervals. Criterium follows 813 | Chakrabarty et al. (2015): 814 | 815 | ..math:: \sum_{i} \min(V_i^B,\hat{V}_i^B) + \min(V_i^S,\hat{V}_i^S) / \sum_i V_i. 816 | 817 | Parameters: 818 | ----------- 819 | buyvol : pandas.DataFrame 820 | Estimated buyer initiated volume. Indexed by estimation intervals. 821 | Must contain `buy_vol` and `vol` columns to containing the buyer- 822 | initiated volume and total trading volume for each interval. 823 | target : pandas.DataFrame 824 | Same as `buyvol` but with the true buyer-initiated volume 825 | 826 | Returns: 827 | -------- 828 | float 829 | 830 | References: 831 | ----------- 832 | Chakrabarty, B., Pascual, R., Shkilko, A., 2015. Evaluating trade classification 833 | algorithms: Bulk volume classification versus the tick rule and the Lee-Ready algo- 834 | rithm. Journal of Financial Markets 25, 52–79. 
835 | 836 | """ 837 | 838 | vb = np.minimum(buyvol.buy_vol, target.buy_vol).sum() 839 | vs = np.minimum(buyvol.vol-buyvol.buy_vol, target.vol-target.buy_vol).sum() 840 | 841 | s = vb + vs 842 | vol = target.vol.sum() 843 | return s/vol 844 | 845 | 846 | def tcol_interpolation(self): 847 | """Interpolate original timestamp in all dataframes. Result saved in new column 'time_inter'.""" 848 | 849 | self.df_tr['time_inter'] = interpolate_time(self.df_tr.time.values,self.freq,hj_version=True) 850 | self.Ask['time_inter'] = interpolate_time(self.Ask.time.values,self.freq,hj_version=True) 851 | self.Bid['time_inter'] = interpolate_time(self.Bid.time.values,self.freq,hj_version=True) 852 | 853 | return 854 | 855 | 856 | def into_bins(self,n,bin_type='vol'): 857 | """Split data into n equally sized bins, either by time or by volume.""" 858 | 859 | x = self.df_tr[bin_type].values 860 | if bin_type=='vol': 861 | x = np.cumsum(x) 862 | 863 | bins = np.linspace(np.min(x),np.max(x), n+1)[1:] 864 | group = np.searchsorted(bins,x) 865 | 866 | self.df_tr['group'] = group 867 | return 868 | 869 | 870 | def get_orderimbalance(self,n,bin_type='vol'): 871 | """ 872 | Returns the order imbalance computed from individually 873 | classified trades over `n` data intervals. (To specify 874 | the length of the intverals rather than the number, use 875 | the `buyvolume` method.) 876 | 877 | Parameters 878 | ---------- 879 | n : int 880 | Number of intervals to split the data into. 881 | bin_type : str (default : 'vol') 882 | If 'vol', data are split into `n` volume bins. If 883 | 'time', data are split into `n` time bins. 
884 | 885 | Returns 886 | ------- 887 | pandas.DataFrame 888 | 889 | """ 890 | 891 | self.into_bins(n,bin_type=bin_type) 892 | 893 | V = self.df_tr[['vol','group']].groupby('group').sum() 894 | Vb = self.df_tr.loc[self.df_tr.Initiator==1,['vol','group']].groupby('group').sum() 895 | Vs = self.df_tr.loc[self.df_tr.Initiator==-1,['vol','group']].groupby('group').sum() 896 | 897 | oi = (Vb.subtract(Vs,fill_value=0)).divide(V) 898 | return oi.rename(columns={'vol': 'oi'}) 899 | 900 | 901 | def impl_sf(self,iloc=True,tcol='time'): 902 | """ 903 | Returns the execution costs for each group of consecutive buyer- or 904 | seller-initiated trades. 905 | 906 | ..math:: e_i = o_i \sum_{t=1}^{\tau_i} (p_{i,t} - m_i)v_{t,i} 907 | 908 | where `o_i` is the trade direction of the i-th group of consectutive buyer or seller- 909 | initiated trades (1 for a buy, -1 for a sell order), `{p_it , v_it}` are the transaction 910 | prices (in log) and volumes of all trades belonging to the i-th group, and `m_i` is 911 | the mid-quote (also in log) at the time of the order. 912 | 913 | Parameters 914 | ---------- 915 | tcol : str (default 'time') 916 | Determines which column to use to determine the 917 | corresponding mid-quote for each group. 918 | 919 | Returns 920 | ------- 921 | pandas.DataFrame with group label, execution time of the first transaction 922 | of the group, total trading volume of the group and the execution cost. 
923 | 924 | """ 925 | 926 | net = self.df_tr[[tcol,'Initiator','vol','price']].rename(columns={'vol': 'net_vol'}) 927 | 928 | runs, start, end = get_runs(net.Initiator.values) 929 | 930 | net['group'] = runs 931 | if iloc: 932 | as_of = net.iloc[start][tcol].values 933 | else: 934 | as_of = net.loc[start,tcol].values 935 | 936 | midpoint = get_midpoint(self.Ask.rename(columns={tcol: 'time'}),self.Bid.rename(columns={tcol: 'time'}),as_of) 937 | midpoint['group'] = net.group.unique() 938 | midpoint.set_index('group',inplace=True) 939 | 940 | net = net.merge(midpoint, left_on='group', right_index=True, how='left') 941 | 942 | net['impl_shortfall'] = (np.log(net.price) - np.log(net.midpoint))*net['net_vol']*net.Initiator 943 | 944 | sf = net[['group','net_vol','impl_shortfall']].groupby('group').sum() 945 | sf['time'] = as_of 946 | 947 | return sf 948 | 949 | 950 | def estimate_execost(self,sf,params_only=True,quadratic=False): 951 | """ 952 | Returns the result from a price impact regression. 953 | 954 | ..math:: e_i = \beta_0 + \beta_1 v_i + \eps_i 955 | 956 | Parameters 957 | ---------- 958 | sf : pandas.DataFrame 959 | Contains a column 'impl_shortfall' with the transaction 960 | costs of the i-th order and a column 'net_vol' with 961 | the total volume of the i-th order. 962 | params_only : bool (defaul True) 963 | If True, returns numpy.array with the parameter estimates. 964 | Otherwise, statsmodel regression result is returned. 965 | quadratic : bool (default False) 966 | If True, use v_i^2 as additional regressor. 967 | 968 | Returns 969 | ------- 970 | numpy.array of parameter estimates or statsmodels regression 971 | result object. 
972 | 973 | """ 974 | mask = pd.notnull(sf.impl_shortfall) 975 | data = sm.add_constant(sf.loc[mask,['net_vol']].values) 976 | if quadratic: 977 | data = np.hstack([data,sf.loc[mask,['net_vol']].values**2]) 978 | 979 | model = sm.OLS(sf.loc[mask,'impl_shortfall'].values,data) 980 | 981 | res = model.fit() #cov_type='HC3' 982 | return res.params if params_only else res -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Feb 01 18:40:32 2017 4 | 5 | @author: sjurkatis 6 | """ 7 | 8 | try: 9 | from setuptools import setup 10 | from setuptools import Extension 11 | except ImportError: 12 | from distutils.core import setup 13 | from distutils.extension import Extension 14 | 15 | from Cython.Distutils import build_ext 16 | import numpy 17 | 18 | #libraries = ['msvcr90.dll'] 19 | ext_modules = [Extension('tradeclassification_c', 20 | ['tradeclassification_c.pyx'], 21 | include_dirs = [numpy.get_include()])] 22 | 23 | setup(cmdclass = {'build_ext': build_ext}, ext_modules = ext_modules) 24 | 25 | #from Cython.Build import cythonize 26 | #setup(ext_modules = cythonize('c_get_inds.pyx' , include_dirs = [numpy.get_include()])) 27 | 28 | -------------------------------------------------------------------------------- /tradeclassification_c.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jul 30 13:16:59 2017 4 | 5 | @author: sjurkatis 6 | """ 7 | 8 | 9 | #from libc.stdlib cimport malloc, free 10 | import numpy as np 11 | cimport numpy as np 12 | cimport cython 13 | #from cpython cimport array 14 | #import array 15 | 16 | DTYPE = np.int 17 | ctypedef np.int_t DTYPE_t 18 | 19 | DTYPEf = np.float64 20 | ctypedef np.float64_t DTYPEf_t 21 | 22 | @cython.boundscheck(False) 23 | @cython.wraparound(False) 24 | def 
get_ind(double [:] arr, double [:] time): 25 | cdef: 26 | int n = time.shape[0] 27 | int s = arr.shape[0] 28 | int begin, end, found, k 29 | unsigned int ind = 0 30 | size_t j, i 31 | np.ndarray[DTYPE_t, ndim=1] left = np.zeros(n, dtype=DTYPE) 32 | np.ndarray[DTYPE_t, ndim=1] right = np.zeros(n, dtype=DTYPE) 33 | double [:] tmp 34 | double v 35 | 36 | for j in xrange(n): 37 | begin = s 38 | end = 0 39 | found = 0 40 | tmp = arr[ind:] 41 | k = tmp.shape[0] 42 | for i in xrange(k): 43 | v = tmp[i] - time[j] 44 | if not found and v>=0: 45 | found = 1 46 | begin = ind + i 47 | 48 | if found: 49 | if v > 0: 50 | ind += i 51 | end = ind 52 | break 53 | elif i==k-1: 54 | end = s 55 | break 56 | 57 | left[j] = begin 58 | right[j] = end 59 | 60 | return left, right 61 | 62 | 63 | @cython.boundscheck(False) 64 | @cython.wraparound(False) 65 | def sign_trades_ds1(long[:] P, long[:] V, long[:] Al, long[:] Ar, long[:] Bl, long[:] Br, long[:] runlength, long[:] askp, long[:] bidp, long[:] avdiff, long[:] bvdiff, double[:] atime, double[:] btime, double bar): 66 | 67 | cdef: 68 | int n = Al.shape[0] 69 | int anum = askp.shape[0] 70 | int bnum = bidp.shape[0] 71 | int trnum = P.shape[0] 72 | np.ndarray[DTYPE_t, ndim=1] s = np.zeros(trnum, dtype=DTYPE) 73 | np.ndarray[DTYPE_t, ndim=1] c = np.zeros(trnum, dtype=DTYPE) 74 | #np.ndarray[DTYPE_t, ndim=1] ask = np.zeros(trnum, dtype=DTYPE) 75 | #np.ndarray[DTYPE_t, ndim=1] bid = np.zeros(trnum, dtype=DTYPE) 76 | size_t j,i 77 | int al, ar, bl, br, last_ask, last_bid, k, d_a, d_b, tmpind, p, v 78 | np.ndarray[DTYPE_t, ndim=1] discard_a = np.zeros(anum, dtype=DTYPE) 79 | np.ndarray[DTYPE_t, ndim=1] discard_b = np.zeros(bnum, dtype=DTYPE) 80 | int ind = 0 81 | bint av_match, bv_match 82 | double upper, lower 83 | 84 | for j in range(n): 85 | 86 | al = Al[j] 87 | ar = Ar[j] 88 | bl = Bl[j] 89 | br = Br[j] 90 | 91 | last_ask = askp[al] 92 | last_bid = bidp[bl] 93 | for i in range(runlength[j]): 94 | tmpind = ind+i 95 | p = P[tmpind] 96 | v = 
V[tmpind] 97 | 98 | av_match = False 99 | bv_match = False 100 | 101 | #if ar>0: 102 | # ask[tmpind] = last_ask 103 | #else: 104 | # ask[tmpind] = -1 105 | 106 | #if br>0: 107 | # bid[tmpind] = last_bid 108 | #else: 109 | # bid[tmpind] = -1 110 | 111 | for k in range(ar-al): 112 | if discard_a[al+k]==0 and askp[al+k]==p and avdiff[al+k]==v: 113 | av_match = True 114 | d_a = k 115 | break 116 | 117 | for k in range(br-bl): 118 | if discard_b[bl+k]==0 and bidp[bl+k]==p and bvdiff[bl+k]==v: 119 | bv_match = True 120 | d_b = k 121 | break 122 | 123 | if av_match and not bv_match: 124 | s[tmpind] = 1 125 | c[tmpind] = 1 126 | #ask[tmpind] = p 127 | 128 | last_ask = askp[al+d_a] 129 | elif bv_match and not av_match: 130 | s[tmpind] = -1 131 | c[tmpind] = 1 132 | #bid[tmpind] = p 133 | 134 | last_bid = bidp[bl+d_b] 135 | elif av_match and bv_match: 136 | 137 | if atime[al+d_a] < btime[bl+d_b]: 138 | s[tmpind] = 1 139 | c[tmpind] = 2 140 | #ask[tmpind] = p 141 | 142 | discard_a[al+d_a] = 1 143 | last_ask = askp[al+d_a] 144 | 145 | elif atime[al+d_a] > btime[bl+d_b]: 146 | s[tmpind] = -1 147 | c[tmpind] = 2 148 | #bid[tmpind] = p 149 | 150 | 151 | discard_b[bl+d_b] = 1 152 | last_bid = bidp[bl+d_b] 153 | 154 | else: 155 | 156 | # not av_match and not bv_match: 157 | # most likely hidden order 158 | if last_ask > last_bid: 159 | upper = last_ask*(1-bar) + last_bid*bar 160 | lower = last_ask*bar + last_bid*(1-bar) 161 | #x = (p-last_bid)/(last_ask-last_bid) 162 | 163 | if p>upper: #x>1-bar: 164 | s[tmpind] = 1 165 | c[tmpind] = 3 166 | elif p=trV: 214 | av_match = True 215 | d_a = k 216 | break 217 | 218 | for k in range(br-bl): 219 | if bidp[bl+k]==p and bvdiff[bl+k]>=trV: 220 | bv_match = True 221 | d_b = k 222 | break 223 | 224 | if av_match and not bv_match: 225 | s[tmpind] = 1 226 | c[tmpind] = 1 227 | #last_ask = p 228 | 229 | elif bv_match and not av_match: 230 | s[tmpind] = -1 231 | c[tmpind] = 1 232 | #last_bid = p 233 | 234 | elif av_match and bv_match: 235 | 236 | 
if atime[al+d_a] < btime[bl+d_b]: 237 | 238 | s[tmpind] = 1 239 | c[tmpind] = 2 240 | 241 | avdiff[al+d_a] -= trV 242 | #last_ask = p 243 | 244 | elif atime[al+d_a] > btime[bl+d_b]: 245 | s[tmpind] = -1 246 | c[tmpind] = 2 247 | 248 | bvdiff[bl+d_b] -= trV 249 | #last_bid = p 250 | 251 | else: 252 | # there are two possibilites for a visible order to not find a 253 | # match: 254 | # (1): a market order trades against the best quote and the next 255 | # best quote, due to the size of the market order. The order 256 | # book, however, records only best quote and volume before 257 | # the market order and the new best quote with corresponding 258 | # volume after the completion of the full market order. That is, 259 | # we have a price match, but not the corresponding volume 260 | # change match. 261 | 262 | for k in range(ar-al): 263 | if al+k+1==anum: 264 | break 265 | elif askp[al+k]

p and bidp[bl+k+1]==p: 274 | bv_match = True 275 | d_b = k+1 276 | break 277 | 278 | if av_match and not bv_match: 279 | s[tmpind] = 1 280 | c[tmpind] = 5 281 | #last_ask = p 282 | 283 | 284 | elif bv_match and not av_match: 285 | s[tmpind] = -1 286 | c[tmpind] = 5 287 | #last_bid = p 288 | 289 | elif av_match and bv_match: 290 | if atime[al+d_a] < btime[bl+d_b]: 291 | s[tmpind] = 1 292 | c[tmpind] = 6 293 | #last_ask = p 294 | 295 | elif atime[al+d_a] > btime[bl+d_b]: 296 | s[tmpind] = -1 297 | c[tmpind] = 6 298 | #last_bid = p 299 | 300 | 301 | else: 302 | # if there is no match with a price, there is a second possibility 303 | # (2): The market order goes through the levels 1 to n>2 of 304 | # the order book. The prices between level 1 and n are 305 | # not displayed in the order book. Then all transactions 306 | # taking place between level 1 and n have no price match 307 | for k in range(ar-al): 308 | if al+k+1==anum: 309 | break 310 | elif askp[al+k]

p: 311 | av_match = True 312 | d_a = k+1 313 | break 314 | 315 | for k in range(br-bl): 316 | if bl+k+1==bnum: 317 | break 318 | elif bidp[bl+k]>p and bidp[bl+k+1] btime[bl+d_b]: 342 | s[tmpind] = -1 343 | c[tmpind] = 8 344 | #last_bid = p 345 | 346 | 347 | else: # still no match; must be a hidden order 348 | # not av_match and not bv_match: 349 | # most likely hidden order 350 | if last_ask > last_bid: 351 | upper = last_ask*(1-bar) + last_bid*bar 352 | lower = last_ask*bar + last_bid*(1-bar) 353 | #x = (p-last_bid)/(last_ask-last_bid) 354 | 355 | if p>upper: #x>1-bar: 356 | s[tmpind] = 1 357 | c[tmpind] = 3 358 | elif p=trV: 406 | av_match = True 407 | d_a = k 408 | break 409 | 410 | for k in range(br-bl): 411 | if bidp[bl+k]==p and bidv[bl+k]>=trV: 412 | bv_match = True 413 | d_b = k 414 | break 415 | 416 | if av_match and not bv_match: 417 | s[tmpind] = 1 418 | c[tmpind] = 1 419 | 420 | elif bv_match and not av_match: 421 | s[tmpind] = -1 422 | c[tmpind] = 1 423 | 424 | elif av_match and bv_match: 425 | 426 | if atime[al+d_a] < btime[bl+d_b]: 427 | s[tmpind] = 1 428 | c[tmpind] = 2 429 | 430 | askv[al+d_a] -=trV 431 | elif atime[al+d_a] > btime[bl+d_b]: 432 | s[tmpind] = -1 433 | c[tmpind] = 2 434 | 435 | bidv[bl+d_b] -=trV 436 | 437 | else: # still no match; must be a hidden order 438 | # not av_match and not bv_match: 439 | # most likely hidden order 440 | if last_ask > last_bid: 441 | upper = last_ask*(1-bar) + last_bid*bar 442 | lower = last_ask*bar + last_bid*(1-bar) 443 | #x = (p-last_bid)/(last_ask-last_bid) 444 | 445 | if p>upper: #x>1-bar: 446 | s[tmpind] = 1 447 | c[tmpind] = 3 448 | elif pcount: 477 | count += 1 478 | lp = lstp[sz-count] 479 | if p>lp: 480 | s[j] = 1 481 | break 482 | elif p=w: 503 | g+=1 504 | vol = 0 505 | 506 | return group 507 | 508 | @cython.boundscheck(False) 509 | @cython.wraparound(False) 510 | @cython.cdivision(True) 511 | def concat_runs(long[:] x, bint hj_version=False): 512 | 513 | cdef: 514 | int n = x.shape[0] 515 | 
size_t i, j 516 | double k 517 | size_t count = 0 518 | np.ndarray[DTYPEf_t, ndim=1] interp = np.zeros(sum(x), dtype=DTYPEf) 519 | 520 | if hj_version: 521 | for i in range(n): 522 | k = x[i]+1. 523 | for j in range(1,x[i]+1): 524 | interp[count] = (2*j-1)/(2*k) 525 | count += 1 526 | else: 527 | for i in range(n): 528 | k = x[i]+1. 529 | for j in range(1,x[i]+1): 530 | interp[count] = j/k 531 | count += 1 532 | 533 | return interp --------------------------------------------------------------------------------