├── .gitignore ├── LICENSE ├── README.md ├── classifytrades.py ├── setup.py └── tradeclassification_c.pyx /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.c 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 
93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 jktis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trade-Classification-Algorithms 2 | 3 | Module to classify financial markets transactions data into 4 | buyer- and seller-initiated trades. 5 | 6 | Available methods are the Lee-Ready algorithm (Lee and Ready, 1991), 7 | the Bulk-Volume classification algorithm (Easley et al., 2012), the 8 | algorithm of Ellis et al. (2000), the algorithm of Chakrabarty et al. 9 | (2007) and the Full-Information algorithm of Jurkatis (2020). 10 | 11 | Also allows the estimation of order imbalances and transaction costs. 12 | 13 | # Dependencies 14 | - numpy 15 | - pandas 16 | - cython 17 | - scipy 18 | - warnings 19 | - statsmodels 20 | 21 | # Installation 22 | To compile the `tradeclassification_c.pyx` file into the required C file 23 | ``` 24 | $ python setup.py build_ext -i 25 | ``` 26 | 27 | # Usage 28 | ## Trade Classification 29 | ```python 30 | from classifytrades import TradeClassification 31 | 32 | tc = TradeClassification(df,Ask=Ask,Bid=Bid) 33 | tc.classify(method='lee_ready', freq=0, reduce_precision=True) 34 | 35 | print(tc.df_tr.head()) 36 | ``` 37 | Other method arguments are `'clnv'`, `'emo'`, `'bvc'`, `'ds_1'`, `'ds_2'`, `'ds_3'`. 38 | 39 | - `df` : pandas.DataFrame with transaction data. 40 | Assumed to be deduplicated, ie. only one record per trade between two counterparties (only relevant for FI and BVC). 41 | The dataframe must contain at least a `time` column containing the transaction times measured in seconds (i.e. timestamps of precision higher than seconds are expressed as floats) and a `price` column containing the transaction prices. For the FI algorithm the dataframe must also contain a `vol` column with the number of shares exchanged in the transaction. 42 | - `Ask` : pandas.DataFrame (optional; default None). 
43 | For the FI, LR, EMO and CLNV algorithms order book data are required, as well as for computing transaction costs. The dataframe must contain a `time` column indicating the time of the quote change expressed in seconds and a `price` column with the best ask. For the FI algorithm the dataframe must also contain the volume available at the best ask. 44 | - `Bid` : analogous to `Ask`. 45 | 46 | 47 | ### The FI algorithm 48 | The FI algorithm comes in three different versions, depending on the data structure (see Jurkatis, 2020). 49 | 50 | #### Data Structure 1 51 | For data where each transaction at the ask or bid must have a corresponding reduction in the volume available at the respective quote and where trades and quotes can be assumed to be recorded in the same order in which they were executed use `method = 'ds_1'`. 52 | 53 | #### Data Structure 2 54 | For data where, contrary to DS1, quote changes that are due to the same trade are aggregated, use `method='ds_2'`. Aggregated quote changes mean that, for example, a buy order for 100 shares that is executed against two 55 | standing sell limit-orders for 50 shares each will be reflected in a single change at the ask of a total change in volume of 100 shares, instead of two separate changes of 50 shares. 56 | 57 | #### Data Structure 3 58 | If in addition to DS2 one cannot assume that trades and quotes are in the correct order, use `method='ds_3'`. 59 | 60 | ## Order Imbalances 61 | The module also allows to compute the order imbalance, defined as the buyer-initiated volume minus seller-initiated volume over total volume over a given measurement interval. 62 | 63 | ```python 64 | oi = tc.get_orderimbalance(10,bin_type='vol') 65 | ``` 66 | splits the data into 10 equal volume intervals (individual trades are not broken up between intervals so differences in total volume between the intervals may remain) and computes the order imbalance for each. 
67 | 68 | To control the length of the intervals rather than the number use 69 | ```python 70 | Vb = tc.buyvolume(window=10,window_type='time') 71 | ``` 72 | The call returns the buyer-initiated volume and total volume for the trading data split into intervals of 10 seconds. 73 | 74 | ## Transaction Costs 75 | The classification result can also be used to compute the execution costs of each group of consecutive buyer- and seller-initiated trades. 76 | 77 | ```python 78 | execost = tc.impl_sf() 79 | ``` 80 | which can subsequently be used in a price impact regression. 81 | 82 | ```python 83 | propcost = tc.estimate_execost(execost) 84 | ``` 85 | 86 | # References 87 | 88 | Chakrabarty, B., Li, B., Nguyen, V., Van Ness, R.A., 2007. [Trade classification algorithms for electronic communications network trades](https://doi.org/10.1016/j.jbankfin.2007.03.003). Journal of Banking & 89 | Finance 31, 3806–3821. 90 | 91 | Easley, D., de Prado, M.M.L., O’Hara, M., 2012. [Flow toxicity and liquidity in a high-frequency world](https://doi.org/10.1093/rfs/hhs053). Review of Financial Studies 25, 1457–1493. 92 | 93 | Ellis, K., Michaely, R., O’Hara, M., 2000. [The accuracy of trade classification rules: Evidence from Nasdaq](https://doi.org/10.2307/2676254). Journal of Financial and Quantitative Analysis 35, 529–551. 94 | 95 | Jurkatis, S., 2020. [Inferring trade directions in fast markets](https://doi.org/10.1016/j.finmar.2021.100635). Journal of Financial Markets, Vol 58, 100635 96 | 97 | Lee, C., Ready, M.J., 1991. [Inferring trade direction from intraday data](https://doi.org/10.1111/j.1540-6261.1991.tb02683.x). The Journal 98 | of Finance 46, 733–746. 
-------------------------------------------------------------------------------- /classifytrades.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tradeclassification_c import get_ind, sign_trades_ds1, sign_trades_ds2, sign_trades_ds3, tick_rule, vol_bin, concat_runs 4 | from scipy import stats 5 | import warnings 6 | import statsmodels.api as sm 7 | 8 | 9 | def get_lastquote(quotes,as_of): 10 | 11 | ind = np.searchsorted(quotes.time.values,as_of, side='left')-1 12 | last_quote = pd.Series(np.zeros(len(as_of)), index=as_of) 13 | 14 | mask = ind>=0 15 | last_quote.loc[mask] = quotes.loc[ind[mask],'price'].values 16 | last_quote.loc[~mask] = np.nan 17 | return last_quote 18 | 19 | 20 | def get_midpoint(Ask,Bid,as_of): 21 | 22 | ask = get_lastquote(Ask,as_of) 23 | bid = get_lastquote(Bid,as_of) 24 | 25 | midpoint = (ask + bid)/2 26 | midpoint.loc[askself.df_tr.midpoint, 'Initiator'] = 1 499 | self.df_tr.loc[self.df_tr.price0.5,'buy_vol'] = buy_frac.loc[buy_frac.f_b>0.5,'vol'] 574 | buy_frac.loc[buy_frac.f_b==0.5,'buy_vol'] = -1 575 | else: 576 | buy_frac = buy_frac.join( self.df_tr[['group','vol']].groupby('group').sum() ) 577 | buy_frac['buy_vol'] = buy_frac.f_b*buy_frac.vol 578 | 579 | return buy_frac 580 | 581 | 582 | def buyvolume(self,window=1,window_type='time',start=None,drop_firsttrade=True): 583 | """ 584 | Returns buyer-initiated volume and total volume over intervals from the 585 | individually classified trades. The result can be compared to the output of 586 | the BVC algorithm. 587 | 588 | Parameters 589 | ---------- 590 | window : float or int (default 1) 591 | Length of the intervals over which to compute the fraction of 592 | buyer-inititated volumes. 593 | window_type : str (default 'time') 594 | Type of the interval. Must be either 'time', 'vol' or 'per_trade'. 
595 | Specifies which unit the `window` refers to: seconds if 'time' and 596 | trading volume if 'vol'. If 'per_trade', `window` is ignored and the 597 | buyer-initiated volume is computed for each individual trade. 598 | start: float or int (default None) 599 | Starting point of the first interval. Only relevant for `window_type` 'time'. 600 | If None, starting point for the interval construction is the first mentioned 601 | timestamp at the given timestamp precision. 602 | drop_firsttrades : bool (default True) 603 | If True, first trade is not considered in constructing the interval. This 604 | choice makes the result comparable to the result from the BVC algorithm 605 | which uses the first price as the reference starting point to compute the 606 | between-interval price changes. 607 | 608 | Returns 609 | ------- 610 | pandas.DataFrame index by the interval number, containing buyer-initiated volume 611 | and total volume. 612 | 613 | """ 614 | 615 | 616 | if not 'Initiator' in self.df_tr.columns: 617 | raise KeyError("Data do not contain trade initiator label; classify trades first") 618 | 619 | i = 1 if drop_firsttrade else 0 620 | group = self.create_window(self.df_tr.iloc[i:],window=window,window_type=window_type,start=start) 621 | 622 | self.df_tr['group'] = -1 623 | self.df_tr.iloc[i:,self.df_tr.columns.get_indexer(['group'])] = group 624 | 625 | vol = self.df_tr[['group','vol']].groupby('group').sum() 626 | buyfrac = self.df_tr.loc[self.df_tr.Initiator==1,['group','vol']].groupby('group').sum().rename(columns={'vol': 'buy_vol'}) 627 | buyfrac = buyfrac.join(vol,how='outer').fillna(0) 628 | 629 | if drop_firsttrade: 630 | buyfrac = buyfrac.iloc[i:] 631 | 632 | return buyfrac 633 | 634 | 635 | def create_window(self,df_tr,window=1,window_type='time',start=None): 636 | 637 | if not window_type in ['time','vol']: 638 | raise ValueError("window type to create intervals must be either 'time' or 'vol'.") 639 | 640 | group = df_tr[window_type].values.astype(int) 
641 | if window_type == 'time': 642 | if start is not None: 643 | group = group - start 644 | elif self.freq is None: 645 | group = group - group[0] 646 | else: 647 | group = group - np.floor(group[0]*10**self.freq)/10**self.freq 648 | 649 | group = group // window 650 | else: 651 | group = vol_bin(group, window) 652 | 653 | return group 654 | 655 | 656 | def emo(self,interpolate=False,**kwargs): 657 | """ 658 | Classify trades using the algorithm of Ellis et al. (2000). 659 | 660 | Parameters 661 | ---------- 662 | interpolate : bool (default False) 663 | If True, interpolate timestamp precision prior 664 | to applying the algorithm according to 665 | Holden and Jacobsen (2014) 666 | 667 | Returns 668 | ------- 669 | None. Result is appended to the provided dataframe. 670 | 671 | References 672 | ---------- 673 | Holden, C.W., Jacobsen, S., 2014. Liquidity measurement problems in fast, com- 674 | petitive markets: expensive and cheap solutions. The Journal of Finance 69, 675 | 1747–1785. 676 | 677 | Ellis, K., Michaely, R., O’Hara, M., 2000. The accuracy of trade classification rules: 678 | Evidence from Nasdaq. Journal of Financial and Quantitative Analysis 35, 529–551. 
679 | 680 | """ 681 | 682 | # cleanup 683 | self.df_tr.drop(columns=['ask','bid'], errors='ignore',inplace=True) 684 | 685 | if interpolate: 686 | self.tcol_interpolation() 687 | timecol = 'time_inter' 688 | else: 689 | timecol = 'time' 690 | 691 | lastask = get_lastquote(self.Ask[[timecol,'price']].rename(columns={'time_inter': 'time'}), 692 | self.df_tr[timecol].unique() 693 | ) 694 | 695 | lastbid = get_lastquote(self.Bid[[timecol,'price']].rename(columns={'time_inter': 'time'}), 696 | self.df_tr[timecol].unique() 697 | ) 698 | 699 | self.df_tr = self.df_tr.merge(lastask.to_frame(name='ask').join(lastbid.to_frame(name='bid'), how='outer'), 700 | left_on=timecol, right_index=True, how='left') 701 | 702 | mask = self.df_tr.ask<=self.df_tr.bid 703 | self.df_tr.loc[mask,'ask'] = np.nan 704 | self.df_tr.loc[mask,'bid'] = np.nan 705 | 706 | self.df_tr['Initiator'] = 0 707 | self.df_tr['Step'] = 0 708 | 709 | self.df_tr.loc[self.df_tr.price==self.df_tr.ask,'Initiator'] = 1 710 | self.df_tr.loc[self.df_tr.price==self.df_tr.bid,'Initiator'] = -1 711 | 712 | self.df_tr.loc[(self.df_tr.Initiator==1) | (self.df_tr.Initiator==-1), 'Step'] = 1 713 | 714 | # tick rule 715 | self.apply_tick() 716 | self.df_tr.loc[self.df_tr.Step==0,'Step'] = 2 717 | 718 | return 719 | 720 | 721 | def clnv(self,interpolate=False,**kwargs): 722 | """ 723 | Classify trades using the algorithm of Chakrabarty et al. (2007). 724 | 725 | Parameters 726 | ---------- 727 | interpolate : bool (default False) 728 | If True, interpolate timestamp precision prior 729 | to applying the algorithm according to 730 | Holden and Jacobsen (2014) 731 | 732 | Returns 733 | ------- 734 | None. Result is appended to the provided dataframe. 735 | 736 | References 737 | ---------- 738 | Chakrabarty, B., Li, B., Nguyen, V., Van Ness, R.A., 2007. Trade classification 739 | algorithms for electronic communications network trades. Journal of Banking & 740 | Finance 31, 3806–3821. 
741 | 742 | Holden, C.W., Jacobsen, S., 2014. Liquidity measurement problems in fast, com- 743 | petitive markets: expensive and cheap solutions. The Journal of Finance 69, 744 | 1747–1785. 745 | 746 | """ 747 | 748 | # cleanup 749 | self.df_tr.drop(columns=['ask','bid'], errors='ignore',inplace=True) 750 | 751 | if interpolate: 752 | self.tcol_interpolation() 753 | timecol = 'time_inter' 754 | else: 755 | timecol = 'time' 756 | 757 | lastask = get_lastquote(self.Ask[[timecol,'price']].rename(columns={'time_inter': 'time'}), 758 | self.df_tr[timecol].unique() 759 | ) 760 | 761 | lastbid = get_lastquote(self.Bid[[timecol,'price']].rename(columns={'time_inter': 'time'}), 762 | self.df_tr[timecol].unique() 763 | ) 764 | 765 | self.df_tr = self.df_tr.merge(lastask.to_frame(name='ask').join(lastbid.to_frame(name='bid'), how='outer'), 766 | left_on=timecol, right_index=True, how='left') 767 | 768 | mask = self.df_tr.ask<=self.df_tr.bid 769 | self.df_tr.loc[mask,'ask'] = np.nan 770 | self.df_tr.loc[mask,'bid'] = np.nan 771 | 772 | self.df_tr['Initiator'] = 0 773 | self.df_tr['Step'] = 0 774 | 775 | self.df_tr.loc[(self.df_tr.price>0.7*self.df_tr.ask + 0.3*self.df_tr.bid) & (self.df_tr.price<=self.df_tr.ask),'Initiator'] = 1 776 | self.df_tr.loc[(self.df_tr.price<0.3*self.df_tr.ask + 0.7*self.df_tr.bid) & (self.df_tr.price>=self.df_tr.bid),'Initiator'] = -1 777 | 778 | self.df_tr.loc[(self.df_tr.Initiator==1) | (self.df_tr.Initiator==-1), 'Step'] = 1 779 | 780 | # tick rule 781 | self.apply_tick() 782 | self.df_tr.loc[self.df_tr.Step==0,'Step'] = 2 783 | 784 | return 785 | 786 | 787 | def apply_tick(self): 788 | """Classify trades using the tick-test. 
Used in conjunction with one of 789 | the other algorithms, but can be used standalone if a `Step` column 790 | containing only zeros is given in the transaction dataframe.""" 791 | 792 | # tick rule 793 | mask = self.df_tr.Step==0 794 | 795 | trrest = self.df_tr.loc[mask,['price']].reset_index(drop=False).values.astype(int) 796 | index_p, prices = trrest[:,0], trrest[:,1] 797 | 798 | s = tick_rule(self.df_tr.price.values.astype(int), prices, index_p) 799 | 800 | self.df_tr.loc[mask,'Initiator'] = s 801 | 802 | return 803 | 804 | def true_initiator(self): 805 | self.df_tr['Initiator'] = self.df_tr.direction*-1 806 | return 807 | 808 | 809 | def evaluate_bulkclass(self,buyvol,target): 810 | """ 811 | Evaluate classification result when estimated as a fraction of 812 | trading volume over time or volume intervals. Criterium follows 813 | Chakrabarty et al. (2015): 814 | 815 | ..math:: \sum_{i} \min(V_i^B,\hat{V}_i^B) + \min(V_i^S,\hat{V}_i^S) / \sum_i V_i. 816 | 817 | Parameters: 818 | ----------- 819 | buyvol : pandas.DataFrame 820 | Estimated buyer initiated volume. Indexed by estimation intervals. 821 | Must contain `buy_vol` and `vol` columns to containing the buyer- 822 | initiated volume and total trading volume for each interval. 823 | target : pandas.DataFrame 824 | Same as `buyvol` but with the true buyer-initiated volume 825 | 826 | Returns: 827 | -------- 828 | float 829 | 830 | References: 831 | ----------- 832 | Chakrabarty, B., Pascual, R., Shkilko, A., 2015. Evaluating trade classification 833 | algorithms: Bulk volume classification versus the tick rule and the Lee-Ready algo- 834 | rithm. Journal of Financial Markets 25, 52–79. 
835 | 836 | """ 837 | 838 | vb = np.minimum(buyvol.buy_vol, target.buy_vol).sum() 839 | vs = np.minimum(buyvol.vol-buyvol.buy_vol, target.vol-target.buy_vol).sum() 840 | 841 | s = vb + vs 842 | vol = target.vol.sum() 843 | return s/vol 844 | 845 | 846 | def tcol_interpolation(self): 847 | """Interpolate original timestamp in all dataframes. Result saved in new column 'time_inter'.""" 848 | 849 | self.df_tr['time_inter'] = interpolate_time(self.df_tr.time.values,self.freq,hj_version=True) 850 | self.Ask['time_inter'] = interpolate_time(self.Ask.time.values,self.freq,hj_version=True) 851 | self.Bid['time_inter'] = interpolate_time(self.Bid.time.values,self.freq,hj_version=True) 852 | 853 | return 854 | 855 | 856 | def into_bins(self,n,bin_type='vol'): 857 | """Split data into n equally sized bins, either by time or by volume.""" 858 | 859 | x = self.df_tr[bin_type].values 860 | if bin_type=='vol': 861 | x = np.cumsum(x) 862 | 863 | bins = np.linspace(np.min(x),np.max(x), n+1)[1:] 864 | group = np.searchsorted(bins,x) 865 | 866 | self.df_tr['group'] = group 867 | return 868 | 869 | 870 | def get_orderimbalance(self,n,bin_type='vol'): 871 | """ 872 | Returns the order imbalance computed from individually 873 | classified trades over `n` data intervals. (To specify 874 | the length of the intverals rather than the number, use 875 | the `buyvolume` method.) 876 | 877 | Parameters 878 | ---------- 879 | n : int 880 | Number of intervals to split the data into. 881 | bin_type : str (default : 'vol') 882 | If 'vol', data are split into `n` volume bins. If 883 | 'time', data are split into `n` time bins. 
884 | 885 | Returns 886 | ------- 887 | pandas.DataFrame 888 | 889 | """ 890 | 891 | self.into_bins(n,bin_type=bin_type) 892 | 893 | V = self.df_tr[['vol','group']].groupby('group').sum() 894 | Vb = self.df_tr.loc[self.df_tr.Initiator==1,['vol','group']].groupby('group').sum() 895 | Vs = self.df_tr.loc[self.df_tr.Initiator==-1,['vol','group']].groupby('group').sum() 896 | 897 | oi = (Vb.subtract(Vs,fill_value=0)).divide(V) 898 | return oi.rename(columns={'vol': 'oi'}) 899 | 900 | 901 | def impl_sf(self,iloc=True,tcol='time'): 902 | """ 903 | Returns the execution costs for each group of consecutive buyer- or 904 | seller-initiated trades. 905 | 906 | ..math:: e_i = o_i \sum_{t=1}^{\tau_i} (p_{i,t} - m_i)v_{t,i} 907 | 908 | where `o_i` is the trade direction of the i-th group of consectutive buyer or seller- 909 | initiated trades (1 for a buy, -1 for a sell order), `{p_it , v_it}` are the transaction 910 | prices (in log) and volumes of all trades belonging to the i-th group, and `m_i` is 911 | the mid-quote (also in log) at the time of the order. 912 | 913 | Parameters 914 | ---------- 915 | tcol : str (default 'time') 916 | Determines which column to use to determine the 917 | corresponding mid-quote for each group. 918 | 919 | Returns 920 | ------- 921 | pandas.DataFrame with group label, execution time of the first transaction 922 | of the group, total trading volume of the group and the execution cost. 
923 | 924 | """ 925 | 926 | net = self.df_tr[[tcol,'Initiator','vol','price']].rename(columns={'vol': 'net_vol'}) 927 | 928 | runs, start, end = get_runs(net.Initiator.values) 929 | 930 | net['group'] = runs 931 | if iloc: 932 | as_of = net.iloc[start][tcol].values 933 | else: 934 | as_of = net.loc[start,tcol].values 935 | 936 | midpoint = get_midpoint(self.Ask.rename(columns={tcol: 'time'}),self.Bid.rename(columns={tcol: 'time'}),as_of) 937 | midpoint['group'] = net.group.unique() 938 | midpoint.set_index('group',inplace=True) 939 | 940 | net = net.merge(midpoint, left_on='group', right_index=True, how='left') 941 | 942 | net['impl_shortfall'] = (np.log(net.price) - np.log(net.midpoint))*net['net_vol']*net.Initiator 943 | 944 | sf = net[['group','net_vol','impl_shortfall']].groupby('group').sum() 945 | sf['time'] = as_of 946 | 947 | return sf 948 | 949 | 950 | def estimate_execost(self,sf,params_only=True,quadratic=False): 951 | """ 952 | Returns the result from a price impact regression. 953 | 954 | ..math:: e_i = \beta_0 + \beta_1 v_i + \eps_i 955 | 956 | Parameters 957 | ---------- 958 | sf : pandas.DataFrame 959 | Contains a column 'impl_shortfall' with the transaction 960 | costs of the i-th order and a column 'net_vol' with 961 | the total volume of the i-th order. 962 | params_only : bool (defaul True) 963 | If True, returns numpy.array with the parameter estimates. 964 | Otherwise, statsmodel regression result is returned. 965 | quadratic : bool (default False) 966 | If True, use v_i^2 as additional regressor. 967 | 968 | Returns 969 | ------- 970 | numpy.array of parameter estimates or statsmodels regression 971 | result object. 
972 | 973 | """ 974 | mask = pd.notnull(sf.impl_shortfall) 975 | data = sm.add_constant(sf.loc[mask,['net_vol']].values) 976 | if quadratic: 977 | data = np.hstack([data,sf.loc[mask,['net_vol']].values**2]) 978 | 979 | model = sm.OLS(sf.loc[mask,'impl_shortfall'].values,data) 980 | 981 | res = model.fit() #cov_type='HC3' 982 | return res.params if params_only else res -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Feb 01 18:40:32 2017 4 | 5 | @author: sjurkatis 6 | """ 7 | 8 | try: 9 | from setuptools import setup 10 | from setuptools import Extension 11 | except ImportError: 12 | from distutils.core import setup 13 | from distutils.extension import Extension 14 | 15 | from Cython.Distutils import build_ext 16 | import numpy 17 | 18 | #libraries = ['msvcr90.dll'] 19 | ext_modules = [Extension('tradeclassification_c', 20 | ['tradeclassification_c.pyx'], 21 | include_dirs = [numpy.get_include()])] 22 | 23 | setup(cmdclass = {'build_ext': build_ext}, ext_modules = ext_modules) 24 | 25 | #from Cython.Build import cythonize 26 | #setup(ext_modules = cythonize('c_get_inds.pyx' , include_dirs = [numpy.get_include()])) 27 | 28 | -------------------------------------------------------------------------------- /tradeclassification_c.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jul 30 13:16:59 2017 4 | 5 | @author: sjurkatis 6 | """ 7 | 8 | 9 | #from libc.stdlib cimport malloc, free 10 | import numpy as np 11 | cimport numpy as np 12 | cimport cython 13 | #from cpython cimport array 14 | #import array 15 | 16 | DTYPE = np.int 17 | ctypedef np.int_t DTYPE_t 18 | 19 | DTYPEf = np.float64 20 | ctypedef np.float64_t DTYPEf_t 21 | 22 | @cython.boundscheck(False) 23 | @cython.wraparound(False) 24 | def 
get_ind(double [:] arr, double [:] time): 25 | cdef: 26 | int n = time.shape[0] 27 | int s = arr.shape[0] 28 | int begin, end, found, k 29 | unsigned int ind = 0 30 | size_t j, i 31 | np.ndarray[DTYPE_t, ndim=1] left = np.zeros(n, dtype=DTYPE) 32 | np.ndarray[DTYPE_t, ndim=1] right = np.zeros(n, dtype=DTYPE) 33 | double [:] tmp 34 | double v 35 | 36 | for j in xrange(n): 37 | begin = s 38 | end = 0 39 | found = 0 40 | tmp = arr[ind:] 41 | k = tmp.shape[0] 42 | for i in xrange(k): 43 | v = tmp[i] - time[j] 44 | if not found and v>=0: 45 | found = 1 46 | begin = ind + i 47 | 48 | if found: 49 | if v > 0: 50 | ind += i 51 | end = ind 52 | break 53 | elif i==k-1: 54 | end = s 55 | break 56 | 57 | left[j] = begin 58 | right[j] = end 59 | 60 | return left, right 61 | 62 | 63 | @cython.boundscheck(False) 64 | @cython.wraparound(False) 65 | def sign_trades_ds1(long[:] P, long[:] V, long[:] Al, long[:] Ar, long[:] Bl, long[:] Br, long[:] runlength, long[:] askp, long[:] bidp, long[:] avdiff, long[:] bvdiff, double[:] atime, double[:] btime, double bar): 66 | 67 | cdef: 68 | int n = Al.shape[0] 69 | int anum = askp.shape[0] 70 | int bnum = bidp.shape[0] 71 | int trnum = P.shape[0] 72 | np.ndarray[DTYPE_t, ndim=1] s = np.zeros(trnum, dtype=DTYPE) 73 | np.ndarray[DTYPE_t, ndim=1] c = np.zeros(trnum, dtype=DTYPE) 74 | #np.ndarray[DTYPE_t, ndim=1] ask = np.zeros(trnum, dtype=DTYPE) 75 | #np.ndarray[DTYPE_t, ndim=1] bid = np.zeros(trnum, dtype=DTYPE) 76 | size_t j,i 77 | int al, ar, bl, br, last_ask, last_bid, k, d_a, d_b, tmpind, p, v 78 | np.ndarray[DTYPE_t, ndim=1] discard_a = np.zeros(anum, dtype=DTYPE) 79 | np.ndarray[DTYPE_t, ndim=1] discard_b = np.zeros(bnum, dtype=DTYPE) 80 | int ind = 0 81 | bint av_match, bv_match 82 | double upper, lower 83 | 84 | for j in range(n): 85 | 86 | al = Al[j] 87 | ar = Ar[j] 88 | bl = Bl[j] 89 | br = Br[j] 90 | 91 | last_ask = askp[al] 92 | last_bid = bidp[bl] 93 | for i in range(runlength[j]): 94 | tmpind = ind+i 95 | p = P[tmpind] 96 | v = 
V[tmpind] 97 | 98 | av_match = False 99 | bv_match = False 100 | 101 | #if ar>0: 102 | # ask[tmpind] = last_ask 103 | #else: 104 | # ask[tmpind] = -1 105 | 106 | #if br>0: 107 | # bid[tmpind] = last_bid 108 | #else: 109 | # bid[tmpind] = -1 110 | 111 | for k in range(ar-al): 112 | if discard_a[al+k]==0 and askp[al+k]==p and avdiff[al+k]==v: 113 | av_match = True 114 | d_a = k 115 | break 116 | 117 | for k in range(br-bl): 118 | if discard_b[bl+k]==0 and bidp[bl+k]==p and bvdiff[bl+k]==v: 119 | bv_match = True 120 | d_b = k 121 | break 122 | 123 | if av_match and not bv_match: 124 | s[tmpind] = 1 125 | c[tmpind] = 1 126 | #ask[tmpind] = p 127 | 128 | last_ask = askp[al+d_a] 129 | elif bv_match and not av_match: 130 | s[tmpind] = -1 131 | c[tmpind] = 1 132 | #bid[tmpind] = p 133 | 134 | last_bid = bidp[bl+d_b] 135 | elif av_match and bv_match: 136 | 137 | if atime[al+d_a] < btime[bl+d_b]: 138 | s[tmpind] = 1 139 | c[tmpind] = 2 140 | #ask[tmpind] = p 141 | 142 | discard_a[al+d_a] = 1 143 | last_ask = askp[al+d_a] 144 | 145 | elif atime[al+d_a] > btime[bl+d_b]: 146 | s[tmpind] = -1 147 | c[tmpind] = 2 148 | #bid[tmpind] = p 149 | 150 | 151 | discard_b[bl+d_b] = 1 152 | last_bid = bidp[bl+d_b] 153 | 154 | else: 155 | 156 | # not av_match and not bv_match: 157 | # most likely hidden order 158 | if last_ask > last_bid: 159 | upper = last_ask*(1-bar) + last_bid*bar 160 | lower = last_ask*bar + last_bid*(1-bar) 161 | #x = (p-last_bid)/(last_ask-last_bid) 162 | 163 | if p>upper: #x>1-bar: 164 | s[tmpind] = 1 165 | c[tmpind] = 3 166 | elif p=trV: 214 | av_match = True 215 | d_a = k 216 | break 217 | 218 | for k in range(br-bl): 219 | if bidp[bl+k]==p and bvdiff[bl+k]>=trV: 220 | bv_match = True 221 | d_b = k 222 | break 223 | 224 | if av_match and not bv_match: 225 | s[tmpind] = 1 226 | c[tmpind] = 1 227 | #last_ask = p 228 | 229 | elif bv_match and not av_match: 230 | s[tmpind] = -1 231 | c[tmpind] = 1 232 | #last_bid = p 233 | 234 | elif av_match and bv_match: 235 | 236 | 
if atime[al+d_a] < btime[bl+d_b]: 237 | 238 | s[tmpind] = 1 239 | c[tmpind] = 2 240 | 241 | avdiff[al+d_a] -= trV 242 | #last_ask = p 243 | 244 | elif atime[al+d_a] > btime[bl+d_b]: 245 | s[tmpind] = -1 246 | c[tmpind] = 2 247 | 248 | bvdiff[bl+d_b] -= trV 249 | #last_bid = p 250 | 251 | else: 252 | # there are two possibilites for a visible order to not find a 253 | # match: 254 | # (1): a market order trades against the best quote and the next 255 | # best quote, due to the size of the market order. The order 256 | # book, however, records only best quote and volume before 257 | # the market order and the new best quote with corresponding 258 | # volume after the completion of the full market order. That is, 259 | # we have a price match, but not the corresponding volume 260 | # change match. 261 | 262 | for k in range(ar-al): 263 | if al+k+1==anum: 264 | break 265 | elif askp[al+k]

p and bidp[bl+k+1]==p: 274 | bv_match = True 275 | d_b = k+1 276 | break 277 | 278 | if av_match and not bv_match: 279 | s[tmpind] = 1 280 | c[tmpind] = 5 281 | #last_ask = p 282 | 283 | 284 | elif bv_match and not av_match: 285 | s[tmpind] = -1 286 | c[tmpind] = 5 287 | #last_bid = p 288 | 289 | elif av_match and bv_match: 290 | if atime[al+d_a] < btime[bl+d_b]: 291 | s[tmpind] = 1 292 | c[tmpind] = 6 293 | #last_ask = p 294 | 295 | elif atime[al+d_a] > btime[bl+d_b]: 296 | s[tmpind] = -1 297 | c[tmpind] = 6 298 | #last_bid = p 299 | 300 | 301 | else: 302 | # if there is no match with a price, there is a second possibility 303 | # (2): The market order goes through the levels 1 to n>2 of 304 | # the order book. The prices between level 1 and n are 305 | # not displayed in the order book. Then all transactions 306 | # taking place between level 1 and n have no price match 307 | for k in range(ar-al): 308 | if al+k+1==anum: 309 | break 310 | elif askp[al+k]

p: 311 | av_match = True 312 | d_a = k+1 313 | break 314 | 315 | for k in range(br-bl): 316 | if bl+k+1==bnum: 317 | break 318 | elif bidp[bl+k]>p and bidp[bl+k+1] btime[bl+d_b]: 342 | s[tmpind] = -1 343 | c[tmpind] = 8 344 | #last_bid = p 345 | 346 | 347 | else: # still no match; must be a hidden order 348 | # not av_match and not bv_match: 349 | # most likely hidden order 350 | if last_ask > last_bid: 351 | upper = last_ask*(1-bar) + last_bid*bar 352 | lower = last_ask*bar + last_bid*(1-bar) 353 | #x = (p-last_bid)/(last_ask-last_bid) 354 | 355 | if p>upper: #x>1-bar: 356 | s[tmpind] = 1 357 | c[tmpind] = 3 358 | elif p=trV: 406 | av_match = True 407 | d_a = k 408 | break 409 | 410 | for k in range(br-bl): 411 | if bidp[bl+k]==p and bidv[bl+k]>=trV: 412 | bv_match = True 413 | d_b = k 414 | break 415 | 416 | if av_match and not bv_match: 417 | s[tmpind] = 1 418 | c[tmpind] = 1 419 | 420 | elif bv_match and not av_match: 421 | s[tmpind] = -1 422 | c[tmpind] = 1 423 | 424 | elif av_match and bv_match: 425 | 426 | if atime[al+d_a] < btime[bl+d_b]: 427 | s[tmpind] = 1 428 | c[tmpind] = 2 429 | 430 | askv[al+d_a] -=trV 431 | elif atime[al+d_a] > btime[bl+d_b]: 432 | s[tmpind] = -1 433 | c[tmpind] = 2 434 | 435 | bidv[bl+d_b] -=trV 436 | 437 | else: # still no match; must be a hidden order 438 | # not av_match and not bv_match: 439 | # most likely hidden order 440 | if last_ask > last_bid: 441 | upper = last_ask*(1-bar) + last_bid*bar 442 | lower = last_ask*bar + last_bid*(1-bar) 443 | #x = (p-last_bid)/(last_ask-last_bid) 444 | 445 | if p>upper: #x>1-bar: 446 | s[tmpind] = 1 447 | c[tmpind] = 3 448 | elif pcount: 477 | count += 1 478 | lp = lstp[sz-count] 479 | if p>lp: 480 | s[j] = 1 481 | break 482 | elif p=w: 503 | g+=1 504 | vol = 0 505 | 506 | return group 507 | 508 | @cython.boundscheck(False) 509 | @cython.wraparound(False) 510 | @cython.cdivision(True) 511 | def concat_runs(long[:] x, bint hj_version=False): 512 | 513 | cdef: 514 | int n = x.shape[0] 515 | 
size_t i, j 516 | double k 517 | size_t count = 0 518 | np.ndarray[DTYPEf_t, ndim=1] interp = np.zeros(sum(x), dtype=DTYPEf) 519 | 520 | if hj_version: 521 | for i in range(n): 522 | k = x[i]+1. 523 | for j in range(1,x[i]+1): 524 | interp[count] = (2*j-1)/(2*k) 525 | count += 1 526 | else: 527 | for i in range(n): 528 | k = x[i]+1. 529 | for j in range(1,x[i]+1): 530 | interp[count] = j/k 531 | count += 1 532 | 533 | return interp --------------------------------------------------------------------------------