├── .github └── workflows │ ├── main.yml │ └── pythontesting.py ├── README.md ├── StockAnalysis.ipynb ├── cipython.py └── stock.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | push: 9 | branches: [ master ] 10 | pull_request: 11 | branches: [ master ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 17 | jobs: 18 | # This workflow contains a single job called "build" 19 | build: 20 | # The type of runner that the job will run on 21 | runs-on: ubuntu-latest 22 | 23 | # Steps represent a sequence of tasks that will be executed as part of the job 24 | steps: 25 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 26 | - uses: actions/checkout@v2 27 | 28 | # Runs a single command using the runners shell 29 | - name: Run a one-line script 30 | run: pip install pandas 31 | - name : run code 32 | run: python cipython.py 33 | - name : run more code 34 | run : python .github/workflows/pythontesting.py 35 | 36 | # Runs a set of commands using the runners shell 37 | - name: Run a multi-line script 38 | run: | 39 | echo Add other actions to build, 40 | echo test, and deploy your project. 
41 | -------------------------------------------------------------------------------- /.github/workflows/pythontesting.py: -------------------------------------------------------------------------------- 1 | print('HELLLOOOO WORLD THIS MEANS THE CI ACCESSED THE PYTHON FILE') 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StockForecast 2 | Using machine learning to predict/forecast the future trend of stock prices. 3 | 4 | 5 | Check out the Medium article that goes into detail about this repo! 6 | https://medium.com/@lucas.rea1998/predicting-future-stock-market-trends-with-python-machine-learning-2bf3f1633b3c 7 | -------------------------------------------------------------------------------- /StockAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import yfinance as yf\n", 10 | "import datetime\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from finta import TA\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "\n", 16 | "from sklearn import svm\n", 17 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier \n", 18 | "from sklearn.neighbors import KNeighborsClassifier\n", 19 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 20 | "from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, accuracy_score" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "\"\"\"\n", 30 | "Defining some constants for data mining\n", 31 | "\"\"\"\n", 32 | "\n", 33 | "NUM_DAYS = 10000 # The number of days of historical data to retrieve\n", 34 | "INTERVAL = '1d' # 
Sample rate of historical data\n", 35 | "symbol = 'SPY' # Symbol of the desired stock\n", 36 | "\n", 37 | "# List of symbols for technical indicators\n", 38 | "INDICATORS = ['RSI', 'MACD', 'STOCH','ADL', 'ATR', 'MOM', 'MFI', 'ROC', 'OBV', 'CCI', 'EMV', 'VORTEX']" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "[*********************100%***********************] 1 of 1 completed\n", 51 | "6895\n" 52 | ] 53 | }, 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | }, 64 | { 65 | "data": { 66 | "image/png": "\n", 67 | "text/plain": [ 68 | "
" 69 | ] 70 | }, 71 | "metadata": { 72 | "needs_background": "light" 73 | }, 74 | "output_type": "display_data" 75 | } 76 | ], 77 | "source": [ 78 | "\"\"\"\n", 79 | "Next we pull the historical data using yfinance\n", 80 | "Rename the column names because finta uses the lowercase names\n", 81 | "\"\"\"\n", 82 | "\n", 83 | "start = (datetime.date.today() - datetime.timedelta( NUM_DAYS ) )\n", 84 | "end = datetime.datetime.today()\n", 85 | "\n", 86 | "data = yf.download(symbol, start=start, end=end, interval=INTERVAL)\n", 87 | "data.rename(columns={\"Close\": 'close', \"High\": 'high', \"Low\": 'low', 'Volume': 'volume', 'Open': 'open'}, inplace=True)\n", 88 | "print(len(data))\n", 89 | "\n", 90 | "tmp = data.iloc[-60:]\n", 91 | "tmp['close'].plot()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "" 103 | ] 104 | }, 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | }, 109 | { 110 | "data": { 111 | "image/png": "\n", 112 | "text/plain": [ 113 | "
" 114 | ] 115 | }, 116 | "metadata": { 117 | "needs_background": "light" 118 | }, 119 | "output_type": "display_data" 120 | } 121 | ], 122 | "source": [ 123 | "\"\"\"\n", 124 | "Next we clean our data and perform feature engineering to create new technical indicator features that our\n", 125 | "model can learn from\n", 126 | "\"\"\"\n", 127 | "\n", 128 | "def _exponential_smooth(data, alpha):\n", 129 | " \"\"\"\n", 130 | " Function that exponentially smooths dataset so values are less 'rigid'\n", 131 | " :param alpha: weight factor to weight recent values more\n", 132 | " \"\"\"\n", 133 | " \n", 134 | " return data.ewm(alpha=alpha).mean()\n", 135 | "\n", 136 | "data = _exponential_smooth(data, 0.65)\n", 137 | "\n", 138 | "tmp1 = data.iloc[-60:]\n", 139 | "tmp1['close'].plot()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Index(['close', '14 period RSI', 'MACD', 'SIGNAL', '14 period STOCH %K', 'MFV',\n", 152 | " '14 period ATR', 'MOM', '14 period MFI', 'ROC', 'OBV', '20 period CCI',\n", 153 | " '14 period EMV', 'VIm', 'VIp', 'ema50', 'ema21', 'ema15', 'ema5',\n", 154 | " 'normVol'],\n", 155 | " dtype='object')\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "def _get_indicator_data(data):\n", 161 | " \"\"\"\n", 162 | " Function that uses the finta API to calculate technical indicators used as the features\n", 163 | " :return:\n", 164 | " \"\"\"\n", 165 | "\n", 166 | " for indicator in INDICATORS:\n", 167 | " ind_data = eval('TA.' 
+ indicator + '(data)')\n", 168 | " if not isinstance(ind_data, pd.DataFrame):\n", 169 | " ind_data = ind_data.to_frame()\n", 170 | " data = data.merge(ind_data, left_index=True, right_index=True)\n", 171 | " data.rename(columns={\"14 period EMV.\": '14 period EMV'}, inplace=True)\n", 172 | "\n", 173 | " # Also calculate moving averages for features\n", 174 | " data['ema50'] = data['close'] / data['close'].ewm(50).mean()\n", 175 | " data['ema21'] = data['close'] / data['close'].ewm(21).mean()\n", 176 | " data['ema15'] = data['close'] / data['close'].ewm(14).mean()\n", 177 | " data['ema5'] = data['close'] / data['close'].ewm(5).mean()\n", 178 | "\n", 179 | " # Instead of using the actual volume value (which changes over time), we normalize it with a moving volume average\n", 180 | " data['normVol'] = data['volume'] / data['volume'].ewm(5).mean()\n", 181 | "\n", 182 | " # Remove columns that won't be used as features\n", 183 | " del (data['open'])\n", 184 | " del (data['high'])\n", 185 | " del (data['low'])\n", 186 | " del (data['volume'])\n", 187 | " del (data['Adj Close'])\n", 188 | " \n", 189 | " return data\n", 190 | "\n", 191 | "data = _get_indicator_data(data)\n", 192 | "print(data.columns)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 6, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "live_pred_data = data.iloc[-16:-11]" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "6866\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "def _produce_prediction(data, window):\n", 219 | " \"\"\"\n", 220 | " Function that produces the 'truth' values\n", 221 | " At a given row, it looks 'window' rows ahead to see if the price increased (1) or decreased (0)\n", 222 | " :param window: number of days, or rows to look ahead to see what the price did\n", 223 | " \"\"\"\n", 
224 | " \n", 225 | " prediction = (data.shift(-window)['close'] >= data['close'])\n", 226 | " prediction = prediction.iloc[:-window]\n", 227 | " data['pred'] = prediction.astype(int)\n", 228 | " \n", 229 | " return data\n", 230 | "\n", 231 | "data = _produce_prediction(data, window=15)\n", 232 | "del (data['close'])\n", 233 | "data = data.dropna() # Some indicators produce NaN values for the first few rows, we just remove them here\n", 234 | "data.tail()\n", 235 | "print(len(data))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 16, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stderr", 245 | "output_type": "stream", 246 | "text": [ 247 | "/Users/lucasrea/.pyenv/versions/3.7.3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", 248 | " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" 249 | ] 250 | }, 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "RF Accuracy = 0.6554416788677402\n", 256 | "KNN Accuracy = 0.6809419228892142\n", 257 | "ENSEMBLE Accuracy = 0.6817959980478276\n" 258 | ] 259 | } 260 | ], 261 | "source": [ 262 | "def cross_Validation(data):\n", 263 | "\n", 264 | " # Split data into equal partitions of size len_train\n", 265 | " \n", 266 | " num_train = 10 # Increment of how many starting points (len(data) / num_train = number of train-test sets)\n", 267 | " len_train = 40 # Length of each train-test set\n", 268 | " \n", 269 | " # Lists to store the results from each model\n", 270 | " rf_RESULTS = []\n", 271 | " knn_RESULTS = []\n", 272 | " gbt_RESULTS = []\n", 273 | " ensemble_RESULTS = []\n", 274 | " \n", 275 | " i = 0\n", 276 | " \n", 277 | " # Models which will be used\n", 278 | " rf = RandomForestClassifier()\n", 279 | " knn = KNeighborsClassifier()\n", 280 | " \n", 281 | " # Create a tuple list of our models\n", 282 | " estimators=[('knn', 
knn), ('rf', rf)]\n", 283 | " ensemble = VotingClassifier(estimators, voting='soft')\n", 284 | " \n", 285 | " while True:\n", 286 | " \n", 287 | " # Partition the data into chunks of size len_train every num_train days\n", 288 | " df = data.iloc[i * num_train : (i * num_train) + len_train]\n", 289 | " i += 1\n", 290 | " #print(i * num_train, (i * num_train) + len_train)\n", 291 | " \n", 292 | " if len(df) < 40:\n", 293 | " break\n", 294 | " \n", 295 | " y = df['pred']\n", 296 | " features = [x for x in df.columns if x not in ['pred']]\n", 297 | " X = df[features]\n", 298 | "\n", 299 | " X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 7 * len(X) // 10,shuffle=False)\n", 300 | " \n", 301 | " # fit models\n", 302 | " rf.fit(X_train, y_train)\n", 303 | " knn.fit(X_train, y_train)\n", 304 | " ensemble.fit(X_train, y_train)\n", 305 | " \n", 306 | " # get predictions\n", 307 | " rf_prediction = rf.predict(X_test)\n", 308 | " knn_prediction = knn.predict(X_test)\n", 309 | " ensemble_prediction = ensemble.predict(X_test)\n", 310 | " \n", 311 | "# print('rf prediction is ', rf_prediction)\n", 312 | "# print('knn prediction is ', knn_prediction)\n", 313 | "# print('ensemble prediction is ', ensemble_prediction)\n", 314 | "# print('truth values are ', y_test.values)\n", 315 | " \n", 316 | " # determine accuracy and append to results\n", 317 | " rf_accuracy = accuracy_score(y_test.values, rf_prediction)\n", 318 | " knn_accuracy = accuracy_score(y_test.values, knn_prediction)\n", 319 | " ensemble_accuracy = accuracy_score(y_test.values, ensemble_prediction)\n", 320 | " \n", 321 | "# print(rf_accuracy)\n", 322 | "# print(knn_accuracy)\n", 323 | "# print(ensemble_accuracy)\n", 324 | " rf_RESULTS.append(rf_accuracy)\n", 325 | " knn_RESULTS.append(knn_accuracy)\n", 326 | " ensemble_RESULTS.append(ensemble_accuracy)\n", 327 | " \n", 328 | " print('RF Accuracy = ' + str( sum(rf_RESULTS) / len(rf_RESULTS)))\n", 329 | " print('KNN Accuracy = ' + str( 
sum(knn_RESULTS) / len(knn_RESULTS)))\n", 330 | " print('ENSEMBLE Accuracy = ' + str( sum(ensemble_RESULTS) / len(ensemble_RESULTS)))\n", 331 | " \n", 332 | "cross_Validation(data)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.7.3" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 4 364 | } 365 | -------------------------------------------------------------------------------- /cipython.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | print('this code ran') 4 | 5 | -------------------------------------------------------------------------------- /stock.py: -------------------------------------------------------------------------------- 1 | import yfinance as yf 2 | import datetime 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | from finta import TA 7 | import time 8 | 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import confusion_matrix, classification_report 12 | 13 | from pyspark.sql import SparkSession 14 | from pyspark.ml.feature import VectorAssembler 15 | from pyspark.ml.classification import RandomForestClassifier 16 | from pyspark.ml.classification import GBTClassifier 17 | from pyspark.ml import Pipeline 18 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 19 | 20 | 21 | 22 | spark = 
SparkSession.builder.appName('stockanalysis').getOrCreate()


class Ticker:
    """
    Build a labelled feature table for one stock symbol and train classifiers on it.

    Constructing an instance downloads daily price history through yfinance.
    The private methods then smooth the prices, derive finta technical
    indicators, attach a binary "price went up `window` rows later" label and
    train either a scikit-learn random forest or a Spark gradient-boosted tree
    model on the result.
    """

    NUM_DAYS = 1000  # The number of days of historical data to retrieve
    INTERVAL = '1d'  # Sample rate of historical data

    # List of symbols for technical indicators
    INDICATORS = ['RSI', 'MACD', 'STOCH', 'ADL', 'ATR', 'MOM', 'MFI', 'ROC', 'OBV', 'CCI', 'EMV', 'VORTEX']

    # Raw price columns that are removed once the indicators have been derived
    _RAW_COLUMNS = ['open', 'high', 'low', 'volume', 'Adj Close']

    def __init__(self, symbol: str) -> None:
        """
        Constructor for class
        Will obtain historical data for NUM_DAYS number of days
        :param symbol: ticker of stock
        """

        self.symbol = symbol
        self._get_historical_data()

    def _get_historical_data(self) -> None:
        """
        Function that uses the yfinance API to get stock data
        Stores the result in self.data with the OHLCV columns renamed to lowercase
        :return:
        """

        start = datetime.date.today() - datetime.timedelta(self.NUM_DAYS)
        end = datetime.datetime.today()

        self.data = yf.download(self.symbol, start=start, end=end, interval=self.INTERVAL)
        # finta looks the price columns up by their lowercase names
        self.data.rename(
            columns={'Close': 'close', 'High': 'high', 'Low': 'low', 'Volume': 'volume', 'Open': 'open'},
            inplace=True,
        )

    def _exponential_smooth(self, alpha: float) -> None:
        """
        Function that exponentially smooths dataset so values are less 'rigid'
        :param alpha: weight factor to weight recent values more
        """

        self.data = self.data.ewm(alpha=alpha).mean()

    def _get_indicator_data(self) -> None:
        """
        Function that uses the finta API to calculate technical indicators used as the features
        Every name in INDICATORS is looked up on finta's TA class and the result is
        merged into self.data on the index
        :raises AttributeError: if an entry of INDICATORS is not an attribute of TA
        :return:
        """

        for indicator in self.INDICATORS:
            # Attribute lookup instead of eval(): same call, no string execution
            ind_data = getattr(TA, indicator)(self.data)
            if not isinstance(ind_data, pd.DataFrame):
                ind_data = ind_data.to_frame()
            self.data = self.data.merge(ind_data, left_index=True, right_index=True)

        # Strip the trailing dot from the EMV column name
        self.data = self.data.rename(columns={'14 period EMV.': '14 period EMV'})

        # Also calculate moving averages for features
        # The first positional argument of ewm() is `com` (centre of mass), not
        # `span`; it is spelled out here so the smoothing actually used is visible.
        close = self.data['close']
        self.data['ema50'] = close / close.ewm(com=50).mean()
        self.data['ema21'] = close / close.ewm(com=21).mean()
        self.data['ema14'] = close / close.ewm(com=14).mean()
        self.data['ema5'] = close / close.ewm(com=5).mean()

        # Remove columns that won't be used as features
        # errors='ignore' because 'Adj Close' is not guaranteed to be present
        # NOTE(review): newer yfinance releases appear to omit it by default - confirm
        self.data = self.data.drop(columns=self._RAW_COLUMNS, errors='ignore')

    def _produce_prediction(self, window: int = 10) -> None:
        """
        Function that produces the 'truth' values
        At a given row, it looks 'window' rows ahead to see if the price increased (1) or decreased (0)
        The last 'window' rows have no future price and are left as NaN in 'pred'
        :param window: number of days, or rows to look ahead to see what the price did
        :raises ValueError: if window is smaller than 1
        """

        if window < 1:
            # iloc[:-0] would select nothing and leave every label NaN
            raise ValueError('window must be a positive number of rows')

        future_close = self.data['close'].shift(-window)
        prediction = (future_close >= self.data['close']).iloc[:-window]
        self.data['pred'] = prediction.astype(int)

    def _produce_data(self, window: int) -> None:
        """
        Main data function that calls the others to smooth, get features, and create the predictions
        Afterwards the 'close' column is removed and rows containing NaN are dropped
        :param window: value used to determine the prediction
        :return:
        """

        self._exponential_smooth(0.9)
        self._get_indicator_data()
        self._produce_prediction(window=window)

        del self.data['close']
        self.data = self.data.dropna()

    def _split_data(self) -> None:
        """
        Function to partition the data into the train and test set
        Two thirds of the rows go to the training set
        :return:
        """

        self.y = self.data['pred']
        features = [column for column in self.data.columns if column != 'pred']
        self.X = self.data[features]

        # NOTE(review): train_test_split shuffles by default and no random_state is
        # given, so the split differs between runs and mixes past and future rows.
        # Confirm whether shuffle=False is wanted for this time series.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, train_size=2 * len(self.X) // 3)

    def _train_random_forest(self) -> None:
        """
        Function that uses random forest classifier to train the model
        Prints the classification report, the confusion matrix and the feature importances
        :return:
        """

        # The module imports RandomForestClassifier from both scikit-learn and
        # pyspark, and the later pyspark import wins. Import the scikit-learn
        # estimator under its own name so the keyword arguments below are valid.
        from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier

        rf = SkRandomForestClassifier(n_jobs=-1, n_estimators=85, random_state=65)
        rf.fit(self.X_train, self.y_train.values.ravel())
        prediction = rf.predict(self.X_test)

        print(classification_report(self.y_test, prediction))
        print(confusion_matrix(self.y_test, prediction))
        print(rf.feature_importances_)

    def _data_clean(self, x: int = 15) -> None:
        """
        Build the feature table and the train/test split, printing how long it took
        :param x: look-ahead window handed to _produce_data
        """

        t1 = time.time()
        self._produce_data(window=x)
        self._split_data()
        print(str(time.time() - t1) + ' seconds to clean data')

    def _model(self) -> None:
        """
        Train the scikit-learn random forest and print the elapsed seconds
        """

        t1 = time.time()
        self._train_random_forest()
        print(time.time() - t1)

    def _spark_rf(self) -> None:
        """
        Train a Spark gradient-boosted tree classifier on self.data and print its test error
        Uses the module-level `spark` session; 70% of the rows are used for training
        """

        self.df = spark.createDataFrame(self.data)

        # Every column except the label is a feature
        features = [col for col in self.df.columns if col != 'pred']

        (trainingData, testData) = self.df.randomSplit([0.7, 0.3], seed=24234232)

        assembler = VectorAssembler(inputCols=features, outputCol="features")
        gbt = GBTClassifier(labelCol="pred", featuresCol="features", maxIter=200)
        pipeline = Pipeline(stages=[assembler, gbt])

        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)

        # Select (prediction, true label) and compute test error
        evaluator = MulticlassClassificationEvaluator(
            labelCol="pred", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))


t = Ticker('SPY')
t._data_clean()
t._spark_rf()


#t._model()