├── .gitignore ├── 1_get_data.py ├── 2_preprocess_data.py ├── 3_feature_eng.py ├── 4_model.py ├── 5_backtest.py ├── 6_strategies.py ├── README.md ├── data ├── nasdaq │ ├── data.csv │ └── nasdaq.csv ├── nyse │ ├── data.csv │ └── nyse.csv └── sp500 │ ├── data.csv │ └── sp500.csv ├── img ├── OESX.png ├── SAVA.png ├── backtest.png ├── roc.png ├── tearsheet.png ├── top_analysis.png └── variables_importance.png ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | tests 2 | models 3 | backtest 4 | *__pycache__* 5 | balanceSheetHistory.csv 6 | cashflowStatementHistory.csv 7 | dividends.csv 8 | incomeStatementHistory.csv 9 | shares.csv 10 | data_clean.csv 11 | data_evol_clean.csv 12 | prices_daily.csv -------------------------------------------------------------------------------- /1_get_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from yfinance import Ticker 5 | import pandas as pd 6 | from yahoofinancials import YahooFinancials 7 | import requests 8 | from tqdm import tqdm 9 | import time 10 | import pickle 11 | 12 | # with open('tmp.pickle', 'rb') as f: 13 | # statements, tickers_done = pickle.load(f) 14 | 15 | 16 | # Download function 17 | def _download_one(ticker, start=None, end=None, 18 | auto_adjust=False, back_adjust=False, 19 | actions=False, period="max", interval="1d", 20 | prepost=False, proxy=None, rounding=False): 21 | 22 | return Ticker(ticker).history(period=period, interval=interval, 23 | start=start, end=end, prepost=prepost, 24 | actions=actions, auto_adjust=auto_adjust, 25 | back_adjust=back_adjust, proxy=proxy, 26 | rounding=rounding, many=True) 27 | 28 | 29 | # Modify project and reference index according to your needs 30 | tickers_all = [] 31 | for project in ["sp500", "nyse", "nasdaq"]: 32 | print(project) 33 | ref_index = ["^GSPC", "^IXIC"] 34 | 35 | # Load tickers 
36 | companies = pd.read_csv(f"data/{project}/{project}.csv", sep=",") 37 | tickers = companies.Symbol.tolist() 38 | tickers = [a for a in tickers if a not in tickers_all and "^" not in a and r"/" not in a] 39 | tickers_all += tickers 40 | 41 | # Download prices 42 | full_data = {} 43 | for ticker in tqdm(tickers + ref_index): 44 | tckr = _download_one(ticker, 45 | period="7y", 46 | actions=True) 47 | full_data[ticker] = tckr 48 | ohlc = pd.concat(full_data.values(), axis=1, 49 | keys=full_data.keys()) 50 | ohlc.columns = ohlc.columns.swaplevel(0, 1) 51 | ohlc.sort_index(level=0, axis=1, inplace=True) 52 | prices = ohlc["Adj Close"] 53 | dividends = ohlc["Dividends"] 54 | prices.to_csv(f"data/{project}/prices_daily.csv") 55 | dividends.to_csv(f"data/{project}/dividends.csv") 56 | 57 | statements = {} 58 | tickers_done = [] 59 | for ticker in tqdm(tickers): 60 | # Get statements 61 | if ticker in tickers_done: 62 | continue 63 | yahoo_financials = YahooFinancials(ticker) 64 | stmts_codes = ['income', 'cash', 'balance'] 65 | all_statement_data = yahoo_financials.get_financial_stmts('annual', 66 | stmts_codes) 67 | # build statements dictionnary 68 | for a in all_statement_data.keys(): 69 | if a not in statements: 70 | statements[a] = list() 71 | for b in all_statement_data[a]: 72 | try: 73 | for result in all_statement_data[a][b]: 74 | extracted_date = list(result)[0] 75 | dataframe_row = list(result.values())[0] 76 | dataframe_row['date'] = extracted_date 77 | dataframe_row['symbol'] = b 78 | statements[a].append(dataframe_row) 79 | except Exception as e: 80 | print("Error on " + ticker + " : " + a) 81 | tickers_done.append(ticker) 82 | with open('tmp.pickle', 'wb') as f: 83 | pickle.dump([statements, tickers_done], f) 84 | 85 | # save dataframes 86 | for a in all_statement_data.keys(): 87 | df = pd.DataFrame(statements[a]).set_index('date') 88 | df.to_csv(f"data/{project}/{a}.csv") 89 | 90 | # Donwload shares 91 | shares = [] 92 | tickers_done = [] 93 | for ticker 
in tqdm(tickers): 94 | if ticker in tickers_done: 95 | continue 96 | d = requests.get(f"https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{ticker}?symbol={ticker}&padTimeSeries=true&type=annualPreferredSharesNumber,annualOrdinarySharesNumber&merge=false&period1=0&period2=2013490868") 97 | if not d.ok: 98 | time.sleep(300) 99 | d = requests.get(f"https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{ticker}?symbol={ticker}&padTimeSeries=true&type=annualPreferredSharesNumber,annualOrdinarySharesNumber&merge=false&period1=0&period2=2013490868") 100 | ctn = d.json()['timeseries']['result'] 101 | dct = dict() 102 | for n in ctn: 103 | type = n['meta']['type'][0] 104 | dct[type] = dict() 105 | if type in n: 106 | for o in n[type]: 107 | if o is not None: 108 | dct[type][o['asOfDate']] = o['reportedValue']['raw'] 109 | df = pd.DataFrame.from_dict(dct) 110 | df['symbol'] = ticker 111 | shares.append(df) 112 | tickers_done.append(ticker) 113 | time.sleep(1) 114 | 115 | # save dataframe 116 | df = pd.concat(shares) 117 | df['date'] = df.index 118 | df.to_csv(f"data/{project}/shares.csv", index=False) 119 | 120 | # 
https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/MSFT?symbol=MSFT&padTimeSeries=true&type=annualTreasurySharesNumber,trailingTreasurySharesNumber,annualPreferredSharesNumber,trailingPreferredSharesNumber,annualOrdinarySharesNumber,trailingOrdinarySharesNumber,annualShareIssued,trailingShareIssued,annualNetDebt,trailingNetDebt,annualTotalDebt,trailingTotalDebt,annualTangibleBookValue,trailingTangibleBookValue,annualInvestedCapital,trailingInvestedCapital,annualWorkingCapital,trailingWorkingCapital,annualNetTangibleAssets,trailingNetTangibleAssets,annualCapitalLeaseObligations,trailingCapitalLeaseObligations,annualCommonStockEquity,trailingCommonStockEquity,annualPreferredStockEquity,trailingPreferredStockEquity,annualTotalCapitalization,trailingTotalCapitalization,annualTotalEquityGrossMinorityInterest,trailingTotalEquityGrossMinorityInterest,annualMinorityInterest,trailingMinorityInterest,annualStockholdersEquity,trailingStockholdersEquity,annualOtherEquityInterest,trailingOtherEquityInterest,annualGainsLossesNotAffectingRetainedEarnings,trailingGainsLossesNotAffectingRetainedEarnings,annualOtherEquityAdjustments,trailingOtherEquityAdjustments,annualFixedAssetsRevaluationReserve,trailingFixedAssetsRevaluationReserve,annualForeignCurrencyTranslationAdjustments,trailingForeignCurrencyTranslationAdjustments,annualMinimumPensionLiabilities,trailingMinimumPensionLiabilities,annualUnrealizedGainLoss,trailingUnrealizedGainLoss,annualTreasuryStock,trailingTreasuryStock,annualRetainedEarnings,trailingRetainedEarnings,annualAdditionalPaidInCapital,trailingAdditionalPaidInCapital,annualCapitalStock,trailingCapitalStock,annualOtherCapitalStock,trailingOtherCapitalStock,annualCommonStock,trailingCommonStock,annualPreferredStock,trailingPreferredStock,annualTotalPartnershipCapital,trailingTotalPartnershipCapital,annualGeneralPartnershipCapital,trailingGeneralPartnershipCapital,annualLimitedPartnershipCapital,trailingLimitedPartnershipCapital,annua
lTotalLiabilitiesNetMinorityInterest,trailingTotalLiabilitiesNetMinorityInterest,annualTotalNonCurrentLiabilitiesNetMinorityInterest,trailingTotalNonCurrentLiabilitiesNetMinorityInterest,annualOtherNonCurrentLiabilities,trailingOtherNonCurrentLiabilities,annualLiabilitiesHeldforSaleNonCurrent,trailingLiabilitiesHeldforSaleNonCurrent,annualRestrictedCommonStock,trailingRestrictedCommonStock,annualPreferredSecuritiesOutsideStockEquity,trailingPreferredSecuritiesOutsideStockEquity,annualDerivativeProductLiabilities,trailingDerivativeProductLiabilities,annualEmployeeBenefits,trailingEmployeeBenefits,annualNonCurrentPensionAndOtherPostretirementBenefitPlans,trailingNonCurrentPensionAndOtherPostretirementBenefitPlans,annualNonCurrentAccruedExpenses,trailingNonCurrentAccruedExpenses,annualDuetoRelatedPartiesNonCurrent,trailingDuetoRelatedPartiesNonCurrent,annualTradeandOtherPayablesNonCurrent,trailingTradeandOtherPayablesNonCurrent,annualNonCurrentDeferredLiabilities,trailingNonCurrentDeferredLiabilities,annualNonCurrentDeferredRevenue,trailingNonCurrentDeferredRevenue,annualNonCurrentDeferredTaxesLiabilities,trailingNonCurrentDeferredTaxesLiabilities,annualLongTermDebtAndCapitalLeaseObligation,trailingLongTermDebtAndCapitalLeaseObligation,annualLongTermCapitalLeaseObligation,trailingLongTermCapitalLeaseObligation,annualLongTermDebt,trailingLongTermDebt,annualLongTermProvisions,trailingLongTermProvisions,annualCurrentLiabilities,trailingCurrentLiabilities,annualOtherCurrentLiabilities,trailingOtherCurrentLiabilities,annualCurrentDeferredLiabilities,trailingCurrentDeferredLiabilities,annualCurrentDeferredRevenue,trailingCurrentDeferredRevenue,annualCurrentDeferredTaxesLiabilities,trailingCurrentDeferredTaxesLiabilities,annualCurrentDebtAndCapitalLeaseObligation,trailingCurrentDebtAndCapitalLeaseObligation,annualCurrentCapitalLeaseObligation,trailingCurrentCapitalLeaseObligation,annualCurrentDebt,trailingCurrentDebt,annualOtherCurrentBorrowings,trailingOtherCurrentBorrowings
,annualLineOfCredit,trailingLineOfCredit,annualCommercialPaper,trailingCommercialPaper,annualCurrentNotesPayable,trailingCurrentNotesPayable,annualPensionandOtherPostRetirementBenefitPlansCurrent,trailingPensionandOtherPostRetirementBenefitPlansCurrent,annualCurrentProvisions,trailingCurrentProvisions,annualPayablesAndAccruedExpenses,trailingPayablesAndAccruedExpenses,annualCurrentAccruedExpenses,trailingCurrentAccruedExpenses,annualInterestPayable,trailingInterestPayable,annualPayables,trailingPayables,annualOtherPayable,trailingOtherPayable,annualDuetoRelatedPartiesCurrent,trailingDuetoRelatedPartiesCurrent,annualDividendsPayable,trailingDividendsPayable,annualTotalTaxPayable,trailingTotalTaxPayable,annualIncomeTaxPayable,trailingIncomeTaxPayable,annualAccountsPayable,trailingAccountsPayable,annualTotalAssets,trailingTotalAssets,annualTotalNonCurrentAssets,trailingTotalNonCurrentAssets,annualOtherNonCurrentAssets,trailingOtherNonCurrentAssets,annualDefinedPensionBenefit,trailingDefinedPensionBenefit,annualNonCurrentPrepaidAssets,trailingNonCurrentPrepaidAssets,annualNonCurrentDeferredAssets,trailingNonCurrentDeferredAssets,annualNonCurrentDeferredTaxesAssets,trailingNonCurrentDeferredTaxesAssets,annualDuefromRelatedPartiesNonCurrent,trailingDuefromRelatedPartiesNonCurrent,annualNonCurrentNoteReceivables,trailingNonCurrentNoteReceivables,annualNonCurrentAccountsReceivable,trailingNonCurrentAccountsReceivable,annualFinancialAssets,trailingFinancialAssets,annualInvestmentsAndAdvances,trailingInvestmentsAndAdvances,annualOtherInvestments,trailingOtherInvestments,annualInvestmentinFinancialAssets,trailingInvestmentinFinancialAssets,annualHeldToMaturitySecurities,trailingHeldToMaturitySecurities,annualAvailableForSaleSecurities,trailingAvailableForSaleSecurities,annualFinancialAssetsDesignatedasFairValueThroughProfitorLossTotal,trailingFinancialAssetsDesignatedasFairValueThroughProfitorLossTotal,annualTradingSecurities,trailingTradingSecurities,annualLongTermEquityInves
tment,trailingLongTermEquityInvestment,annualInvestmentsinJointVenturesatCost,trailingInvestmentsinJointVenturesatCost,annualInvestmentsInOtherVenturesUnderEquityMethod,trailingInvestmentsInOtherVenturesUnderEquityMethod,annualInvestmentsinAssociatesatCost,trailingInvestmentsinAssociatesatCost,annualInvestmentsinSubsidiariesatCost,trailingInvestmentsinSubsidiariesatCost,annualInvestmentProperties,trailingInvestmentProperties,annualGoodwillAndOtherIntangibleAssets,trailingGoodwillAndOtherIntangibleAssets,annualOtherIntangibleAssets,trailingOtherIntangibleAssets,annualGoodwill,trailingGoodwill,annualNetPPE,trailingNetPPE,annualAccumulatedDepreciation,trailingAccumulatedDepreciation,annualGrossPPE,trailingGrossPPE,annualLeases,trailingLeases,annualConstructionInProgress,trailingConstructionInProgress,annualOtherProperties,trailingOtherProperties,annualMachineryFurnitureEquipment,trailingMachineryFurnitureEquipment,annualBuildingsAndImprovements,trailingBuildingsAndImprovements,annualLandAndImprovements,trailingLandAndImprovements,annualProperties,trailingProperties,annualCurrentAssets,trailingCurrentAssets,annualOtherCurrentAssets,trailingOtherCurrentAssets,annualHedgingAssetsCurrent,trailingHedgingAssetsCurrent,annualAssetsHeldForSaleCurrent,trailingAssetsHeldForSaleCurrent,annualCurrentDeferredAssets,trailingCurrentDeferredAssets,annualCurrentDeferredTaxesAssets,trailingCurrentDeferredTaxesAssets,annualRestrictedCash,trailingRestrictedCash,annualPrepaidAssets,trailingPrepaidAssets,annualInventory,trailingInventory,annualInventoriesAdjustmentsAllowances,trailingInventoriesAdjustmentsAllowances,annualOtherInventories,trailingOtherInventories,annualFinishedGoods,trailingFinishedGoods,annualWorkInProcess,trailingWorkInProcess,annualRawMaterials,trailingRawMaterials,annualReceivables,trailingReceivables,annualReceivablesAdjustmentsAllowances,trailingReceivablesAdjustmentsAllowances,annualOtherReceivables,trailingOtherReceivables,annualDuefromRelatedPartiesCurrent,trailing
DuefromRelatedPartiesCurrent,annualTaxesReceivable,trailingTaxesReceivable,annualAccruedInterestReceivable,trailingAccruedInterestReceivable,annualNotesReceivable,trailingNotesReceivable,annualLoansReceivable,trailingLoansReceivable,annualAccountsReceivable,trailingAccountsReceivable,annualAllowanceForDoubtfulAccountsReceivable,trailingAllowanceForDoubtfulAccountsReceivable,annualGrossAccountsReceivable,trailingGrossAccountsReceivable,annualCashCashEquivalentsAndShortTermInvestments,trailingCashCashEquivalentsAndShortTermInvestments,annualOtherShortTermInvestments,trailingOtherShortTermInvestments,annualCashAndCashEquivalents,trailingCashAndCashEquivalents,annualCashEquivalents,trailingCashEquivalents,annualCashFinancial,trailingCashFinancial&merge=false&period1=493590046&period2=1613490868 121 | # https://query1.finance.yahoo.com/v8/finance/chart/MSFT?symbol=MSFT&period1=1550725200&period2=1613491890&useYfid=true&interval=1d&events=div 122 | # https://query1.finance.yahoo.com/v10/finance/quoteSummary/MSFT?formatted=true&crumb=2M1BZy1YB7f&lang=en-US®ion=US&modules=incomeStatementHistory,cashflowStatementHistory,balanceSheetHistory,incomeStatementHistoryQuarterly,cashflowStatementHistoryQuarterly,balanceSheetHistoryQuarterly&corsDomain=finance.yahoo.com 123 | -------------------------------------------------------------------------------- /2_preprocess_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | # Modify project and reference index according to your needs 8 | for project in ["sp500", "nyse", "nasdaq"]: 9 | print(project) 10 | ref_index = "^IXIC" 11 | 12 | # Load data 13 | prices = pd.read_csv(f"data/{project}/prices_daily.csv") 14 | dividends = pd.read_csv(f"data/{project}/dividends.csv") 15 | income = pd.read_csv(f"data/{project}/incomeStatementHistory.csv") 16 | balance = 
pd.read_csv(f"data/{project}/balanceSheetHistory.csv") 17 | cashflow = pd.read_csv(f"data/{project}/cashflowStatementHistory.csv") 18 | companies = pd.read_csv(f"data/{project}/{project}.csv", sep=",") 19 | shares = pd.read_csv(f"data/{project}/shares.csv") 20 | print("Data loaded") 21 | 22 | # Format date 23 | prices['Date'] = pd.to_datetime(prices['Date']) 24 | dividends['Date'] = pd.to_datetime(dividends['Date']) 25 | income['date'] = pd.to_datetime(income['date']) 26 | balance['date'] = pd.to_datetime(balance['date']) 27 | cashflow['date'] = pd.to_datetime(cashflow['date']) 28 | shares['date'] = pd.to_datetime(shares['date']) 29 | print("Date formatted") 30 | 31 | # Merge financial statements 32 | fin_stats = income.merge(balance, 33 | on=['date', 'symbol'], 34 | how="inner", 35 | suffixes=("", "_y")) 36 | fin_stats = fin_stats.merge(cashflow, 37 | on=['date', 'symbol'], 38 | how="inner", 39 | suffixes=("", "_y")) 40 | print("Statetements merged") 41 | 42 | # Merge with price current year 43 | fin_stats = fin_stats.sort_values("date") 44 | prices_long = pd.melt(prices, "Date").sort_values("Date") 45 | fin = list() 46 | for sbl in tqdm(fin_stats.symbol.unique().tolist()): 47 | df1 = fin_stats[fin_stats.symbol == sbl] 48 | df2 = prices_long[prices_long.variable == sbl] 49 | fin.append(pd.merge_asof(df1, 50 | df2, 51 | left_on="date", 52 | right_on="Date", 53 | direction="backward")) 54 | fin = pd.concat(fin).reset_index(drop=True) 55 | fin = fin.rename(columns={"value": "price", "Date": "date_price"}) 56 | fin = fin.drop(columns=["variable"]) 57 | print("Current prices merged") 58 | 59 | # Merge with price previous year 60 | prices_long_previous = prices_long.copy() 61 | prices_long_previous['Date'] = prices_long_previous['Date'] + pd.DateOffset(years=1) 62 | fin2 = list() 63 | for sbl in tqdm(fin.symbol.unique().tolist()): 64 | df1 = fin[fin.symbol == sbl] 65 | df2 = prices_long_previous[prices_long_previous.variable == sbl] 66 | fin2.append(pd.merge_asof(df1, 
67 | df2, 68 | left_on="date", 69 | right_on="Date", 70 | direction="backward")) 71 | fin = pd.concat(fin2).reset_index(drop=True) 72 | fin['Date'] = fin['Date'] - pd.DateOffset(years=1) 73 | fin = fin.rename(columns={"value": "price_previous", 74 | "Date": "date_price_previous"}) 75 | fin = fin.drop(columns=["variable"]) 76 | print("Previous prices merged") 77 | 78 | # Merge with price next year 79 | prices_long_next = prices_long.copy() 80 | prices_long_next['Date'] = prices_long_next['Date'] - pd.DateOffset(years=1) 81 | fin2 = list() 82 | for sbl in tqdm(fin.symbol.unique().tolist()): 83 | df1 = fin[fin.symbol == sbl] 84 | df2 = prices_long_next[prices_long_next.variable == sbl] 85 | fin2.append(pd.merge_asof(df1, 86 | df2, 87 | left_on="date", 88 | right_on="Date", 89 | direction="backward")) 90 | fin = pd.concat(fin2).reset_index(drop=True) 91 | fin['Date'] = fin['Date'] + pd.DateOffset(years=1) 92 | fin = fin.rename(columns={"value": "price_next", "Date": "date_price_next"}) 93 | fin = fin.drop(columns=["variable"]) 94 | print("Next prices merged") 95 | 96 | # Merge with dividends 97 | for index, row in tqdm(fin.iterrows()): 98 | datemax = pd.to_datetime(row['date']) 99 | datemin = datemax - pd.DateOffset(years=1) 100 | eps = dividends[row['symbol']][(dividends['Date'] <= datemax) & (dividends['Date'] > datemin)].sum() 101 | fin.at[index, 'eps'] = eps 102 | print("Dividends merged") 103 | 104 | # Merge with sector 105 | cpn = companies[['Symbol', 'Sector']] 106 | fin = fin.merge(cpn, left_on="symbol", right_on="Symbol") 107 | fin = fin.rename(columns={"Sector": "sector"}) 108 | fin = fin.drop(columns=["Symbol"]) 109 | print("Sector merged") 110 | 111 | # Add reference index 112 | df1 = prices_long[prices_long.variable == ref_index] 113 | fin = fin.sort_values("date") 114 | fin = fin[fin['date_price_previous'].notnull()] 115 | fin = pd.merge_asof(fin, 116 | df1, 117 | left_on="date_price", 118 | right_on="Date", 119 | direction="nearest", 120 | suffixes=("", 
"_ref")) 121 | fin = pd.merge_asof(fin, 122 | df1, 123 | left_on="date_price_next", 124 | right_on="Date", 125 | direction="nearest", 126 | suffixes=("", "_ref_next")) 127 | fin = pd.merge_asof(fin, 128 | df1, 129 | left_on="date_price_previous", 130 | right_on="Date", 131 | direction="nearest", 132 | suffixes=("", "_ref_previous")) 133 | fin = fin.rename(columns={"value": "ref", 134 | "Date": "date_ref", 135 | "value_ref_next": "ref_next", 136 | "value_ref_previous":"ref_previous", 137 | "Date_ref_next": "date_ref_next", 138 | "Date_ref_previous": "date_ref_previous"}) 139 | fin = fin.drop(columns=["variable", "variable_ref_next", "variable_ref_previous"]) 140 | fin = fin.sort_values(["symbol", "date"]) 141 | print("Reference index merged") 142 | 143 | # Merge with shares 144 | shares = shares.fillna(0) 145 | shares['sharesNumber'] = shares['annualOrdinarySharesNumber'] + shares['annualPreferredSharesNumber'] 146 | shares = shares.drop(columns=["annualOrdinarySharesNumber", "annualPreferredSharesNumber"]) 147 | shares = shares.sort_values("date") 148 | fin2 = list() 149 | for sbl in tqdm(fin.symbol.unique().tolist()): 150 | df1 = fin[fin.symbol == sbl] 151 | df2 = shares[shares.symbol == sbl] 152 | fin2.append(pd.merge_asof(df1, 153 | df2, 154 | left_on="date", 155 | right_on="date", 156 | direction="nearest", 157 | suffixes=("", "_ref"))) 158 | fin = pd.concat(fin2).reset_index(drop=True) 159 | fin = fin.drop(columns=['symbol_ref']) 160 | 161 | # Assess missing values 162 | percent_missing = fin.isnull().sum() * 100 / len(fin) 163 | print(percent_missing.sort_values()) 164 | 165 | # save 166 | fin.to_csv(f"data/{project}/data.csv", index=False) 167 | -------------------------------------------------------------------------------- /3_feature_eng.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pandas as pd 5 | import datetime 6 | import numpy as np 7 | import 
dtale 8 | from tqdm import tqdm 9 | 10 | # Load data 11 | data = pd.concat([pd.read_csv(f"data/sp500/data.csv"), 12 | pd.read_csv(f"data/nyse/data.csv"), 13 | pd.read_csv(f"data/nasdaq/data.csv") 14 | ] 15 | ).reset_index(drop=True) 16 | 17 | # Remove duplicate 18 | data = data.drop_duplicates() 19 | 20 | # Format date 21 | data['date'] = pd.to_datetime(data['date']) 22 | data['date_price'] = pd.to_datetime(data['date_price']) 23 | data['date_price_previous'] = pd.to_datetime(data['date_price_previous']) 24 | data['date_price_next'] = pd.to_datetime(data['date_price_next']) 25 | data['date_ref'] = pd.to_datetime(data['date_ref']) 26 | data['date_ref_previous'] = pd.to_datetime(data['date_ref_previous']) 27 | data['date_ref_next'] = pd.to_datetime(data['date_ref_next']) 28 | 29 | # Assess missing values 30 | percent_missing = data.isnull().sum() * 100 / len(data) 31 | 32 | # Remove data whose price date is too different from financial statements date 33 | data = data[abs(data['date'] - data['date_price']) < datetime.timedelta(weeks=2)] 34 | 35 | # Remove features that have more than 20% of missing values 36 | col2rm = percent_missing[percent_missing > 20].index.tolist() 37 | data = data.drop(columns=col2rm) 38 | 39 | # Creation of new variables 40 | 41 | # Yield for previous year 42 | data['yield'] = (np.log(data['price']/data['price_previous']))/((data['date_price'] - data['date_price_previous']) / datetime.timedelta(weeks=52)) 43 | # Yield for next year 44 | data['yield_next'] = (np.log(data['price_next']/data['price']))/((data['date_price_next'] - data['date_price']) / datetime.timedelta(weeks=52)) 45 | # Reference yield for previous year 46 | data['yield_ref'] = (np.log(data['ref']/data['ref_previous']))/((data['date_ref'] - data['date_ref_previous']) / datetime.timedelta(weeks=52)) 47 | # Reference yield for next year 48 | data['yield_ref_next'] = (np.log(data['ref_next']/data['ref']))/((data['date_ref_next'] - data['date_ref']) / datetime.timedelta(weeks=52)) 
49 | # Best performance than reference for previous year 50 | data['outperform'] = data['yield'] > data['yield_ref'] 51 | # Best performance than reference for next year 52 | data['outperform_next'] = data['yield_next'] > data['yield_ref_next'] 53 | # Positivive performance for previous year 54 | data['positive'] = data['yield'] > 0 55 | # Positive performance reference for next year 56 | data['positive_next'] = data['yield_next'] > 0 57 | # Market capitalization 58 | data['market_cap'] = data['price'] * data['sharesNumber'] 59 | # percent of dividends 60 | data['div_percent'] = data['eps'] / data['price'] 61 | # group sectors 62 | di = {'Consumer Discretionary': 'Consumer Services', 63 | 'Consumer Non-Durables': 'Consumer Services', 64 | 'Consumer Durables': 'Consumer Services', 65 | 'Consumer Staples': 'Consumer Services', 66 | 'Utilities': 'Energy', 67 | 'Basic Industries': 'Industrials', 68 | 'Materials': 'Industrials', 69 | 'Information Technology': 'Technology', 70 | 'Financials': 'Finance', 71 | } 72 | data = data.replace({"sector": di}) 73 | # Selection of variables for first analysis 74 | info = ['date', 'symbol', 'sector', ] 75 | variables = ['netIncome', 76 | 'grossProfit', 77 | 'ebit', 78 | 'totalRevenue', 79 | 'costOfRevenue', 80 | 'totalOtherIncomeExpenseNet', 81 | 'otherCurrentLiab', 82 | 'totalAssets', 83 | 'commonStock', 84 | 'otherLiab', 85 | 'otherAssets', 86 | 'cash', 87 | 'propertyPlantEquipment', 88 | 'accountsPayable', 89 | 'capitalSurplus', 90 | 'changeToLiabilities', 91 | 'totalCashflowsFromInvestingActivities', 92 | 'netBorrowings', 93 | 'totalCashFromFinancingActivities', 94 | 'changeInCash', 95 | 'totalCashFromOperatingActivities', 96 | 'depreciation', 97 | 'changeToNetincome', 98 | 'capitalExpenditures', 99 | 'changeToOperatingActivities' 100 | ] 101 | targets = ['market_cap', 102 | 'div_percent', 103 | 'yield', 104 | 'yield_ref', 105 | 'yield_next', 106 | 'yield_ref_next', 107 | 'outperform', 108 | 'outperform_next', 109 | 'positive', 
110 | 'positive_next' 111 | ] 112 | data[['ebitAbs', 'totalRevenueAbs']] = data[['ebit', 'totalRevenue']] 113 | variables_abs = ['ebitAbs', 'totalRevenueAbs'] 114 | # Normalization by market cap 115 | data[variables] = data[variables].div(data.market_cap, axis=0) 116 | # Data selection 117 | data = data[info + variables + variables_abs + targets] 118 | 119 | # Remove when next yield is not available 120 | data = data.dropna(subset=['yield_next']) 121 | 122 | # Includ previous values 123 | data = data.sort_values("date") 124 | prev_features = variables + ['div_percent', 'yield', 'market_cap'] 125 | data_prev = data[["date", "symbol"] + prev_features] 126 | data_merged = list() 127 | for sbl in tqdm(data.symbol.unique().tolist()): 128 | df1 = data[data.symbol == sbl] 129 | df2 = data_prev[data_prev.symbol == sbl] 130 | data_merged.append(pd.merge_asof(df1, df2, left_on="date", right_on="date", direction="backward", suffixes=("", "_evol"), allow_exact_matches=False)) 131 | data_evol = pd.concat(data_merged).reset_index(drop=True) 132 | data_evol = data_evol.dropna(subset=["yield_evol"]) 133 | data_evol[[a + "_evol" for a in prev_features]] = data_evol[[a + "_evol" for a in prev_features]].values - data_evol[prev_features].values 134 | data_evol = data_evol.drop(columns=["symbol_evol"]) 135 | 136 | # Assess missing values 137 | percent_missing = data.isnull().sum() * 100 / len(data) 138 | print(percent_missing.sort_values()) 139 | 140 | # DTale 141 | # d = dtale.show(data) 142 | 143 | # d.open_browser() 144 | 145 | # Save data 146 | data.to_csv("data/data_clean.csv", index=False) 147 | data_evol.to_csv("data/data_evol_clean.csv", index=False) 148 | -------------------------------------------------------------------------------- /4_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pandas as pd 5 | from sklearn.metrics import roc_auc_score, roc_curve 6 | import 
matplotlib.pyplot as plt 7 | from datetime import datetime 8 | import numpy as np 9 | from utils import process_data 10 | 11 | # seed 12 | np.random.seed(0) 13 | 14 | # Load data 15 | data = pd.read_csv(f"data/data_clean.csv") 16 | # Format date 17 | data['date'] = pd.to_datetime(data['date']) 18 | # Define variables 19 | features = ['netIncome', 'grossProfit', 'ebit', 'totalRevenue', 'costOfRevenue', 'totalOtherIncomeExpenseNet', 'otherCurrentLiab', 'totalAssets', 'commonStock', 'otherLiab', 'otherAssets', 'cash', 'propertyPlantEquipment', 'accountsPayable', 'capitalSurplus', 'changeToLiabilities', 'totalCashflowsFromInvestingActivities', 'netBorrowings', 'totalCashFromFinancingActivities', 'changeInCash', 'totalCashFromOperatingActivities', 'depreciation', 'changeToNetincome', 'capitalExpenditures', 'changeToOperatingActivities', 'market_cap', 'div_percent', 'ebitAbs', 'totalRevenueAbs', 'yield', 'sector', 'outperform', 'positive' ] 20 | target = 'outperform_next' 21 | 22 | # Date for validation and test sets 23 | # date_valid = datetime(year=2020, month=1, day=1) 24 | # date_test = datetime(year=2020, month=11, day=1) 25 | date_valid = datetime(year=2019, month=1, day=1) 26 | date_test = datetime(year=2020, month=6, day=1) 27 | 28 | # Process data and delcare model 29 | clf, X_train, y_train, X_valid, y_valid, X_test, y_test, _ = \ 30 | process_data(data, date_valid, date_test, features, target) 31 | 32 | # Fuse train and validation sets and random shuffle 33 | X = np.append(X_train, X_valid, axis=0) 34 | y = np.append(y_train, y_valid, axis=0) 35 | indices = list(range(len(X))) 36 | np.random.shuffle(indices) 37 | n = int(np.floor(0.9*len(indices))) 38 | t, v = indices[:n], indices[n:] 39 | X_train, X_valid, y_train, y_valid = X[t], X[v], y[t], y[v] 40 | 41 | # Training 42 | clf.fit(X_train=X_train, 43 | y_train=y_train, 44 | eval_set=[(X_train, y_train), (X_valid, y_valid)], 45 | eval_name=['train', 'valid'], 46 | eval_metric=['auc'], 47 | max_epochs=80, 48 | 
patience=50, 49 | batch_size=1024, 50 | virtual_batch_size=128, 51 | num_workers=0, 52 | drop_last=False 53 | ) 54 | 55 | # save model 56 | saving_path_name = "models/model" 57 | saved_filepath = clf.save_model(saving_path_name) 58 | # clf = TabNetClassifier() 59 | # clf.load_model("models/model.zip") 60 | 61 | # Test 62 | test_auc = roc_auc_score(y_score=clf.predict_proba(X_test)[:, 1], 63 | y_true=y_test) 64 | valid_auc = roc_auc_score(y_score=clf.predict_proba(X_valid)[:, 1], 65 | y_true=y_valid) 66 | train_auc = roc_auc_score(y_score=clf.predict_proba(X_train)[:, 1], 67 | y_true=y_train) 68 | print("Testing AUC\n") 69 | print(f"BEST TRAIN SCORE: {train_auc}") 70 | print(f"BEST VALID SCORE: {valid_auc}") 71 | print(f"BEST TEST SCORE: {test_auc}") 72 | 73 | # Plot roc curve 74 | fpr = dict() 75 | tpr = dict() 76 | fpr['train'], tpr['train'], _ = roc_curve(y_score=clf.predict_proba(X_train)[:, 1], 77 | y_true=y_train) 78 | fpr['valid'], tpr['valid'], _ = roc_curve(y_score=clf.predict_proba(X_valid)[:, 1], 79 | y_true=y_valid) 80 | fpr['test'], tpr['test'], _ = roc_curve(y_score=clf.predict_proba(X_test)[:, 1], 81 | y_true=y_test) 82 | plt.figure() 83 | colors = ['aqua', 'darkorange', 'cornflowerblue'] 84 | names = ['train', 'valid', 'test'] 85 | legends = [f'train ({round(train_auc, 3)})', 86 | f'validation ({round(valid_auc, 3)})', 87 | f'test ({round(test_auc, 3)})'] 88 | for name, color, legend in zip(names, colors, legends): 89 | plt.plot(fpr[name], tpr[name], color=color, lw=2, 90 | label='{0}'.format(legend)) 91 | plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') 92 | plt.xlim([0.0, 1.0]) 93 | plt.ylim([0.0, 1.05]) 94 | plt.xlabel('False Positive Rate') 95 | plt.ylabel('True Positive Rate') 96 | plt.title('Receiver operating characteristic') 97 | plt.legend(loc="lower right") 98 | plt.show() 99 | 100 | # Feature importance 101 | features_sorted = [] 102 | importance_sorted = [] 103 | print("\nFeature importance AUC\n") 104 | f = 
clf.feature_importances_ 105 | for i in np.argsort(f): 106 | print(f"{features[i]}: {f[i]}") 107 | features_sorted += [features[i]] 108 | importance_sorted += [f[i]] 109 | 110 | # Feature importance plot 111 | fig, ax = plt.subplots() 112 | features_sorted[18:33] = ["Performance against market", 113 | "Net Borrowings", 114 | "Total Assets", 115 | "Cost Of Revenue", 116 | "Capital Surplus", 117 | "Change To Liabilities", 118 | "Capital Expenditures", 119 | "Common Stock", 120 | "Gross Profit", 121 | "Positive performance", 122 | "Net Income", 123 | "Returns level", 124 | "Change To Net Income", 125 | "Dividends level", 126 | "Other Current Liabilities"] 127 | ax.barh(features_sorted[18:33], importance_sorted[18:33], align="center") 128 | ax.set_xlabel('') 129 | ax.set_title('Variable importance for 15 most outstanding variables') 130 | plt.show() 131 | -------------------------------------------------------------------------------- /5_backtest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pandas as pd 5 | from datetime import datetime 6 | from dateutil.relativedelta import relativedelta 7 | 8 | from utils import process_data 9 | 10 | # Import prices 11 | prices_list = [pd.read_csv(f"data/sp500/prices_daily.csv"), 12 | pd.read_csv(f"data/nyse/prices_daily.csv"), 13 | pd.read_csv(f"data/nasdaq/prices_daily.csv") 14 | ] 15 | for i in range(3): 16 | prices_list[i]['Date'] = pd.to_datetime(prices_list[i]['Date']) 17 | prices_list[i].set_index(['Date'], inplace=True) 18 | prices = prices_list[0] 19 | for i in range(1, 3): 20 | cols_to_use = prices_list[i].columns.difference(prices.columns) 21 | prices = pd.merge(prices, 22 | prices_list[i][cols_to_use], 23 | left_index=True, 24 | right_index=True, 25 | how='outer') 26 | 27 | # Load data 28 | data = pd.read_csv(f"data/data_clean.csv") 29 | # Format date 30 | data['date'] = pd.to_datetime(data['date']) 31 | # Define 
# Define variables used as model inputs, and the prediction target.
features = ['netIncome', 'grossProfit', 'ebit', 'totalRevenue',
            'costOfRevenue', 'totalOtherIncomeExpenseNet',
            'otherCurrentLiab', 'totalAssets', 'commonStock', 'otherLiab',
            'otherAssets', 'cash', 'propertyPlantEquipment',
            'accountsPayable', 'capitalSurplus', 'changeToLiabilities',
            'totalCashflowsFromInvestingActivities', 'netBorrowings',
            'totalCashFromFinancingActivities', 'changeInCash',
            'totalCashFromOperatingActivities', 'depreciation',
            'changeToNetincome', 'capitalExpenditures',
            'changeToOperatingActivities', 'market_cap', 'div_percent',
            'ebitAbs', 'totalRevenueAbs', 'yield', 'sector', 'outperform',
            'positive']
target = 'outperform_next'

# Define dates: one backtest window per distinct statement date from 2018
# on, plus a far-future sentinel so the last window has an upper bound.
date_start = datetime(year=2018, month=1, day=1)
dates = sorted(set(data[data.date >= date_start]['date']))
dates.append(max(dates) + relativedelta(years=10))

all_probas = []
for i in range(len(dates) - 1):
    date = dates[i]
    date_next = dates[i + 1]
    print(f"{date} - {date_next}")
    # Prepare model: train on data up to one year before `date`,
    # use the final year as the evaluation ("valid") set.
    date_train = date - relativedelta(years=1)
    date_eval = date
    clf, X_train, y_train, X_eval, y_eval, _, _, indices = \
        process_data(data, date_train, date_eval, features, target)
    sbl = data.symbol[indices == "valid"].tolist()
    # Fit model.  patience=0 disables early stopping: all 50 epochs run.
    clf.fit(X_train=X_train,
            y_train=y_train,
            eval_set=[(X_train, y_train)],
            eval_name=['train'],
            eval_metric=['auc'],
            max_epochs=50,
            patience=0,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            )
    # Get relevant symbols, keeping the best probability seen per symbol.
    # Fix: the original reused `i` here, shadowing the outer window index.
    prb = clf.predict_proba(X_eval)[:, 1]
    order = (-prb).argsort()
    symbols = {}
    for j in order:
        if sbl[j] not in symbols:
            symbols[sbl[j]] = prb[j]
    # Prices data for model testing: zero every quoted price in the
    # window, then stamp each selected symbol's probability across it.
    prices_period = prices[prices.index < date_next]
    prices_period = prices_period[prices_period.index >= date]
    prices_period[~prices_period.isnull()] = 0
    # Materialise the dict views: .loc expects list-likes, and passing
    # live views is fragile on recent pandas versions.
    prices_period.loc[:, list(symbols.keys())] = list(symbols.values())
    all_probas.append(prices_period)

probas = pd.concat(all_probas)
probas.to_csv(f"backtest/probas.csv")
0 76 | prices_period.loc[:, symbols.keys()] = symbols.values() 77 | all_probas.append(prices_period) 78 | 79 | probas = pd.concat(all_probas) 80 | probas.to_csv(f"backtest/probas.csv") 81 | -------------------------------------------------------------------------------- /6_strategies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import pyfolio as pf 8 | from pandas.plotting import table 9 | from datetime import datetime 10 | 11 | # Import prices 12 | prices_list = [pd.read_csv(f"data/sp500/prices_daily.csv"), 13 | pd.read_csv(f"data/nyse/prices_daily.csv"), 14 | pd.read_csv(f"data/nasdaq/prices_daily.csv") 15 | ] 16 | for i in range(3): 17 | prices_list[i]['Date'] = pd.to_datetime(prices_list[i]['Date']) 18 | prices_list[i].set_index(['Date'], inplace=True) 19 | prices = prices_list[0] 20 | for i in range(1, 3): 21 | cols_to_use = prices_list[i].columns.difference(prices.columns) 22 | prices = pd.merge(prices, 23 | prices_list[i][cols_to_use], 24 | left_index=True, 25 | right_index=True, 26 | how='outer') 27 | 28 | # Load data 29 | probas = pd.read_csv(f"backtest/probas.csv") 30 | # Format date 31 | probas['Date'] = pd.to_datetime(probas['Date']) 32 | probas.set_index(['Date'], inplace=True) 33 | 34 | # Filter prices 35 | prices = prices[prices.index >= min(probas.index)] 36 | probas[prices.isnull()] = np.nan 37 | probas[probas == 0] = np.nan 38 | 39 | # Returns on prices 40 | returns = prices.pct_change() 41 | 42 | 43 | # Softmax function 44 | def softmax(x): 45 | e_x = np.exp(x - np.nanmax(x)) 46 | return e_x / np.nansum(e_x) 47 | 48 | 49 | # Order and select function 50 | def orderselect(x, n=100): 51 | order = (-np.array(x)).argsort() 52 | return [1/min(n, len(x)) if i in order[:n] else np.nan for i in range(len(x))] 53 | 54 | 55 | # nasdaq 56 | nasdaq_returns = returns['^IXIC'] 
# Benchmark: Nasdaq cumulative performance.
nasdaq_returns = (1 + returns['^IXIC']).cumprod()
nasdaq_returns.name = "Nasdaq"

# Benchmark: S&P 500 cumulative performance.
sp_returns = (1 + returns['^GSPC']).cumprod()
sp_returns.name = "S&P 500"

# Softmax strategy: weight every stock by the softmax of its probability.
weights = probas.apply(softmax, axis=1)
softmaxstrat = (1 + returns.multiply(weights).apply(np.nansum, axis=1)).cumprod()
softmaxstrat.name = "Softmax"

# Top-100 strategy: equal weight on the 100 highest-probability stocks.
weights = probas.apply(lambda x: orderselect(x, 100),
                       axis=1,
                       result_type="expand")
weights.columns = probas.columns
strat100 = (1 + returns.multiply(weights).apply(np.nansum, axis=1)).cumprod()
strat100.name = "Best 100"

# Analysis on best performing stocks since 2020, with and without the
# model filter (stocks outside the top-100 selection are zeroed).
stocks = returns.copy()
stocks = stocks[stocks.index > datetime(year=2020, month=1, day=1)]
stocks_full = stocks.copy()
stocks[weights.isna()] = 0
cols = (1 + stocks).cumprod().iloc[-1].nlargest(10).index.tolist()
stocks = (1 + stocks[cols]).cumprod()
stocks_full = (1 + stocks_full[cols]).cumprod()

# Top-1000 strategy: equal weight on the 1000 highest-probability stocks.
weights = probas.apply(lambda x: orderselect(x, 1000),
                       axis=1,
                       result_type="expand")
weights.columns = probas.columns
strat1000 = (1 + returns.multiply(weights).apply(np.nansum, axis=1)).cumprod()
strat1000.name = "Best 1000"

# Plot every strategy against the two benchmarks.
fig, ax = plt.subplots(figsize=(16, 8))
for series, color in [(softmaxstrat, "darkorange"),
                      (strat100, "dodgerblue"),
                      (strat1000, "seagreen"),
                      (nasdaq_returns, "red"),
                      (sp_returns, "purple")]:
    series.plot(ax=ax, color=color)
plt.legend(loc="best")
ax.set_ylabel("Cummulative return")
ax.set_title("Backtest based on the data from 2018 to 2021", fontsize=20)
# Test on the numbers: sweep the portfolio size and record each final
# yield.  A previous run is resumed from backtest/yields.csv if present.
try:
    d = pd.read_csv(f"backtest/yields.csv")
    a = max(d.nb) + 1
except Exception as e:
    d = pd.DataFrame()
    a = 1

for i in range(a, 2001):
    weights = probas.apply(lambda x: orderselect(x, i),
                           axis=1,
                           result_type="expand")
    weights.columns = probas.columns
    strat = returns.multiply(weights).apply(np.nansum, axis=1)
    strat = (1 + strat).cumprod()
    data = pd.DataFrame({"nb": [i], "yield": [strat.iloc[-1]]})
    # DataFrame.append was removed in pandas 2.0; concat is the
    # supported replacement.
    d = pd.concat([d, data], ignore_index=True)
    print(f"{i}: {strat.iloc[-1]}")
    # Checkpoint after every size so an interrupted sweep can resume.
    d.to_csv(f"backtest/yields.csv", index=False)

# Plot final yield as a function of portfolio size.
fig, ax = plt.subplots(figsize=(16, 8))
d['yield'][d['nb'] < 10000].plot(ax=ax, color="dodgerblue")
ax.set_ylabel("cummulative return")
ax.set_title("Returns for a strategy consisting of the x most promising stocks",
             fontsize=20)

# Tear sheet: pyfolio performance stats per strategy and benchmark.
weights = probas.apply(softmax, axis=1)
softmaxstrat = returns.multiply(weights).apply(np.nansum, axis=1)
# Renamed from `softmax` so the softmax() helper is not shadowed.
softmax_stats = pf.timeseries.perf_stats(softmaxstrat)
weights = probas.apply(lambda x: orderselect(x, 100),
                       axis=1,
                       result_type="expand")
weights.columns = probas.columns
top100strat = returns.multiply(weights).apply(np.nansum, axis=1)
top100 = pf.timeseries.perf_stats(top100strat)
weights = probas.apply(lambda x: orderselect(x, 1000),
                       axis=1,
                       result_type="expand")
weights.columns = probas.columns
top1000strat = returns.multiply(weights).apply(np.nansum, axis=1)
top1000 = pf.timeseries.perf_stats(top1000strat)
nasdaq = pf.timeseries.perf_stats(returns['^IXIC'])
sp500 = pf.timeseries.perf_stats(returns['^GSPC'])

tearsheet = pd.concat({'Softmax': softmax_stats,
                       'Top 100': top100,
                       'Top 1000': top1000,
                       'Nasdaq': nasdaq,
                       'S&P 500': sp500
                       }, axis=1)
tearsheet = tearsheet.round(2)
tearsheet.fillna("", inplace=True)
ax = plt.subplot(111, frame_on=False)
# Render the tear-sheet DataFrame as a table on an empty axes.
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
table(ax, tearsheet)

# Plot comparison: top stocks with and without the model filter.
stocks.plot(ylabel="Cumulative returns", title="Cumulative returns with model")

stocks_full.plot(ylabel="Cumulative returns", title="Cumulative returns without model")

# Per-ticker with/without-model comparison plots.
for ticker in ("OESX", "SAVA"):
    fig, ax = plt.subplots()
    stocks[ticker].plot(ax=ax, color="darkorange", label="With model")
    stocks_full[ticker].plot(ax=ax, color="dodgerblue", label="Without model")
    plt.legend(loc="best")
    ax.set_ylabel("Cummulative return")
    ax.set_title(ticker, fontsize=20)
Build and analyze different strategies based on the backtested scenarios ([6_strategies.py](6_strategies.py)) -------------------------------------------------------------------------------- /data/sp500/sp500.csv: -------------------------------------------------------------------------------- 1 | Symbol,Sector 2 | A,Health Care 3 | AAL,Industrials 4 | AAP,Consumer Discretionary 5 | AAPL,Information Technology 6 | ABBV,Health Care 7 | ABC,Health Care 8 | ABMD,Health Care 9 | ABT,Health Care 10 | ACN,Information Technology 11 | ADBE,Information Technology 12 | ADI,Information Technology 13 | ADM,Consumer Staples 14 | ADP,Information Technology 15 | ADSK,Information Technology 16 | AEE,Utilities 17 | AEP,Utilities 18 | AES,Utilities 19 | AFL,Financials 20 | AIG,Financials 21 | AIZ,Financials 22 | AJG,Financials 23 | AKAM,Information Technology 24 | ALB,Materials 25 | ALGN,Health Care 26 | ALK,Industrials 27 | ALL,Financials 28 | ALLE,Industrials 29 | ALXN,Health Care 30 | AMAT,Information Technology 31 | AMCR,Materials 32 | AMD,Information Technology 33 | AME,Industrials 34 | AMGN,Health Care 35 | AMP,Financials 36 | AMT,Real Estate 37 | AMZN,Consumer Discretionary 38 | ANET,Information Technology 39 | ANSS,Information Technology 40 | ANTM,Health Care 41 | AON,Financials 42 | AOS,Industrials 43 | APA,Energy 44 | APD,Materials 45 | APH,Information Technology 46 | APTV,Consumer Discretionary 47 | ARE,Real Estate 48 | ATO,Utilities 49 | ATVI,Communication Services 50 | AVB,Real Estate 51 | AVGO,Information Technology 52 | AVY,Materials 53 | AWK,Utilities 54 | AXP,Financials 55 | AZO,Consumer Discretionary 56 | BA,Industrials 57 | BAC,Financials 58 | BAX,Health Care 59 | BBY,Consumer Discretionary 60 | BDX,Health Care 61 | BEN,Financials 62 | BF-B,Consumer Staples 63 | BIIB,Health Care 64 | BIO,Health Care 65 | BK,Financials 66 | BKNG,Consumer Discretionary 67 | BKR,Energy 68 | BLK,Financials 69 | BLL,Materials 70 | BMY,Health Care 71 | BR,Information Technology 72 | 
BRK-B,Financials 73 | BSX,Health Care 74 | BWA,Consumer Discretionary 75 | BXP,Real Estate 76 | C,Financials 77 | CAG,Consumer Staples 78 | CAH,Health Care 79 | CARR,Industrials 80 | CAT,Industrials 81 | CB,Financials 82 | CBOE,Financials 83 | CBRE,Real Estate 84 | CCI,Real Estate 85 | CCL,Consumer Discretionary 86 | CDNS,Information Technology 87 | CDW,Information Technology 88 | CE,Materials 89 | CERN,Health Care 90 | CF,Materials 91 | CFG,Financials 92 | CHD,Consumer Staples 93 | CHRW,Industrials 94 | CHTR,Communication Services 95 | CI,Health Care 96 | CINF,Financials 97 | CL,Consumer Staples 98 | CLX,Consumer Staples 99 | CMA,Financials 100 | CMCSA,Communication Services 101 | CME,Financials 102 | CMG,Consumer Discretionary 103 | CMI,Industrials 104 | CMS,Utilities 105 | CNC,Health Care 106 | CNP,Utilities 107 | COF,Financials 108 | COG,Energy 109 | COO,Health Care 110 | COP,Energy 111 | COST,Consumer Staples 112 | CPB,Consumer Staples 113 | CPRT,Industrials 114 | CRM,Information Technology 115 | CSCO,Information Technology 116 | CSX,Industrials 117 | CTAS,Industrials 118 | CTLT,Health Care 119 | CTSH,Information Technology 120 | CTVA,Materials 121 | CTXS,Information Technology 122 | CVS,Health Care 123 | CVX,Energy 124 | D,Utilities 125 | DAL,Industrials 126 | DD,Materials 127 | DE,Industrials 128 | DFS,Financials 129 | DG,Consumer Discretionary 130 | DGX,Health Care 131 | DHI,Consumer Discretionary 132 | DHR,Health Care 133 | DIS,Communication Services 134 | DISCA,Communication Services 135 | DISCK,Communication Services 136 | DISH,Communication Services 137 | DLR,Real Estate 138 | DLTR,Consumer Discretionary 139 | DOV,Industrials 140 | DOW,Materials 141 | DPZ,Consumer Discretionary 142 | DRE,Real Estate 143 | DRI,Consumer Discretionary 144 | DTE,Utilities 145 | DUK,Utilities 146 | DVA,Health Care 147 | DVN,Energy 148 | DXC,Information Technology 149 | DXCM,Health Care 150 | EA,Communication Services 151 | EBAY,Consumer Discretionary 152 | ECL,Materials 153 
| ED,Utilities 154 | EFX,Industrials 155 | EIX,Utilities 156 | EL,Consumer Staples 157 | EMN,Materials 158 | EMR,Industrials 159 | ENPH,Information Technology 160 | EOG,Energy 161 | EQIX,Real Estate 162 | EQR,Real Estate 163 | ES,Utilities 164 | ESS,Real Estate 165 | ETN,Industrials 166 | ETR,Utilities 167 | ETSY,Consumer Discretionary 168 | EVRG,Utilities 169 | EW,Health Care 170 | EXC,Utilities 171 | EXPD,Industrials 172 | EXPE,Consumer Discretionary 173 | EXR,Real Estate 174 | F,Consumer Discretionary 175 | FANG,Energy 176 | FAST,Industrials 177 | FB,Communication Services 178 | FBHS,Industrials 179 | FCX,Materials 180 | FDX,Industrials 181 | FE,Utilities 182 | FFIV,Information Technology 183 | FIS,Information Technology 184 | FISV,Information Technology 185 | FITB,Financials 186 | FLIR,Information Technology 187 | FLS,Industrials 188 | FLT,Information Technology 189 | FMC,Materials 190 | FOX,Communication Services 191 | FOXA,Communication Services 192 | FRC,Financials 193 | FRT,Real Estate 194 | FTI,Energy 195 | FTNT,Information Technology 196 | FTV,Industrials 197 | GD,Industrials 198 | GE,Industrials 199 | GILD,Health Care 200 | GIS,Consumer Staples 201 | GL,Financials 202 | GLW,Information Technology 203 | GM,Consumer Discretionary 204 | GOOG,Communication Services 205 | GOOGL,Communication Services 206 | GPC,Consumer Discretionary 207 | GPN,Information Technology 208 | GPS,Consumer Discretionary 209 | GRMN,Consumer Discretionary 210 | GS,Financials 211 | GWW,Industrials 212 | HAL,Energy 213 | HAS,Consumer Discretionary 214 | HBAN,Financials 215 | HBI,Consumer Discretionary 216 | HCA,Health Care 217 | HD,Consumer Discretionary 218 | HES,Energy 219 | HFC,Energy 220 | HIG,Financials 221 | HII,Industrials 222 | HLT,Consumer Discretionary 223 | HOLX,Health Care 224 | HON,Industrials 225 | HPE,Information Technology 226 | HPQ,Information Technology 227 | HRL,Consumer Staples 228 | HSIC,Health Care 229 | HST,Real Estate 230 | HSY,Consumer Staples 231 | HUM,Health 
Care 232 | HWM,Industrials 233 | IBM,Information Technology 234 | ICE,Financials 235 | IDXX,Health Care 236 | IEX,Industrials 237 | IFF,Materials 238 | ILMN,Health Care 239 | INCY,Health Care 240 | INFO,Industrials 241 | INTC,Information Technology 242 | INTU,Information Technology 243 | IP,Materials 244 | IPG,Communication Services 245 | IPGP,Information Technology 246 | IQV,Health Care 247 | IR,Industrials 248 | IRM,Real Estate 249 | ISRG,Health Care 250 | IT,Information Technology 251 | ITW,Industrials 252 | IVZ,Financials 253 | J,Industrials 254 | JBHT,Industrials 255 | JCI,Industrials 256 | JKHY,Information Technology 257 | JNJ,Health Care 258 | JNPR,Information Technology 259 | JPM,Financials 260 | K,Consumer Staples 261 | KEY,Financials 262 | KEYS,Information Technology 263 | KHC,Consumer Staples 264 | KIM,Real Estate 265 | KLAC,Information Technology 266 | KMB,Consumer Staples 267 | KMI,Energy 268 | KMX,Consumer Discretionary 269 | KO,Consumer Staples 270 | KR,Consumer Staples 271 | KSU,Industrials 272 | L,Financials 273 | LB,Consumer Discretionary 274 | LDOS,Industrials 275 | LEG,Consumer Discretionary 276 | LEN,Consumer Discretionary 277 | LH,Health Care 278 | LHX,Industrials 279 | LIN,Materials 280 | LKQ,Consumer Discretionary 281 | LLY,Health Care 282 | LMT,Industrials 283 | LNC,Financials 284 | LNT,Utilities 285 | LOW,Consumer Discretionary 286 | LRCX,Information Technology 287 | LUMN,Communication Services 288 | LUV,Industrials 289 | LVS,Consumer Discretionary 290 | LW,Consumer Staples 291 | LYB,Materials 292 | LYV,Communication Services 293 | MA,Information Technology 294 | MAA,Real Estate 295 | MAR,Consumer Discretionary 296 | MAS,Industrials 297 | MCD,Consumer Discretionary 298 | MCHP,Information Technology 299 | MCK,Health Care 300 | MCO,Financials 301 | MDLZ,Consumer Staples 302 | MDT,Health Care 303 | MET,Financials 304 | MGM,Consumer Discretionary 305 | MHK,Consumer Discretionary 306 | MKC,Consumer Staples 307 | MKTX,Financials 308 | 
MLM,Materials 309 | MMC,Financials 310 | MMM,Industrials 311 | MNST,Consumer Staples 312 | MO,Consumer Staples 313 | MOS,Materials 314 | MPC,Energy 315 | MRK,Health Care 316 | MRO,Energy 317 | MS,Financials 318 | MSCI,Financials 319 | MSFT,Information Technology 320 | MSI,Information Technology 321 | MTB,Financials 322 | MTD,Health Care 323 | MU,Information Technology 324 | MXIM,Information Technology 325 | NCLH,Consumer Discretionary 326 | NDAQ,Financials 327 | NEE,Utilities 328 | NEM,Materials 329 | NFLX,Communication Services 330 | NI,Utilities 331 | NKE,Consumer Discretionary 332 | NLOK,Information Technology 333 | NLSN,Industrials 334 | NOC,Industrials 335 | NOV,Energy 336 | NOW,Information Technology 337 | NRG,Utilities 338 | NSC,Industrials 339 | NTAP,Information Technology 340 | NTRS,Financials 341 | NUE,Materials 342 | NVDA,Information Technology 343 | NVR,Consumer Discretionary 344 | NWL,Consumer Discretionary 345 | NWS,Communication Services 346 | NWSA,Communication Services 347 | O,Real Estate 348 | ODFL,Industrials 349 | OKE,Energy 350 | OMC,Communication Services 351 | ORCL,Information Technology 352 | ORLY,Consumer Discretionary 353 | OTIS,Industrials 354 | OXY,Energy 355 | PAYC,Information Technology 356 | PAYX,Information Technology 357 | PBCT,Financials 358 | PCAR,Industrials 359 | PEAK,Real Estate 360 | PEG,Utilities 361 | PEP,Consumer Staples 362 | PFE,Health Care 363 | PFG,Financials 364 | PG,Consumer Staples 365 | PGR,Financials 366 | PH,Industrials 367 | PHM,Consumer Discretionary 368 | PKG,Materials 369 | PKI,Health Care 370 | PLD,Real Estate 371 | PM,Consumer Staples 372 | PNC,Financials 373 | PNR,Industrials 374 | PNW,Utilities 375 | POOL,Consumer Discretionary 376 | PPG,Materials 377 | PPL,Utilities 378 | PRGO,Health Care 379 | PRU,Financials 380 | PSA,Real Estate 381 | PSX,Energy 382 | PVH,Consumer Discretionary 383 | PWR,Industrials 384 | PXD,Energy 385 | PYPL,Information Technology 386 | QCOM,Information Technology 387 | 
QRVO,Information Technology 388 | RCL,Consumer Discretionary 389 | RE,Financials 390 | REG,Real Estate 391 | REGN,Health Care 392 | RF,Financials 393 | RHI,Industrials 394 | RJF,Financials 395 | RL,Consumer Discretionary 396 | RMD,Health Care 397 | ROK,Industrials 398 | ROL,Industrials 399 | ROP,Industrials 400 | ROST,Consumer Discretionary 401 | RSG,Industrials 402 | RTX,Industrials 403 | SBAC,Real Estate 404 | SBUX,Consumer Discretionary 405 | SCHW,Financials 406 | SEE,Materials 407 | SHW,Materials 408 | SIVB,Financials 409 | SJM,Consumer Staples 410 | SLB,Energy 411 | SLG,Real Estate 412 | SNA,Industrials 413 | SNPS,Information Technology 414 | SO,Utilities 415 | SPG,Real Estate 416 | SPGI,Financials 417 | SRE,Utilities 418 | STE,Health Care 419 | STT,Financials 420 | STX,Information Technology 421 | STZ,Consumer Staples 422 | SWK,Industrials 423 | SWKS,Information Technology 424 | SYF,Financials 425 | SYK,Health Care 426 | SYY,Consumer Staples 427 | T,Communication Services 428 | TAP,Consumer Staples 429 | TDG,Industrials 430 | TDY,Industrials 431 | TEL,Information Technology 432 | TER,Information Technology 433 | TFC,Financials 434 | TFX,Health Care 435 | TGT,Consumer Discretionary 436 | TJX,Consumer Discretionary 437 | TMO,Health Care 438 | TMUS,Communication Services 439 | TPR,Consumer Discretionary 440 | TRMB,Information Technology 441 | TROW,Financials 442 | TRV,Financials 443 | TSCO,Consumer Discretionary 444 | TSLA,Consumer Discretionary 445 | TSN,Consumer Staples 446 | TT,Industrials 447 | TTWO,Communication Services 448 | TWTR,Communication Services 449 | TXN,Information Technology 450 | TXT,Industrials 451 | TYL,Information Technology 452 | UA,Consumer Discretionary 453 | UAA,Consumer Discretionary 454 | UAL,Industrials 455 | UDR,Real Estate 456 | UHS,Health Care 457 | ULTA,Consumer Discretionary 458 | UNH,Health Care 459 | UNM,Financials 460 | UNP,Industrials 461 | UPS,Industrials 462 | URI,Industrials 463 | USB,Financials 464 | V,Information 
Technology 465 | VAR,Health Care 466 | VFC,Consumer Discretionary 467 | VIAC,Communication Services 468 | VLO,Energy 469 | VMC,Materials 470 | VNO,Real Estate 471 | VNT,Information Technology 472 | VRSK,Industrials 473 | VRSN,Information Technology 474 | VRTX,Health Care 475 | VTR,Real Estate 476 | VTRS,Health Care 477 | VZ,Communication Services 478 | WAB,Industrials 479 | WAT,Health Care 480 | WBA,Consumer Staples 481 | WDC,Information Technology 482 | WEC,Utilities 483 | WELL,Real Estate 484 | WFC,Financials 485 | WHR,Consumer Discretionary 486 | WLTW,Financials 487 | WM,Industrials 488 | WMB,Energy 489 | WMT,Consumer Staples 490 | WRB,Financials 491 | WRK,Materials 492 | WST,Health Care 493 | WU,Information Technology 494 | WY,Real Estate 495 | WYNN,Consumer Discretionary 496 | XEL,Utilities 497 | XLNX,Information Technology 498 | XOM,Energy 499 | XRAY,Health Care 500 | XRX,Information Technology 501 | XYL,Industrials 502 | YUM,Consumer Discretionary 503 | ZBH,Health Care 504 | ZBRA,Information Technology 505 | ZION,Financials 506 | ZTS,Health Care 507 | -------------------------------------------------------------------------------- /img/OESX.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArnaudBu/stock-returns-prediction/218b55e3478de4b07b6c981bd4dc58066be40bff/img/OESX.png -------------------------------------------------------------------------------- /img/SAVA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArnaudBu/stock-returns-prediction/218b55e3478de4b07b6c981bd4dc58066be40bff/img/SAVA.png -------------------------------------------------------------------------------- /img/backtest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArnaudBu/stock-returns-prediction/218b55e3478de4b07b6c981bd4dc58066be40bff/img/backtest.png 
-------------------------------------------------------------------------------- /img/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArnaudBu/stock-returns-prediction/218b55e3478de4b07b6c981bd4dc58066be40bff/img/roc.png -------------------------------------------------------------------------------- /img/tearsheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArnaudBu/stock-returns-prediction/218b55e3478de4b07b6c981bd4dc58066be40bff/img/tearsheet.png -------------------------------------------------------------------------------- /img/top_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArnaudBu/stock-returns-prediction/218b55e3478de4b07b6c981bd4dc58066be40bff/img/top_analysis.png -------------------------------------------------------------------------------- /img/variables_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArnaudBu/stock-returns-prediction/218b55e3478de4b07b6c981bd4dc58066be40bff/img/variables_importance.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | tqdm 3 | dtale 4 | yfinance 5 | yahoofinancials 6 | requests 7 | sklearn 8 | matplotlib 9 | pytorch_tabnet 10 | dateutil -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from pytorch_tabnet.tab_model import TabNetClassifier 2 | import torch 3 | from sklearn.preprocessing import LabelEncoder 4 | from sklearn.preprocessing import StandardScaler 5 | 6 | 7 | def process_data(df, date_valid, date_test, features, target): 8 | 
def process_data(df, date_valid, date_test, features, target):
    """Split ``df`` by date, preprocess features, and build a TabNet model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column, every name in ``features`` and
        the ``target`` column.  An optional precomputed ``set`` column
        ("train"/"valid"/"test") is honoured if already present.
    date_valid, date_test
        Rows with ``date > date_valid`` become validation rows, rows
        with ``date > date_test`` become test rows.
    features : list of str
    target : str

    Returns
    -------
    tuple
        ``(clf, X_train, y_train, X_valid, y_valid, X_test, y_test,
        indices)`` where ``indices`` is the per-row split label array.
    """
    data = df.copy()
    # Split datasets by date, unless the caller supplied a "set" column.
    if "set" not in data.columns:
        data["set"] = "train"
        data.loc[data.date > date_valid, "set"] = "valid"
        data.loc[data.date > date_test, "set"] = "test"
    train_indices = data[data.set == "train"].index
    valid_indices = data[data.set == "valid"].index
    test_indices = data[data.set == "test"].index
    indices = data.set.values

    # Select data
    data = data[features + [target]]

    # Columns that are objects or have fewer than 200 distinct values are
    # treated as categorical and label-encoded; the rest are mean-imputed
    # and standardised.
    # NOTE(review): encoders and scalers are fit on the FULL dataset, so
    # validation/test statistics leak into preprocessing — confirm this
    # is acceptable for the backtest.
    nunique = data.nunique()
    types = data.dtypes
    categorical_columns = []
    categorical_dims = {}
    for col in data.columns:
        if types[col] == 'object' or nunique[col] < 200:
            l_enc = LabelEncoder()
            data[col] = data[col].fillna("Unknown")
            data[col] = l_enc.fit_transform(data[col].values)
            categorical_columns.append(col)
            categorical_dims[col] = len(l_enc.classes_)
        else:
            # Assignment instead of chained inplace fillna: the inplace
            # chained form is deprecated/fragile in recent pandas.
            data[col] = data[col].fillna(data[col].mean())
            scaler = StandardScaler()
            data[[col]] = scaler.fit_transform(data[[col]])
    cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
    cat_dims = [categorical_dims[f] for f in features
                if f in categorical_columns]

    # Define model
    clf = TabNetClassifier(cat_idxs=cat_idxs,
                           cat_dims=cat_dims,
                           cat_emb_dim=1,
                           optimizer_fn=torch.optim.Adam,
                           optimizer_params=dict(lr=1e-2),
                           scheduler_params={"step_size": 10,
                                             "gamma": 0.9},
                           scheduler_fn=torch.optim.lr_scheduler.StepLR,
                           mask_type='entmax'
                           )

    # Datasets
    X_train = data[features].values[train_indices]
    y_train = data[target].values[train_indices]
    X_valid = data[features].values[valid_indices]
    y_valid = data[target].values[valid_indices]
    X_test = data[features].values[test_indices]
    y_test = data[target].values[test_indices]

    return clf, X_train, y_train, X_valid, y_valid, X_test, y_test, indices
--------------------------------------------------------------------------------