├── .gitignore ├── DL_Finance_Project_2.pdf ├── README.md ├── R_squares ├── 2023-06-09_05-24-47.json ├── 2023-06-09_06-07-56.json ├── 2023-06-09_09-08-26.json └── 2023-06-11_18-16-38.json ├── analysis.py ├── data_prepare.py ├── imgs ├── R2_pred_table.png ├── R2_total_table.png ├── alpha │ ├── CA0_1_inference_alpha_plot.png │ ├── CA0_2_inference_alpha_plot.png │ ├── CA0_3_inference_alpha_plot.png │ ├── CA0_4_inference_alpha_plot.png │ ├── CA0_5_alpha_plot.png │ ├── CA0_5_inference_alpha_plot.png │ ├── CA0_6_inference_alpha_plot.png │ ├── CA1_1_inference_alpha_plot.png │ ├── CA1_2_inference_alpha_plot.png │ ├── CA1_3_inference_alpha_plot.png │ ├── CA1_4_inference_alpha_plot.png │ ├── CA1_5_inference_alpha_plot.png │ ├── CA1_6_inference_alpha_plot.png │ ├── CA2_1_inference_alpha_plot.png │ ├── CA2_2_inference_alpha_plot.png │ ├── CA2_3_inference_alpha_plot.png │ ├── CA2_4_inference_alpha_plot.png │ ├── CA2_5_inference_alpha_plot.png │ ├── CA2_6_inference_alpha_plot.png │ ├── CA3_1_inference_alpha_plot.png │ ├── CA3_2_inference_alpha_plot.png │ ├── CA3_3_inference_alpha_plot.png │ ├── CA3_4_inference_alpha_plot.png │ ├── CA3_5_inference_alpha_plot.png │ ├── CA3_6_inference_alpha_plot.png │ ├── FF_1_inference_alpha_plot.png │ ├── FF_2_inference_alpha_plot.png │ ├── FF_3_inference_alpha_plot.png │ ├── FF_4_inference_alpha_plot.png │ ├── FF_5_inference_alpha_plot.png │ ├── FF_6_inference_alpha_plot.png │ ├── IPCA_1_inference_alpha_plot.png │ ├── IPCA_2_inference_alpha_plot.png │ ├── IPCA_3_inference_alpha_plot.png │ ├── IPCA_4_inference_alpha_plot.png │ ├── IPCA_5_inference_alpha_plot.png │ ├── IPCA_6_inference_alpha_plot.png │ ├── PCA_1_inference_alpha_plot.png │ ├── PCA_2_inference_alpha_plot.png │ ├── PCA_3_inference_alpha_plot.png │ ├── PCA_4_inference_alpha_plot.png │ ├── PCA_5_inference_alpha_plot.png │ └── PCA_6_inference_alpha_plot.png ├── omit_char_R2_bias.png ├── pred_R2.png └── total_R2.png ├── main.py ├── models ├── CA.py ├── FF.py ├── IPCA.py ├── PCA.py └── 
modelBase.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # raw data 2 | data/ 3 | data.zip 4 | __MACOSX/ 5 | new_data.zip 6 | __pycache__ 7 | models/__pycache__ 8 | saved_models 9 | *_loss_*.png 10 | *.ipynb 11 | results/ 12 | logs/ 13 | R_squares/ 14 | -------------------------------------------------------------------------------- /DL_Finance_Project_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/DL_Finance_Project_2.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Autoencoder-Asset-Pricing-Models 2 | 3 | 🧐 [**Report**](https://www.richardsong.live/autoencoder-asset-pricing-models) | [**Report PDF file**](https://cloud.tsinghua.edu.cn/f/c02804bed00b4083bcb7/?dl=1) 4 | ## Set Up 5 | 6 | ```bash 7 | # generate preprocessed data and download portfolio returns 8 | python data_prepare.py 9 | 10 | # train models (ALL together) 11 | python main.py --Model 'FF PCA IPCA CA0 CA1 CA2 CA3' --K '1 2 3 4 5 6' 12 | 13 | # train models (selected models and K, for example) 14 | python main.py --Model 'IPCA CA3' --K '5 6' 15 | 16 | # analyze characteristics' importance (if needed) 17 | python main.py --Model 'IPCA CA0 CA1 CA2 CA3' --K '5' --omit_char 'absacc acc age agr bm bm_ia cashdebt cashpr cfp cfp_ia chatoia chcsho chempia chinv chpmia convind currat depr divi divo dy egr ep gma grcapx grltnoa herf hire invest lev lgr mve_ia operprof orgcap pchcapx_ia pchcurrat pchdepr pchgm_pchsale pchquick pchsale_pchinvt pchsale_pchrect pchsale_pchxsga pchsaleinv pctacc ps quick rd rd_mve rd_sale realestate roic salecash saleinv salerec secured securedind sgr sin sp tang tb aeavol cash chtx cinvest ear ms nincr roaq roavol roeq rsup 
stdacc stdcf baspread beta betasq chmom dolvol idiovol ill indmom maxret mom12m mom1m mom36m mom6m mvel1 pricedelay retvol std_dolvol std_turn turn zerotrade' 18 | 19 | # analyze models (calculate R^2, plot R^2 tables, bars and bias heatmap) 20 | python analysis.py 21 | ``` 22 | ## Results 23 | ### Total R^2 (%) 24 | 25 | 26 | 27 | 28 | ### Predict R^2 (%) 29 | 30 | 31 | 32 | ### Risk Premia v.s. Mispricing 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 | 48 | ### Characteristics Importance (reduced total R^2 (%), K=5) 49 | 50 | -------------------------------------------------------------------------------- /R_squares/2023-06-09_05-24-47.json: -------------------------------------------------------------------------------- 1 | {"models": ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"], "omit_char": [""], "R2_total": [0.08537139414421824, 0.1576019101919831, 0.1986486217133806, 0.20315476596988524, 0.31397093775365037, 0.3616431120471959]} -------------------------------------------------------------------------------- /R_squares/2023-06-09_06-07-56.json: -------------------------------------------------------------------------------- 1 | {"models": ["CA0_1", "CA0_2", "CA0_3", "CA0_4", "CA0_5", "CA0_6", "CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA1_6", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA2_6", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5", "CA3_6"], "omit_char": [""], "R2_total": [ 2 | 0.5677639760646643, 0.6761207104643877, 0.7076066105263652, 0.6913661386379286, 0.662602500272096, 0.7110612627936461, 3 | 0.5517562872860107, 0.7025783407556893, 0.685776051607686, 0.6664443573030849, 0.7006957708196195, 0.7052861947690043, 4 | 0.5967130036325399, 0.6626964974803786, 0.6608531336078073, 0.7070314610106503, 0.6462021917956272, 0.6767568343936613, 5 | 0.5531676704426002, 0.5249032928672436, 0.5642100044551001, 0.5458004779254889, 0.5558832641978944, 0.5235321637890534]} -------------------------------------------------------------------------------- /R_squares/2023-06-09_09-08-26.json: -------------------------------------------------------------------------------- 1 | {"models": ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"], "omit_char": [""], "R2_total": [0.4061160826307103, 0.5300364609271587, 0.5913033228863098, 0.6246396597772854, 0.6467919712825208, 0.6720178863573743]} -------------------------------------------------------------------------------- 
/R_squares/2023-06-11_18-16-38.json: -------------------------------------------------------------------------------- 1 | {"models": ["IPCA_5", "CA0_5", "CA1_5", "CA2_5", "CA3_5"], "omit_char": ["absacc", "acc", "age", "agr", "bm", "bm_ia", "cashdebt", "cashpr", "cfp", "cfp_ia", "chatoia", "chcsho", "chempia", "chinv", "chpmia", "convind", "currat", "depr", "divi", "divo", "dy", "egr", "ep", "gma", "grcapx", "grltnoa", "herf", "hire", "invest", "lev", "lgr", "mve_ia", "operprof", "orgcap", "pchcapx_ia", "pchcurrat", "pchdepr", "pchgm_pchsale", "pchquick", "pchsale_pchinvt", "pchsale_pchrect", "pchsale_pchxsga", "pchsaleinv", "pctacc", "ps", "quick", "rd", "rd_mve", "rd_sale", "realestate", "roic", "salecash", "saleinv", "salerec", "secured", "securedind", "sgr", "sin", "sp", "tang", "tb", "aeavol", "cash", "chtx", "cinvest", "ear", "ms", "nincr", "roaq", "roavol", "roeq", "rsup", "stdacc", "stdcf", "baspread", "beta", "betasq", "chmom", "dolvol", "idiovol", "ill", "indmom", "maxret", "mom12m", "mom1m", "mom36m", "mom6m", "mvel1", "pricedelay", "retvol", "std_dolvol", "std_turn", "turn", "zerotrade"], "R2_total": [1.5442819689237552e-05, -0.0008641544212459884, -0.0001747076111721091, -6.028723743389808e-05, 0.0004881307548909586, -9.453040450568828e-05, 0.0011809606037788134, -0.00047160860320083486, 0.0004497328261676703, 0.00035797708987406196, 0.00046514268643882417, 8.010321267071241e-05, 0.0005184883538837948, -0.00037818109386877907, -5.991642326275137e-05, 0.0002601835988962353, 0.0011439216429843801, 0.0017112133538985663, -4.350496459193387e-06, -0.00019380257770462705, 0.00043948388552850215, 0.00031580941297537635, 0.003338161066369527, 0.0021027676465422696, -0.0003171094148499698, 0.0014555971916409005, -0.0007270330656120594, 0.0016980056916262587, 0.0009553837759342931, 0.0027868308676146647, 0.00026402683391868464, 0.0007850527331111357, 0.0020449984309897085, 0.001054644787377823, 5.184771678434785e-06, 0.002178892159566903, 
-0.00017501728488655832, -0.0002577126732409285, 0.0009543687413273716, 0.0005083110135046809, 7.62809849720325e-05, 8.658720973830913e-05, 0.0004486453095020604, 0.0008906763104503668, 0.0010605476004982295, 0.0018694816639623912, 0.0002959972532902144, 0.0017008605711600344, 0.0007976861126247625, -0.002652802485675565, 0.002662754828368419, 9.917422749694538e-05, 5.964285215753762e-05, 0.0022189518272991426, -7.435817815093504e-05, -0.001698616694105315, 0.001233404502690938, 0.00047054690788606024, 0.0026004651808273493, 0.0011172780845385422, -0.0001482646856509895, -0.00041623695665415905, -0.0015909825974204095, -0.00031927406061749153, -0.00043655338823822554, 0.0001361268145957384, -0.00032109572478289383, 8.887812899915914e-05, 0.002067090875226163, 0.0010982209839661694, -0.0005637330110257466, -0.0005329012820900481, 0.0009084092837141622, 0.0021178568491788674, 0.0033586699296590528, 0.03875439641206746, 0.007293900327323799, 0.004763913685328358, 0.004817433432478202, 0.007084909806944539, 0.0025907095069127584, 0.0080607854611624, 0.0057093251947115675, 0.009105559086416637, 0.011542299391846145, 0.0028401829485293906, 0.02709723337680492, 0.004118952479980065, -0.00025137933247576516, 0.018393986829754794, 0.003372828092497837, 0.0013541844149635995, 0.00793699314016949, 0.0036518023385704312, -0.01840451865415549, 0.0035504742986338655, -0.02124055707570216, -0.003382787260082454, 0.001661945882751592, -8.216386260140318e-05, -0.011643569891665484, -0.0015886009587249283, -0.0060291572912980484, -0.003934251792188981, -0.0003035868273979503, 0.0025248914169603287, -0.0004996588466505969, 0.0009582432785717465, -0.0017444408411588785, -0.004805523268163303, -0.002931825562557, -0.014321167551977099, -0.005731926091473105, -0.001628883333692932, -0.008380269170191412, -0.002495037120210153, -0.013508701388505795, 0.001503549704314544, -0.0007751891041964942, -0.0007030969454133729, -0.0028340425016759596, 0.002562114878683408, 0.0007168131966963642, 
0.0005966917290103346, -0.0009823429746401713, -0.008049996513831648, -0.006935426188849236, -0.013752522526764732, -0.0005069144917650981, 0.0008889578779507357, -0.004511542543265024, 0.000512296750466934, -0.0002943171885458895, -0.00022271179096300386, -0.0021640651213321593, 0.0002623110944273144, -0.00026747487114753277, -0.0027015552337811277, -0.012375056721493305, -0.00010214235568639651, -0.012322425151997773, -0.004802562616962769, -0.0040789335496304036, 0.0004565130977485232, -0.0004572283160489965, 0.011102582602458222, -0.0020968867403851066, -0.0011853937188981423, -0.005591149809828, -0.010832419289181106, -0.001980246263801777, -0.0017540247099745443, 0.0018022912780594202, -0.0005056805241561158, -0.0017686227072655214, -0.0003527295679881526, 0.016529002299418005, -0.0014105862431510463, -0.0005732444581987295, -0.002869230597707273, -0.004965959446656232, 0.0004066440403122096, -0.009874817195565821, -0.030274345848102402, -0.012926682404285739, -0.0008783158084264553, -0.006107566065733372, -0.010184352477880187, -0.025327970485242712, -0.035590091815137614, -0.0303458179931807, -0.004102272911347127, -0.0004503926531160829, -0.029369754046835173, -0.007045255786105931, 0.0016875597888881266, -0.03481944581257812, -0.006678644726203498, -0.014538583275030104, -0.0007153352724629247, -0.011420023006262658, -0.004652763809388283, -0.005798820819408745, -0.024656178382399974, 0.0011578591100704916, -0.005280277401536693, 0.0010356576649497296, 0.003940580861575116, -0.005276868987642791, 0.00770392753313931, -0.007944469488529449, 0.001991397641811221, 0.003759694548383874, 0.0011195084904335184, -0.007110697689415635, 0.0032203225902148747, -0.007578862375151041, -0.0060194039077946515, 0.00024130283639867134, 0.0013235404395363082, 0.00043243599875075756, 0.001123780074328562, 0.00050973691967926, -0.00034757211361624574, -0.002281972957101308, 0.000845475795447137, -0.0020588684564848414, 0.0021372282150149413, -0.00019856545094609768, 
-0.0017913068956328937, 0.0006373160107784326, 0.0012739985965753986, -0.001696571054130902, -0.0002589327068805991, -0.00022288072963450034, -0.0031104722681001284, -0.0008235753576365523, -0.002447807642767752, 0.002004633800402944, -0.003224266216026561, 0.00864564856685679, -0.00554499989692514, -0.00026444194813035615, -0.0010032832870013886, -0.0032213627206253426, -0.000911160024681501, 0.0004076477827085201, 0.0007995060213979999, -0.0005885363844878588, 0.0032955989913380224, 0.0004894614286667931, 0.002261644799368967, -0.0032746021967355876, -0.0005524850007211368, -0.0031971616217488785, 0.0060628232866996035, -0.0018951843317909223, -0.004804670310596504, -0.016865397416521932, 0.0022279632468085175, 0.0004397955748330906, -0.0012895574791834674, -0.0052936056246406515, 0.001047196960554997, 0.0014035810745537391, -0.0021789054250395123, 0.004835618627879845, -0.004423137045597492, 0.0033839427269372058, 0.0004317648131846319, 0.010148684107688322, -0.0005852067203551137, -0.00023673829544612612, -0.0004969772951760598, -0.0007139106069110612, -0.0006781488781816281, -0.004463513032158084, -0.008314803798795345, 0.005664192158806869, -0.0017831527387602852, -0.005528449668342539, -0.00018355344047649158, -0.013367195182721558, 0.010476427486050932, -0.0030995623273649686, -0.0033128122970741414, 0.004656006266778645, -0.010849289595229128, -0.002397579088451285, -0.009958364378056861, -0.0011444352933996926, 0.0021166335185883733, -0.018000165477872865, -0.0032624694006755384, -0.004127065179238776, 0.012949255788881175, -0.0018095543583533935, -0.005592349116810835, 0.0037267653611244844, -0.0038516820526487416, 0.024489734744573943, 0.004683327531111781, 0.001269388150738071, -0.0017407506416220464, 0.011902507395885942, 0.004639839618832409, 0.002506386658879589, 0.0003685885982728232, 0.009029032176506746, 0.003135888070068371, -0.0016520809387076119, 0.003088001303703236, 0.00014727469647846103, 0.0035561159496605432, -2.876814735663924e-05, 
-1.1060450691990908e-06, -0.00046745210145082705, 0.0012870350251698026, 0.0042191397766525585, 0.00655774915823315, 0.0005207348452095362, 0.0023795699587515484, 0.0043810533565454834, 0.004609257547817935, 0.01290056247180027, 0.0014757109716835304, 0.0010271818164879765, 0.0015607194203527408, 0.0004559197274759397, 0.00033707593565324157, 0.002642703381714573, 0.006501328950226926, 0.0008704244332061739, 0.002463581165456641, 0.004364567880446146, -0.0018809110325389566, 0.00017150256113329654, 0.00017711236826822851, -7.174454357028459e-05, 3.2134260813054816e-05, -0.0018074430724653867, 0.0007488238809050252, -0.00021827012815400781, -0.0002562985350954561, 0.00023716735492340657, 0.001413468319080624, 0.006882923410709174, 0.0007931366577244026, -0.0004717657827425503, 0.009100363328674477, 0.005404627376927373, -0.004026623783044081, 0.005351064544848239, -0.0035389768278023537, 0.002068255236817085, -0.002090048742856676, -0.0019271886285645579, 0.0011498135139685894, 0.0018314054258520285, -0.00026283971236906734, 0.0015413975782866407, -0.004150725533632493, -0.001320978589455457, 0.00031339691627352284, -0.0016566925476326766, -0.0006220275171834322, 7.44998579660372e-05, -0.0002999797537633908, 0.0007030142990428478, 0.0007913912947924429, 0.0011556895007599488, 0.011568536396979079, 0.0028225455085252316, 0.00035905603670860486, -0.000841494670420162, 0.009615860981466828, 0.03377755052540243, 0.0789273078045386, 0.05525867670164508, -0.0010166697260519664, 0.00957764643602288, 0.06977357007791107, 0.006012541368369928, 0.002010875608860041, 0.03032162865092458, 0.002846750112344365, -0.00016217114898731122, 0.0060712783132312875, 0.006622682402069313, 0.006219871147256417, 0.0018001652923477218, 0.06992553842572524, 0.006724233973021909, 0.007539747957793552, 0.020538531613926936, 0.007943897075231354, 0.008360812789827254, 0.003290025441256006, 0.020634865090878085, 0.0008080876743085108, -0.0030892486063024416, 0.00041600088024251747, 
0.0070829632445147395, 0.0036536124147609206, -0.0016869382360118479, 0.0034263287960714095, -0.0006554657701027811, 0.0029753478023664126, 0.0006174977901268752, 7.101865790082318e-05, -0.00016257411215181428, -0.0009369820715646737, 0.0005434387564733356, 0.0012271814940856274, 0.0017175994942115747, 0.0005473661944555008, 0.005273938186585614, 0.0027605830798231867, 0.0053267195601733874, -0.0008641356489738072, 0.0003752916592929534, -8.763321084148679e-05, 0.0015245669500787429, -0.00012165228700977693, 0.0007204145703794129, 7.33961001830874e-06, 0.0007303311512011357, 0.002484907097797917, 0.012401690162314405, 0.002021342443194185, -0.00036144987650921223, -0.0012055569846747272, 0.00085115010144865, 2.829758256761572e-05, -3.762854703992513e-05, 0.0006878905429239524, -0.0007663772085586551, 0.0001906678540952722, 0.0001922411569781346, 0.001457809117053288, 0.0061655533145379415, -0.0002077338553436725, 0.004428694252206933, 0.007572691730434067, 0.015522139371950017, -0.0007973496961124482, 0.010913869326709902, -0.0009089608282075723, 0.001704153441209666, -0.001975343179565603, 0.0007767569381108563, 0.0001946668039775057, -0.0003049050619283733, -0.0006077903992949274, -0.0014501145568337481, 0.0024408470743457755, 0.0024867530414642847, -0.00028112355434628533, 0.00029719386409921, -0.0005815474877897131, -9.683347434896739e-05, 0.0015214781874257621, 0.00038445751824878194, -0.0006103274076518783, 0.0016081931094918955, 0.008255889709628095, 0.013087103367242059, 0.0004902027422780675, 0.007683661011998355, 0.0006227685572650632, 0.040959894211064496, 0.04659799783753149, 0.05623489106446333, -0.0011074112724636098, 0.0026292183555661763, 0.06648468153042297, 0.008507328563525984, 0.0012577368022517188, 0.05146040805311081, 0.006280620030596484, 0.0025791825898906495, 0.006777579940425715, 0.009547184931625763, 0.008159048875296504, 0.0008150683146730398, 0.06841724518294723, -0.0003192050956154491, 0.005694154235833304, 0.0024969631810545234, 
-0.001106586510389418]} -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from utils import * 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import plotly.figure_factory as ff 7 | 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | 12 | def calculate_R2(model, type, input=None, complete_r=None): 13 | portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 14 | oos_ret = portfolio_ret.loc[(portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)] 15 | 16 | if not isinstance(input, np.ndarray): 17 | # print('type: ', type) 18 | if isinstance(model, str): 19 | output_path = f'results/{type}/{model}_{type}.csv' 20 | else: 21 | output_path = f'results/{type}/{model.name}_{type}.csv' 22 | # print('path : ', output_path) 23 | model_output = pd.read_csv(output_path) 24 | else: 25 | model_output = input 26 | model_output = pd.DataFrame(model_output, columns=CHARAS_LIST) 27 | model_output['DATE'] = oos_ret['DATE'].to_list() 28 | 29 | for col in model_output.columns: # hard code for format error 30 | model_output[col] = model_output[col].apply(lambda x: float(str(x).replace('[', '').replace(']', ''))) 31 | 32 | residual_square = ((oos_ret.set_index('DATE') - model_output.set_index('DATE'))**2).dropna() 33 | residual_square = (1 - (residual_square == np.inf) * 1.0) * residual_square # drop Inf outliers 34 | 35 | total_square = oos_ret.set_index('DATE')**2 36 | total_square = (1 - (total_square == np.inf) * 1.0) * total_square # drop Inf outliers 37 | 38 | model_output_R2 = 1 - np.sum(residual_square.values)/np.sum(total_square.values) 39 | 40 | if not isinstance(input, np.ndarray): 41 | return model_output_R2 42 | 43 | else: 44 | no_omit_output = complete_r 45 | no_omit_output = pd.DataFrame(no_omit_output, columns=CHARAS_LIST) 46 | 
no_omit_output['DATE'] = oos_ret['DATE'].to_list() 47 | 48 | no_omit_residual_square = ((oos_ret.set_index('DATE') - no_omit_output.set_index('DATE'))**2).dropna() 49 | no_omit_residual_square = (1 - (no_omit_residual_square == np.inf) * 1.0) * no_omit_residual_square # drop Inf outliers 50 | 51 | no_omit_model_output_R2 = 1 - np.sum(no_omit_residual_square.values)/np.sum(total_square.values) 52 | 53 | return no_omit_model_output_R2 - model_output_R2 # the difference of R^2, i.e. the importance of characteristics 54 | 55 | 56 | 57 | def alpha_plot(model, type, save_dir='imgs'): 58 | if 'alpha' not in os.listdir(save_dir): 59 | os.mkdir(f'{save_dir}/alpha') 60 | 61 | portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 62 | oos_result = portfolio_ret.loc[(portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)].set_index('DATE') 63 | 64 | output_path = f'results/{type}/{model.name}_{type}.csv' 65 | inference_result = pd.read_csv(output_path) 66 | inference_result = inference_result.set_index('DATE') 67 | 68 | pricing_error_analysis = [] 69 | for col in CHARAS_LIST: 70 | raw_return = oos_result[col].mean() 71 | error = oos_result[col] - inference_result[col] 72 | alpha = error.mean() 73 | t_stat = abs(error.mean()/error.std()) * np.sqrt(oos_result.shape[0]) 74 | pricing_error_analysis.append([raw_return, alpha, t_stat]) 75 | 76 | pricing_error_analysis = pd.DataFrame(pricing_error_analysis, columns = ['raw ret', 'alpha', 't_stat'], index=CHARAS_LIST) 77 | 78 | lower_point = min(np.min(pricing_error_analysis['raw ret']), np.min(pricing_error_analysis['alpha'])) * 1.15 79 | upper_point = max(np.max(pricing_error_analysis['raw ret']), np.max(pricing_error_analysis['alpha'])) * 1.15 80 | 81 | significant_mask = pricing_error_analysis['t_stat'] > 3 82 | 83 | plt.scatter(pricing_error_analysis.loc[significant_mask]['raw ret'], pricing_error_analysis.loc[significant_mask]['alpha'], marker='^', color='r', alpha=0.6, 
label=f'#Alphas(|t|>3.0)={np.sum(significant_mask*1.0)}') 84 | plt.scatter(pricing_error_analysis.loc[~significant_mask]['raw ret'], pricing_error_analysis.loc[~significant_mask]['alpha'], marker='o', color='b', alpha=0.6, label=f'#Alphas(|t|<3.0)={94-np.sum(significant_mask*1.0)}') 85 | plt.plot(np.linspace(lower_point, upper_point, 10), np.linspace(lower_point, upper_point, 10), color='black') 86 | 87 | plt.ylabel('Alpha (%)') 88 | plt.xlabel('Raw Return (%)') 89 | plt.legend() 90 | 91 | plt.title(model.name) 92 | plt.savefig(f'{save_dir}/alpha/{model.name}_inference_alpha_plot.png') 93 | plt.close() 94 | 95 | 96 | def plot_R2_bar(R_df, type): 97 | 98 | R_df['Model'] = R_df[0].apply(lambda x: x.split('_')[0]) 99 | 100 | labels = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'] 101 | FF = (R_df.loc[R_df['Model']=='FF'][1]*100).to_list() 102 | PCA = (R_df.loc[R_df['Model']=='PCA'][1]*100).to_list() 103 | IPCA = (R_df.loc[R_df['Model']=='IPCA'][1]*100).to_list() 104 | CA0 = (R_df.loc[R_df['Model']=='CA0'][1]*100).to_list() 105 | CA1 = (R_df.loc[R_df['Model']=='CA1'][1]*100).to_list() 106 | CA2 = (R_df.loc[R_df['Model']=='CA2'][1]*100).to_list() 107 | CA3 = (R_df.loc[R_df['Model']=='CA3'][1]*100).to_list() 108 | 109 | 110 | x = np.arange(len(labels)) # x positions of the K-group labels 111 | width = 0.11 112 | 113 | fig, ax = plt.subplots(figsize=(15, 5)) 114 | ax.bar(x - width*3 , FF, width, label='FF', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[1])) 115 | ax.bar(x - width*2 , PCA, width, label='PCA', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[2])) 116 | ax.bar(x - width , IPCA, width, label='IPCA', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[3])) 117 | ax.bar(x + 0.00, CA0, width, label='CA0', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[4])) 118 | ax.bar(x + width , CA1, width, label='CA1', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[5])) 119 | ax.bar(x + width*2 , CA2, width, label='CA2', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[6])) 120 | ax.bar(x + width*3 , CA3, 
width, label='CA3', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[7])) 121 | 122 | 123 | ax.set_ylabel(f'Portfolio {type} R^2 (%)') 124 | ax.set_xticks(x) 125 | ax.set_xticklabels(labels) 126 | ax.legend() 127 | 128 | fig.tight_layout() 129 | 130 | plt.savefig(f'imgs/{type}_R2.png') 131 | plt.close() 132 | 133 | 134 | 135 | def plot_R2_table(R_df, type): 136 | plt.figure(dpi=200) 137 | 138 | for col in R_df.columns: 139 | R_df[col] = R_df[col].apply(lambda x: round_number(x)) 140 | 141 | R_df = R_df.reset_index() 142 | R_df.columns = ['Model', 'K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'] 143 | 144 | 145 | fig_total = ff.create_table(R_df, 146 | colorscale=[[0, 'white'], 147 | [0.01, 'lightgrey'], 148 | [1.0, 'white']], 149 | font_colors=['#000000', '#000000', 150 | '#000000']) 151 | fig_total.update_layout( 152 | autosize=False, 153 | width=500, 154 | height=200, 155 | ) 156 | fig_total.write_image(f"imgs/R2_{type}_table.png", scale=4) 157 | 158 | 159 | 160 | def round_number(num): 161 | num = str(round(num*100, 2)) 162 | while len(num.split('.')[1]) < 2: 163 | num = num + '0' 164 | return num 165 | 166 | 167 | 168 | if __name__=="__main__": 169 | CAs = ["CA0_1", "CA0_2", "CA0_3", "CA0_4", "CA0_5", "CA0_6", "CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA1_6", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA2_6", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5", "CA3_6"] 170 | FFs = ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"] 171 | PCAs = ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"] 172 | IPCAs = ["IPCA_1", "IPCA_2", "IPCA_3", "IPCA_4", "IPCA_5", "IPCA_6"] 173 | models = FFs + PCAs + IPCAs + CAs 174 | 175 | ## Plot R^2 bars 176 | total_R2 = [] 177 | for m in models: 178 | total_R2.append(calculate_R2(m, 'inference')) 179 | R_total = pd.DataFrame([models, total_R2]).T 180 | 181 | predict_R2 = [] 182 | for m in models: 183 | predict_R2.append(calculate_R2(m, 'predict')) 184 | R_pred = pd.DataFrame([models, predict_R2]).T 185 | 186 | plot_R2_bar(R_total, 
'total') 187 | plot_R2_bar(R_pred, 'pred') 188 | 189 | ## Save R^2 tables 190 | R_total_df = pd.DataFrame(np.array(total_R2).reshape(-1, 6), columns = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'], index=['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3']) 191 | R_pred_df = pd.DataFrame(np.array(predict_R2).reshape(-1, 6), columns = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'], index=['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3']) 192 | 193 | plot_R2_table(R_total_df, 'total') 194 | plot_R2_table(R_pred_df, 'pred') 195 | 196 | 197 | ## Plot characteristics importance heatmap 198 | # models = ["IPCA", "CA0_5", "CA1_5", "CA2_5", "CA3_5"] 199 | # #TODO: paste results from R_squares/ 200 | # R2_omit = [] 201 | # R_minus = pd.DataFrame(np.array(R2_omit).reshape(-1, 94)*100, index=models, columns=CHARAS_LIST).T 202 | # char_ranks = R_minus.T.sum().argsort().argsort().index.to_list() 203 | # char_ranks.reverse() 204 | 205 | # plt.figure(figsize=(8, 15), dpi=200) 206 | # sns.heatmap(R_minus.T[char_ranks].T, cmap='Blues', linewidths=0.6) 207 | # plt.savefig('imgs/omit_char_R2_bias.png', bbox_inches='tight') 208 | # plt.close() 209 | -------------------------------------------------------------------------------- /data_prepare.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | pd.options.mode.chained_assignment = None 4 | from tqdm import tqdm 5 | 6 | import os 7 | import zipfile 8 | from joblib import delayed, Parallel 9 | from itertools import product 10 | from utils import CHARAS_LIST 11 | 12 | import warnings 13 | warnings.filterwarnings('ignore') 14 | 15 | if 'data.zip' not in os.listdir(): 16 | os.system('wget https://cloud.tsinghua.edu.cn/f/07d6a0223d054247af26/?dl=1 -O data.zip') 17 | 18 | if 'data' not in os.listdir(): 19 | os.mkdir('data') 20 | os.system('wget https://cloud.tsinghua.edu.cn/f/179082ecf0f147a4840c/?dl=1 -O portfolio_ret.pkl') 21 | os.system('wget 
https://cloud.tsinghua.edu.cn/f/b93c6ae7e2014d3a951e/?dl=1 -O ff5.csv') 22 | os.system('wget https://cloud.tsinghua.edu.cn/f/5f077be9eda0428ab7e5/?dl=1 -O UMD.csv') 23 | os.system('wget https://cloud.tsinghua.edu.cn/f/a916da12d5a9450eb0df/?dl=1 -O p_charas.pkl') 24 | 25 | os.system('mv portfolio_ret.pkl data') 26 | os.system('mv ff5.csv data') 27 | os.system('mv UMD.csv data') 28 | os.system('mv p_charas.pkl data') 29 | 30 | 31 | with zipfile.ZipFile('data.zip', 'r') as z: 32 | with z.open('data/month_ret.pkl') as f: 33 | print('Reading month_ret.pkl', end=' ') 34 | mon_ret = pd.read_pickle(f) 35 | mon_ret.to_pickle('data/month_ret.pkl') 36 | print('Done!') 37 | 38 | with z.open('data/datashare.pkl') as f: 39 | print('Reading datashare.pkl', end=' ') 40 | datashare = pd.read_pickle(f) 41 | datashare['DATE'].drop_duplicates().reset_index(drop=True).to_pickle('data/mon_list.pkl') 42 | # datashare.to_pickle('data/datashare.pkl') 43 | print('Done!') 44 | 45 | 46 | 47 | def pre_process(date): 48 | cross_slice = datashare.loc[datashare.DATE == date].copy(deep=False) 49 | omitted_mask = 1.0 * np.isnan(cross_slice.loc[cross_slice['DATE'] == date]) 50 | # fill nan values with each factor's median 51 | cross_slice.loc[cross_slice.DATE == date] = cross_slice.fillna(0) + omitted_mask * cross_slice.median() 52 | # if all stocks' factor is nan, fill by zero 53 | cross_slice.loc[cross_slice.DATE == date] = cross_slice.fillna(0) 54 | 55 | re_df = [] 56 | # rank normalization 57 | for col in CHARAS_LIST: 58 | series = cross_slice[col] 59 | de_duplicate_slice = pd.DataFrame(series.drop_duplicates().to_list(), columns=['chara']) 60 | series = pd.DataFrame(series.to_list(), columns=['chara']) 61 | # sort and assign rank, the same value should have the same rank 62 | de_duplicate_slice['sort_rank'] = de_duplicate_slice['chara'].argsort().argsort() 63 | rank = pd.merge(series, de_duplicate_slice, left_on='chara', right_on='chara', how='right')['sort_rank'] 64 | # if all values are zero, 
the results will contain nan 65 | rank_normal = ((rank - rank.min())/(rank.max() - rank.min())*2 - 1) 66 | re_df.append(rank_normal) 67 | re_df = pd.DataFrame(re_df, index=CHARAS_LIST).T.fillna(0) 68 | re_df['permno'] = list(cross_slice['permno'].astype(int)) 69 | re_df['DATE'] = list(cross_slice['DATE'].astype(int)) 70 | 71 | return re_df[['permno', 'DATE'] + CHARAS_LIST] 72 | 73 | 74 | 75 | def cal_portfolio_ret(it, df): 76 | d, f = it[0], it[1] 77 | # long portfolio, quantile 0.0~0.1; short portfolio, quantile 0.9~1.0 78 | long_portfolio = df.loc[df.DATE == d][['permno', f]].sort_values(by=f, ascending=False)[:df.loc[df.DATE == d].shape[0]//10]['permno'].to_list() 79 | short_portfolio = df.loc[df.DATE == d][['permno', f]].sort_values(by=f, ascending=False)[-df.loc[df.DATE == d].shape[0]//10:]['permno'].to_list() 80 | # long-short portfolio return 81 | long_ret = mon_ret.loc[mon_ret.date == d].drop_duplicates('permno').set_index('permno').reindex(long_portfolio)['ret-rf'].dropna().mean() 82 | short_ret = mon_ret.loc[mon_ret.date == d].drop_duplicates('permno').set_index('permno').reindex(short_portfolio)['ret-rf'].dropna().mean() 83 | chara_ret = 0.5*(long_ret - short_ret) 84 | 85 | return chara_ret 86 | 87 | 88 | def cal_portfolio_charas(month, df): 89 | mon_portfolio_chara = [] 90 | p_name = ['p_' + chr for chr in CHARAS_LIST] 91 | for chr in CHARAS_LIST: 92 | long_portfolio = df.loc[df.DATE == month].sort_values(by=chr, ascending=False).reset_index(drop=True)[:df.loc[df.DATE == month].shape[0]//10]['permno'].to_list() 93 | short_portfolio = df.loc[df.DATE == month].sort_values(by=chr, ascending=False).reset_index(drop=True)[-df.loc[df.DATE == month].shape[0]//10:]['permno'].to_list() 94 | 95 | long_charas = df.loc[df.DATE == month].set_index('permno').loc[long_portfolio][CHARAS_LIST] 96 | short_charas = df.loc[df.DATE == month].set_index('permno').loc[short_portfolio][CHARAS_LIST] 97 | 98 | mon_portfolio_chara.append([month] + (0.5*(long_charas.mean() - 
short_charas.mean())).to_list()) 99 | 100 | return pd.DataFrame(mon_portfolio_chara, index=p_name, columns=['DATE']+CHARAS_LIST) 101 | 102 | 103 | 104 | if __name__ == '__main__': 105 | # pre-process share data 106 | processed_df = Parallel(n_jobs=-1)(delayed(pre_process)(d) for d in tqdm(datashare.DATE.drop_duplicates().to_list(), colour='green', desc='Processing')) 107 | processed_df = pd.concat(processed_df) 108 | 109 | ##TODO: calculate portfolio returns (or download preprocessed data) 110 | # iter_list = list(product(datashare.DATE.drop_duplicates(), CHARAS_LIST)) 111 | # portfolio_rets = Parallel(n_jobs=-1)(delayed(cal_portfolio_ret)(it, df=processed_df) for it in tqdm(iter_list, colour='green', desc='Calculating')) 112 | # portfolio_rets = pd.DataFrame(np.array(portfolio_rets).reshape(-1, 94), index=datashare.DATE.drop_duplicates(), columns=CHARAS_LIST).reset_index() 113 | # portfolio_rets[CHARAS_LIST] = portfolio_rets[CHARAS_LIST].astype(np.float16) 114 | 115 | 116 | ##TODO: calculate portfolio characteristics (or download preprocessed data) 117 | # mon_list = pd.read_pickle('data/mon_list.pkl') 118 | # _portfolio_chara_set = Parallel(n_jobs=-1)(delayed(cal_portfolio_charas)(mon, df=processed_df) for mon in tqdm(mon_list, colour='yellow', desc='Calculating P characteristics')) 119 | # p_charas = _portfolio_chara_set[0].copy(deep=False) 120 | # for tdf in _portfolio_chara_set[1:]: 121 | # p_charas = pd.concat([p_charas, tdf]) 122 | 123 | 124 | processed_df.to_pickle('data/datashare_re.pkl') 125 | # portfolio_rets.to_pickle('data/portfolio_rets.pkl') 126 | # p_charas.to_pickle('data/p_charas.pkl') -------------------------------------------------------------------------------- /imgs/R2_pred_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/R2_pred_table.png 
-------------------------------------------------------------------------------- /imgs/R2_total_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/R2_total_table.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_5_alpha_plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_5_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_5_inference_alpha_plot.png 
-------------------------------------------------------------------------------- /imgs/alpha/FF_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_5_inference_alpha_plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/omit_char_R2_bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/omit_char_R2_bias.png -------------------------------------------------------------------------------- /imgs/pred_R2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/pred_R2.png -------------------------------------------------------------------------------- /imgs/total_R2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/total_R2.png -------------------------------------------------------------------------------- /main.py: 
# ---------------------------------------------------------------------------
# main.py: train / infer / predict every requested factor model configuration
# and store the resulting R^2 statistics under R_squares/.
# ---------------------------------------------------------------------------
import torch
from models.PCA import PCA
from models.FF import FF
from models.IPCA import IPCA
from models.CA import CA0, CA1, CA2, CA3

import gc
import argparse
import pandas as pd
import numpy as np
import time
import json
from tqdm import tqdm
from utils import *
from analysis import *
import matplotlib.pyplot as plt
from itertools import product
import os

import warnings
warnings.filterwarnings('ignore')



def model_inference_and_predict(model):
    """
    Rolling inference and prediction of non-NN models (FF/PCA/IPCA).

    The model is re-trained once per calendar year and evaluated on every
    month of that year, then refit (train/valid windows shifted).

    Returns: the inference DataFrame; also saves
    {model.name}_inference.csv and {model.name}_predict.csv in path 'results'
    (the predict file only when no characteristics are omitted).
    """
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[mon_list >= model.test_period[0]]
    inference_result = []
    predict_result = []
    # group the test months by year (dates are ints shaped yyyymmdd)
    T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')

    for g in T_bar:  # rolling train
        T_bar.set_postfix({'Year': g[0]})
        model.train_model()

        for m in g[1].to_list():
            inference_result.append(model.inference(m))  # T * N * m
            if not len(model.omit_char):
                predict_result.append(model.predict(m))
        # model refit (shift train period and valid period)
        model.refit()

    if not len(model.omit_char):
        inference_result = pd.DataFrame(inference_result, index=test_mons, columns=CHARAS_LIST)
        inference_result.to_csv(f'results/inference/{model.name}_inference.csv')
        predict_result = pd.DataFrame(predict_result, index=test_mons, columns=CHARAS_LIST)
        predict_result.to_csv(f'results/predict/{model.name}_predict.csv')

    return inference_result



def model_inference_and_predict_CA(model):
    """
    Rolling inference and prediction of NN (conditional autoencoder) models.

    Weights are reset and the model re-trained once per calendar year; a loss
    curve is saved per yearly fit, then each month of the year is inferred.

    Returns: the inference result; also saves
    {model.name}_inference.csv and {model.name}_predict.csv in path 'results'
    (only when no characteristics are omitted).
    """
    model = model.to('cuda')
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[(mon_list >= model.test_period[0])]

    if not len(model.omit_char):  # no omitted characteristics
        inference_result = pd.DataFrame()
        predict_result = pd.DataFrame()
    else:
        inference_result = []

    T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')

    stock_index = pd.Series(dtype=np.int64)
    for g in T_bar:  # rolling train, refit once a year
        T_bar.set_postfix({'Year': g[0]})

        model.reset_weight()
        model.release_gpu()
        # release GPU memory; empty_cache is called repeatedly to flush the cache
        for _ in range(6):
            torch.cuda.empty_cache()

        train_loss, val_loss = model.train_model()
        # plot the loss curves of this yearly fit
        plt.plot(train_loss, label='train_loss')
        plt.plot(val_loss, label='val_loss')
        plt.legend()
        plt.savefig(f'results/train_loss/{model.name}_loss_{g[0]}.png')
        plt.close()

        for m in g[1].to_list():
            m_stock_index, _, _, _ = model._get_item(m)
            stock_index = pd.concat([stock_index, pd.Series(m_stock_index)]).drop_duplicates().astype(int)

            if not len(model.omit_char):  # no omitted characteristics
                # move inference_R and predict_R to cpu
                inference_R = model.inference(m)  # returns (N, 1)
                inference_R = inference_R.cpu().detach().numpy()
                inference_R = pd.DataFrame(inference_R, index=m_stock_index, columns=[m])
                inference_result = pd.concat([inference_result.reset_index(drop=True), inference_R.reset_index(drop=True)], axis=1)  # (N, T)

                predict_R = model.predict(m)  # returns (N, 1)
                predict_R = predict_R.cpu().detach().numpy()
                predict_R = pd.DataFrame(predict_R, index=m_stock_index, columns=[m])
                predict_result = pd.concat([predict_result.reset_index(drop=True), predict_R.reset_index(drop=True)], axis=1)  # (N, T)

            else:
                inference_R = model.inference(m)  # (N, m), m = len(omit_char)
                inference_result.append(inference_R)  # (T, N, m)

        # refit: shift train period and valid period forward
        model.refit()

    if not len(model.omit_char):
        inference_result = pd.DataFrame(inference_result.values.T, index=test_mons, columns=CHARAS_LIST)
        inference_result.to_csv(f'results/inference/{model.name}_inference.csv')

        predict_result = pd.DataFrame(predict_result.values.T, index=test_mons, columns=CHARAS_LIST)
        predict_result.to_csv(f'results/predict/{model.name}_predict.csv')

    # GC: release RAM memory (model) before the next configuration
    del model
    gc.collect()
    return inference_result



def git_push(msg):
    """Commit the R_squares directory and push (best-effort, via the shell)."""
    os.system('git add R_squares')
    os.system(f'git commit -m "{msg}"')
    os.system('git push')



def model_selection(model_type, model_K, omit_char=[]):
    """
    Instantiate one model configuration.

    Returns a dict {'name', 'omit_char', 'model'}. FF ignores omit_char since
    it does not consume stock characteristics.
    """
    assert model_type in ['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3'], f'No Such Model: {model_type}'

    if model_type == 'FF':
        return {
            'name': f'FF_{model_K}',
            'omit_char': [],
            'model': FF(K=model_K)
        }

    elif model_type == 'PCA':
        return {
            'name': f'PCA_{model_K}',
            'omit_char': omit_char,
            'model': PCA(K=model_K, omit_char=omit_char)
        }

    elif model_type == 'IPCA':
        return {
            'name': f'IPCA_{model_K}',
            'omit_char': omit_char,
            'model': IPCA(K=model_K, omit_char=omit_char)
        }

    elif model_type == 'CA0':
        return {
            'name': f'CA0_{model_K}',
            'omit_char': omit_char,
            'model': CA0(hidden_size=model_K, lr=CA_LR, omit_char=omit_char)
        }

    elif model_type == 'CA1':
        return {
            'name': f'CA1_{model_K}',
            'omit_char': omit_char,
            'model': CA1(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
        }

    elif model_type == 'CA2':
        return {
            'name': f'CA2_{model_K}',
            'omit_char': omit_char,
            'model': CA2(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
        }

    else:
        return {
            'name': f'CA3_{model_K}',
            'omit_char': omit_char,
            'model': CA3(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
        }



if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--Model', type=str, default='FF PCA IPCA CA0 CA1 CA2 CA3')
    parser.add_argument('--K', type=str, default='1 2 3 4 5 6')
    parser.add_argument('--omit_char', type=str, default='')

    args = parser.parse_args()

    # ensure all output directories exist (idempotent)
    for d in ['results', 'results/train_loss', 'results/inference', 'results/predict', 'imgs']:
        os.makedirs(d, exist_ok=True)

    models_name = []
    R_square = []
    for g in product(args.Model.split(' '), args.K.split(' ')):
        omit_chars = args.omit_char.split(' ') if len(args.omit_char) > 0 else []

        model = model_selection(g[0], int(g[1]), omit_chars)

        # BUG FIX: time.gmtime() is UTC, so the offset label is +0000
        # (the original labelled UTC output as +0800)
        print(f"{time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())} | Model: {model['name']} | {omit_chars}")
        print('name : ', model['name'])
        models_name.append(model['name'])

        if model['name'].split('_')[0][:-1] == 'CA':  # 'CA0'..'CA3' -> 'CA'
            print('model_inference_and_predict_CA')
            # if omit_char is non-empty, inf_ret has shape (T, N, m)
            inf_ret = model_inference_and_predict_CA(model['model'])
        else:
            inf_ret = model_inference_and_predict(model['model'])

        gc.collect()

        # Save total R^2
        if not len(model['omit_char']):
            R_square.append(calculate_R2(model['model'], 'inference'))
            alpha_plot(model['model'], 'inference', save_dir='imgs')
            # alpha_plot(model['model'], 'predict', save_dir='alpha_imgs')
        else:
            inf_ret = np.array(inf_ret)
            for i in range(len(model['omit_char'])):
                inference_r = inf_ret[:, :, i]   # T * N, characteristic i omitted
                complete_r = inf_ret[:, :, -1]   # T * N, nothing omitted (last slice)
                R_square.append(calculate_R2(None, None, inference_r, complete_r))

        del model

    # save R_square to a timestamped json file
    time_str = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime())
    filename = f"R_squares/{time_str}.json"
    obj = {
        "models": models_name,
        'omit_char': args.omit_char.split(' '),
        "R2_total": R_square,
    }

    with open(filename, "w") as out_file:
        json.dump(obj, out_file)

    # git push
    # git_push(f"Run main.py")


# ---------------------------------------------------------------------------
# models/CA.py (visible portion): conditional autoencoder base class.
# The `inference` method is truncated in this chunk and therefore omitted.
# ---------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
import collections
from .modelBase import modelBase
from utils import CHARAS_LIST

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset


MAX_EPOCH = 200

class CA_base(nn.Module, modelBase):
    """Shared machinery for the CA0..CA3 conditional autoencoders:
    data loading, training loop with early stopping, and beta/factor heads
    (the concrete networks are assigned by the subclasses)."""

    def __init__(self, name, omit_char=None, device='cuda'):
        nn.Module.__init__(self)
        modelBase.__init__(self, name)
        self.beta_nn = None
        self.factor_nn = None
        self.optimizer = None
        self.criterion = None
        # BUG FIX: avoid the mutable default argument [] shared across instances
        self.omit_char = [] if omit_char is None else omit_char

        self.factor_nn_pred = []

        self.device = device

        self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64)
        self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index()
        self.portfolio_ret = pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64)
        self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64)

        self.train_dataloader = None
        self.valid_dataloader = None
        self.test_dataloader = None


    def debug(self, month):
        """Print the beta-network input of one month (debug helper)."""
        beta_nn_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST]
        # beta_nn_input = self.datashare_chara.loc[self.datashare_chara['DATE'] == month].set_index('permno')[charas]
        print(beta_nn_input)


    def _get_item(self, month):
        """Fetch one month of aligned model inputs.

        Returns (portfolio index L, beta_nn_input 94*94 = P*N,
        factor_nn_input 94*1 = P*1, labels (94,) = (N,)).
        """
        if month not in self.p_charas['DATE'].values:
            # fall back to the closest available month in p_charas
            month = self.p_charas['DATE'].values[np.argmin(np.abs(self.p_charas['DATE'].values - month))]

        beta_nn_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST]  # (94, 94)
        labels = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST].T.values  # (94, 1)
        beta_nn_input['ret-rf'] = labels
        align_df = beta_nn_input.copy(deep=False).dropna()

        factor_nn_input = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST]

        # abort if there is any nan left in align_df
        if align_df.isnull().values.any():
            assert False, f'There is nan in align_df of : {month}'
        return align_df.index, align_df.values[:, :-1].T, factor_nn_input.T.values, align_df.values[:, -1].T


    def dataloader(self, period):
        """Build a shuffled batch-size-1 DataLoader over the months in `period`."""
        mon_list = pd.read_pickle('data/mon_list.pkl')
        mon_list = mon_list.loc[(mon_list >= period[0]) & (mon_list <= period[1])]
        beta_nn_input_set = []
        factor_nn_input_set = []
        label_set = []
        for mon in mon_list:
            _, _beta_input, _factor_input, label = self._get_item(mon)
            beta_nn_input_set.append(_beta_input)
            factor_nn_input_set.append(_factor_input)
            label_set.append(label)

        # stack to one ndarray first: torch.tensor(list-of-ndarrays) is very slow
        beta_nn_input_set = torch.tensor(np.array(beta_nn_input_set), dtype=torch.float32).to(self.device)
        factor_nn_input_set = torch.tensor(np.array(factor_nn_input_set), dtype=torch.float32).to(self.device)
        label_set = torch.tensor(np.array(label_set), dtype=torch.float32).to(self.device)

        dataset = TensorDataset(beta_nn_input_set, factor_nn_input_set, label_set)
        return DataLoader(dataset, batch_size=1, shuffle=True)


    def forward(self, char, pfret):
        """r_hat = sum_k beta_nn(char)_k * factor_nn(pfret)_k along the factor dim."""
        processed_char = self.beta_nn(char)
        processed_pfret = self.factor_nn(pfret)
        return torch.sum(processed_char * processed_pfret, dim=1)


    # train_one_epoch
    def __train_one_epoch(self):
        epoch_loss = 0.0
        for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.train_dataloader):
            self.optimizer.zero_grad()
            # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
            # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
            # labels reshape: (1, 94) -> (94, ) (1*N => N,)
            beta_nn_input = beta_nn_input.squeeze(0).T
            factor_nn_input = factor_nn_input.squeeze(0).T
            labels = labels.squeeze(0)
            output = self.forward(beta_nn_input, factor_nn_input)
            loss = self.criterion(output, labels)

            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()

        return epoch_loss / len(self.train_dataloader)


    def __valid_one_epoch(self):
        epoch_loss = 0.0
        for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.valid_dataloader):
            # same reshapes as training (see __train_one_epoch)
            beta_nn_input = beta_nn_input.squeeze(0).T
            factor_nn_input = factor_nn_input.squeeze(0).T
            labels = labels.squeeze(0)

            output = self.forward(beta_nn_input, factor_nn_input)
            loss = self.criterion(output, labels)
            epoch_loss += loss.item()

        return epoch_loss / len(self.valid_dataloader)


    def train_model(self):
        """Train with early stopping on validation loss.

        Saves the best checkpoint to saved_models/<name>.pt, reloads it at the
        end, and returns (train_loss_history, valid_loss_history).
        """
        if 'saved_models' not in os.listdir('./'):
            os.mkdir('saved_models')

        self.train_dataloader = self.dataloader(self.train_period)
        self.valid_dataloader = self.dataloader(self.valid_period)
        self.test_dataloader = self.dataloader(self.test_period)

        min_error = np.inf  # np.Inf was removed in NumPy 2.0
        no_update_steps = 0
        valid_loss = []
        train_loss = []
        for i in range(MAX_EPOCH):
            self.train()
            train_error = self.__train_one_epoch()
            train_loss.append(train_error)

            self.eval()
            # valid and early stop
            with torch.no_grad():
                valid_error = self.__valid_one_epoch()

            valid_loss.append(valid_error)
            if valid_error < min_error:
                min_error = valid_error
                no_update_steps = 0
                # save model
                torch.save(self.state_dict(), f'./saved_models/{self.name}.pt')
            else:
                no_update_steps += 1

            if no_update_steps > 2:  # early stop: 3 consecutive epochs without improvement
                print(f'Early stop at epoch {i}')
                break
        # load from (best) saved model
        self.load_state_dict(torch.load(f'./saved_models/{self.name}.pt'))
        return train_loss, valid_loss


    def test_model(self):
        """Run a single batch from the test dataloader and report its loss.

        Returns (output, labels) of that batch.
        """
        output = None
        labels = None
        # BUG FIX: the original iterated `for i, a, b, c in enumerate(...)`,
        # which raises ValueError (enumerate yields (i, batch) pairs), and it
        # skipped the reshape applied everywhere else.
        for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.test_dataloader):
            beta_nn_input = beta_nn_input.squeeze(0).T
            factor_nn_input = factor_nn_input.squeeze(0).T
            labels = labels.squeeze(0)
            output = self.forward(beta_nn_input, factor_nn_input)
            break

        loss = self.criterion(output, labels)
        print(f'Test loss: {loss.item()}')
        print(f'Predicted: {output}')
        print(f'Ground truth: {labels}')
        return output, labels


    def calBeta(self, month, skip_char=[]):
        """Return the beta network output N*K for one month; characteristics in
        skip_char are zeroed out before the forward pass."""
        _, beta_nn_input, _, _ = self._get_item(month)  # beta input: 94*94 = P*N

        # if some variables need to be omitted
        if len(skip_char):
            beta_nn_input = pd.DataFrame(beta_nn_input.T, columns=CHARAS_LIST)  # N*P
            beta_nn_input[skip_char] = beta_nn_input[skip_char] * 0.0
            beta_nn_input = beta_nn_input.values.T  # P*N

        beta_nn_input = torch.tensor(beta_nn_input, dtype=torch.float32).T.to(self.device)  # N*P
        return self.beta_nn(beta_nn_input)  # N*K


    def calFactor(self, month, skip_char=[]):
        """Return the factor network output K*1 for one month; characteristics
        in skip_char are zeroed out. The prediction is also recorded in
        self.factor_nn_pred for later use."""
        _, _, factor_nn_input, _ = self._get_item(month)  # factor input: P*1

        # if some variables need to be omitted
        if len(skip_char):
            factor_nn_input = pd.DataFrame(factor_nn_input.T, columns=CHARAS_LIST)  # 1*P
            factor_nn_input[skip_char] = factor_nn_input[skip_char] * 0.0
            factor_nn_input = factor_nn_input.values.T  # P*1

        factor_nn_input = torch.tensor(factor_nn_input, dtype=torch.float32).T.to(self.device)  # 1*P
        factor_pred = self.factor_nn(factor_nn_input).T  # K*1

        self.factor_nn_pred.append(factor_pred)

        return factor_pred  # K*1
228 | 229 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 230 | 231 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}" 232 | 233 | # R_{N*1} = Beta_{N*K} @ F_{K*1} 234 | return mon_beta @ mon_factor 235 | else: 236 | ret_R = [] 237 | for char in self.omit_char: 238 | mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char]) 239 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # N*1 240 | 241 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 242 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # also add complete result 243 | 244 | return np.array(ret_R).squeeze(2).T # N*m 245 | 246 | 247 | def cal_delayed_Factor(self, month): 248 | # calculate the last day of the previous month 249 | if self.refit_cnt == 0: 250 | avg_f_pred = self.factor_nn_pred[0] # input of the first predict take hat{f}_t 251 | # print(avg_f_pred.shape) 252 | else: 253 | avg_f_pred = torch.mean(torch.stack(self.factor_nn_pred[:self.refit_cnt]), dim=0) 254 | 255 | return avg_f_pred 256 | 257 | 258 | def reset_weight(self): 259 | for layer in self.beta_nn: # reset beta_nn parameters 260 | if hasattr(layer, 'reset_parameters'): 261 | layer.reset_parameters() 262 | 263 | for layer in self.factor_nn: # reset factor_nn parameters 264 | if hasattr(layer, 'reset_parameters'): 265 | layer.reset_parameters() 266 | 267 | self.optimizer.state = collections.defaultdict(dict) # reset optimizer state 268 | 269 | 270 | def release_gpu(self): 271 | if self.train_dataloader is not None: 272 | del self.train_dataloader 273 | if self.valid_dataloader is not None: 274 | del self.valid_dataloader 275 | if self.test_dataloader is not None: 276 | del self.test_dataloader 277 | torch.cuda.empty_cache() 278 | 279 | 280 | 281 | class CA0(CA_base): 282 | def __init__(self, hidden_size, lr=0.001, omit_char=[], device='cuda'): 283 | 
CA_base.__init__(self, name=f'CA0_{hidden_size}', omit_char=omit_char, device=device) 284 | # P -> K 285 | self.beta_nn = nn.Sequential( 286 | # output layer 287 | nn.Linear(94, hidden_size) 288 | ) 289 | self.factor_nn = nn.Sequential( 290 | nn.Linear(94, hidden_size) 291 | ) 292 | 293 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 294 | self.criterion = nn.MSELoss().to(device) 295 | 296 | 297 | 298 | class CA1(CA_base): 299 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'): 300 | CA_base.__init__(self, name=f'CA1_{hidden_size}', omit_char=omit_char, device=device) 301 | self.dropout = dropout 302 | # P -> 32 -> K 303 | self.beta_nn = nn.Sequential( 304 | # hidden layer 1 305 | nn.Linear(94, 32), 306 | nn.BatchNorm1d(32), 307 | nn.ReLU(), 308 | nn.Dropout(self.dropout), 309 | # output layer 310 | nn.Linear(32, hidden_size) 311 | ) 312 | self.factor_nn = nn.Sequential( 313 | nn.Linear(94, hidden_size) 314 | ) 315 | 316 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 317 | self.criterion = nn.MSELoss().to(device) 318 | 319 | 320 | 321 | class CA2(CA_base): 322 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'): 323 | CA_base.__init__(self, name=f'CA2_{hidden_size}', omit_char=omit_char, device=device) 324 | self.dropout = dropout 325 | # P -> 32 -> 16 -> K 326 | self.beta_nn = nn.Sequential( 327 | # hidden layer 1 328 | nn.Linear(94, 32), 329 | nn.BatchNorm1d(32), 330 | nn.ReLU(), 331 | nn.Dropout(self.dropout), 332 | # hidden layer 2 333 | nn.Linear(32, 16), 334 | nn.BatchNorm1d(16), 335 | nn.ReLU(), 336 | nn.Dropout(self.dropout), 337 | # output layer 338 | nn.Linear(16, hidden_size) 339 | ) 340 | self.factor_nn = nn.Sequential( 341 | nn.Linear(94, hidden_size) 342 | ) 343 | 344 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 345 | self.criterion = nn.MSELoss().to(device) 346 | 347 | 348 | 349 | class CA3(CA_base): 350 | def __init__(self, hidden_size, 
dropout=0.5, lr=0.001, omit_char=[], device='cuda'): 351 | CA_base.__init__(self, name=f'CA3_{hidden_size}', omit_char=omit_char, device=device) 352 | self.dropout = dropout 353 | # P -> 32 -> 16 -> 8 -> K 354 | self.beta_nn = nn.Sequential( 355 | # hidden layer 1 356 | nn.Linear(94, 32), 357 | nn.BatchNorm1d(32), 358 | nn.ReLU(), 359 | nn.Dropout(self.dropout), 360 | # hidden layer 2 361 | nn.Linear(32, 16), 362 | nn.BatchNorm1d(16), 363 | nn.ReLU(), 364 | nn.Dropout(self.dropout), 365 | # hidden layer 3 366 | nn.Linear(16, 8), 367 | nn.BatchNorm1d(8), 368 | nn.ReLU(), 369 | nn.Dropout(self.dropout), 370 | # output layer 371 | nn.Linear(8, hidden_size) 372 | ) 373 | self.factor_nn = nn.Sequential( 374 | nn.Linear(94, hidden_size) 375 | ) 376 | 377 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=0.01) 378 | self.criterion = nn.MSELoss().to(device) -------------------------------------------------------------------------------- /models/FF.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | from utils import CHARAS_LIST 5 | from .modelBase import modelBase 6 | 7 | import pandas as pd 8 | import statsmodels.api as sm 9 | from dateutil.relativedelta import relativedelta 10 | 11 | 12 | class FF(modelBase): 13 | def __init__(self, K): 14 | super(FF, self).__init__(f'FF_{K}') 15 | self.K = K 16 | self.train_period[0] = 19630731 # ff5 data from FF website is only available from 196307 17 | self.omit_char = [] 18 | self.__prepare_FFf() 19 | 20 | 21 | def __prepare_FFf(self): 22 | ff5 = pd.read_csv('data/ff5.csv', index_col=0) 23 | UMD = pd.read_csv('data/UMD.csv', index_col=0) 24 | UMD.columns = ['UMD'] 25 | FFf = pd.concat([ff5, UMD.loc[196307:]], axis=1) 26 | self.fname = ['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD'] 27 | self.FFf = FFf[self.fname] 28 | self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 29 | self.portfolio_ret['DATE'] = 
self.portfolio_ret['DATE'].apply(lambda x: x//100) 30 | 31 | 32 | def train_model(self): 33 | self.beta_matrix = [] 34 | X = self.FFf[self.fname[:self.K]].loc[self.train_period[0]//100:self.train_period[1]//100] 35 | for col in CHARAS_LIST: 36 | y = self.portfolio_ret.set_index('DATE')[col].loc[self.train_period[0]//100:self.train_period[1]//100] 37 | model = sm.OLS(y.values, X.values).fit() 38 | self.beta_matrix.append(model.params) 39 | self.beta_matrix = pd.DataFrame(self.beta_matrix, columns=self.fname[:self.K], index=CHARAS_LIST) 40 | 41 | 42 | def calBeta(self, month): # beta is time invariant 43 | return self.beta_matrix # N * K 44 | 45 | 46 | def calFactor(self, month): 47 | return self.FFf[self.fname[:self.K]].loc[month//100] # K * 1 48 | 49 | 50 | def cal_delayed_Factor(self, month): 51 | last_mon = int(str(pd.to_datetime(str(month)) - relativedelta(months=1)).split(' ')[0].replace('-', '')[:-2]) 52 | # return average of prevailing sample hat{f} (from 198701) up to t-1 53 | return self.FFf[self.fname[:self.K]].loc[198701:last_mon].mean() 54 | 55 | -------------------------------------------------------------------------------- /models/IPCA.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import sys 5 | sys.path.append('../') 6 | 7 | from utils import * 8 | from .modelBase import modelBase 9 | 10 | 11 | class IPCA(modelBase): 12 | def __init__(self, K, omit_char=[]): 13 | super(IPCA, self).__init__(f'IPCA_{K}') 14 | self.K = K 15 | self.omit_char = omit_char 16 | np.random.seed(10) 17 | self.gamma = np.random.random([94, self.K]) # P = 94, we have total 94 characteristics 18 | self.valid_error = [] 19 | self.__prepare_data() 20 | 21 | 22 | def __prepare_data(self): 23 | self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 24 | self.p_charas = pd.read_pickle('data/p_charas.pkl') 25 | self.mon_list = pd.read_pickle('data/mon_list.pkl') 26 | 27 | 28 | def 
__valid(self): 29 | MSE_set = [] 30 | for mon in self.mon_list[(self.mon_list >= self.valid_period[0]) & (self.mon_list <= self.valid_period[1])]: 31 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P 32 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1 33 | beta = Z @ self.gamma # N * K 34 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 35 | residual = y - beta @ f_hat 36 | MSE = np.sum(residual**2) 37 | MSE_set.append(MSE) 38 | 39 | valid_error = sum(MSE_set) 40 | self.valid_error.append(valid_error) 41 | 42 | return valid_error 43 | 44 | 45 | def __gamma_iter(self, gamma_old): 46 | numer = np.zeros((94*self.K, 1)) 47 | denom = np.zeros((94*self.K, 94*self.K)) 48 | for mon in self.mon_list[(self.mon_list >= self.train_period[0]) & (self.mon_list <= self.train_period[1])]: 49 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P 50 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1 51 | beta = Z @ gamma_old # N * K 52 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 53 | numer += (np.kron(f_hat, Z.T) @ y) 54 | denom += (np.kron(f_hat, Z.T) @ np.kron(f_hat.T, Z)) 55 | 56 | gamma_new = (np.linalg.pinv(denom) @ numer).reshape(self.K, 94) 57 | gamma_new = gamma_new.T 58 | 59 | return gamma_new 60 | 61 | 62 | def train_model(self): 63 | update_cnt = 0 64 | min_valid_err = np.Inf 65 | best_gamma = np.zeros((94, self.K)) 66 | while update_cnt < 5: 67 | self.gamma = self.__gamma_iter(self.gamma) 68 | valid_error = self.__valid() 69 | if valid_error < min_valid_err: 70 | min_valid_err = valid_error 71 | best_gamma = self.gamma 72 | update_cnt = 0 73 | else: 74 | update_cnt += 1 75 | 76 | self.gamma = best_gamma 77 | 78 | 79 | def inference(self, month): 80 | if not len(self.omit_char): 81 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].values # N * P 82 | y = 
self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].values.T # N * 1 83 | beta = Z @ self.gamma # N * K 84 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 85 | return (beta @ f_hat).flatten() # N, 1 86 | else: 87 | inference_R = [] 88 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].copy(deep=False) 89 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].copy(deep=False) 90 | 91 | for char in self.omit_char: 92 | Z_input = Z.copy(deep=False) 93 | y_input = y.copy(deep=False) 94 | Z_input[[char]] = Z_input[[char]] * 0.0 95 | y_input[[char]] = y_input[[char]] * 0.0 96 | Z_input = Z_input.values 97 | y_input = y_input.values.T 98 | beta = Z_input @ self.gamma 99 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y_input) # K * 1 100 | inference_R.append((beta @ f_hat).flatten()) # m * N 101 | 102 | Z_input = Z.values 103 | y_input = y.values.T 104 | beta = Z_input @ self.gamma 105 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y_input) # K * 1 106 | inference_R.append((beta @ f_hat).flatten()) # m * N 107 | 108 | return np.array(inference_R).T # N * m 109 | 110 | 111 | def predict(self, month): 112 | if self.refit_cnt == 0: 113 | return self.inference(month) 114 | 115 | lag_f_hat = [] 116 | for mon in self.mon_list[(self.mon_list >= 19870101) & (self.mon_list < month)]: 117 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P 118 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1 119 | beta = Z @ self.gamma # N * K 120 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 121 | lag_f_hat.append(f_hat) 122 | 123 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].values # N * P 124 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].values.T # N * 1 125 | beta = Z @ self.gamma # N * K 126 | 127 | # return average of prevailing sample hat{f} (from 198701) up to 
t-1 128 | avg_lag_f = np.mean(lag_f_hat, axis=0) 129 | return beta @ avg_lag_f -------------------------------------------------------------------------------- /models/PCA.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from utils import * 8 | from .modelBase import modelBase 9 | 10 | 11 | def stock_R_matrix(start_date, end_date): 12 | R_matrix = pd.read_pickle('data/stock_R_matrix.pkl') 13 | return R_matrix.T.loc[start_date: end_date].T 14 | 15 | def portfolio_R_matrix(start_date, end_date): 16 | portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 17 | return portfolio_ret.loc[(portfolio_ret['DATE'] >= start_date) & (portfolio_ret['DATE'] <= end_date)].set_index('DATE').T 18 | 19 | 20 | 21 | class PCA(modelBase): 22 | def __init__(self, K, omit_char=[]): 23 | super(PCA, self).__init__(f'PCA_{K}') 24 | self.K = K 25 | self.omit_char = omit_char 26 | self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 27 | 28 | def train_model(self): 29 | pr = self.portfolio_ret.loc[(self.portfolio_ret.DATE >= self.train_period[0]) & (self.portfolio_ret.DATE <= self.train_period[1])][CHARAS_LIST].values 30 | pr = pr - np.mean(pr, axis=0) # col demean 31 | ret_cov_matrix = np.zeros((pr.shape[1], pr.shape[1])) 32 | 33 | for i in range(pr.shape[0]): # Sum of y^t @ y^t.T 34 | ret_cov_matrix += (pr[i, :].reshape(-1, 1) @ pr[i, :].reshape(-1, 1).T) 35 | ret_cov_matrix = ret_cov_matrix/(pr.shape[0]*pr.shape[1]) # N * N 36 | 37 | eigVal, eigVec = np.linalg.eig(ret_cov_matrix) 38 | sorted_indices = np.argsort(eigVal) 39 | self.beta = eigVec[:,sorted_indices[:-self.K-1:-1]] # Beta: N * K 40 | 41 | 42 | def calBeta(self, month): 43 | return np.real(self.beta) 44 | 45 | 46 | def calFactor(self, month): 47 | tr = self.portfolio_ret.loc[self.portfolio_ret.DATE <= month].iloc[-1][CHARAS_LIST].values 48 | tr = tr - np.mean(tr, axis=0) # col demean 
49 | # print(tr) 50 | factor = np.array((np.matrix(self.beta.T @ self.beta).I @ self.beta.T) @ tr.T).T # K * 1 51 | return np.real(factor.flatten()) 52 | 53 | 54 | def cal_delayed_Factor(self, month): 55 | if self.refit_cnt == 0: 56 | return self.calFactor(month) 57 | 58 | tr = self.portfolio_ret.loc[(self.portfolio_ret.DATE >= 19870101) & (self.portfolio_ret.DATE < month)][CHARAS_LIST].values 59 | tr = tr - np.mean(tr, axis=0) # col demean 60 | 61 | # return average of prevailing sample hat{f} (from 198701) up to t-1 62 | factors = [] 63 | for i in range(tr.shape[0]): 64 | factors.append(np.array(np.matrix(self.beta.T @ self.beta).I @ self.beta.T @ tr[i, :])) 65 | 66 | factors = np.array(factors).squeeze(1).T # K * T 67 | avg_delay_f = np.mean(factors, axis=1).reshape(-1, 1) # K * 1 68 | 69 | return np.real(avg_delay_f.flatten()) 70 | -------------------------------------------------------------------------------- /models/modelBase.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime 4 | from dateutil.relativedelta import relativedelta 5 | 6 | class modelBase: 7 | def __init__(self, name): 8 | self.name = name 9 | self.train_idx = 0 10 | self.refit_cnt = 0 11 | 12 | # initial train, valid and test periods are default accroding to original paper 13 | self.train_period = [19570101, 19741231] 14 | self.valid_period = [19750101, 19861231] 15 | self.test_period = [19870101, 19871231] 16 | 17 | 18 | def train_model(self): 19 | # print('trained') 20 | pass 21 | 22 | 23 | def calBeta(self, month): 24 | """ 25 | Calculate specific month's beta. Should be specified by different models 26 | -> return np.array, dim = (N, K) 27 | """ 28 | # return np.zeros([13000, 3]) 29 | pass 30 | 31 | 32 | def calFactor(self, month): 33 | """ 34 | Calculate specific month's factor. 
Should be specified by different models 35 | -> return np.array, dim = (K, 1) 36 | """ 37 | # return np.zeros([3, 1]) 38 | pass 39 | 40 | 41 | def cal_delayed_Factor(self, month): 42 | """ 43 | Calculate delayed month's factor, i.e. mean average of factors up to t-1. Should be specified by different models 44 | -> return np.array, dim = (K, 1) 45 | """ 46 | pass 47 | 48 | 49 | def inference(self, month): 50 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}" 51 | 52 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 53 | 54 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}" 55 | 56 | # R_{N*1} = Beta_{N*K} @ F_{K*1} 57 | return mon_beta @ mon_factor 58 | 59 | 60 | def predict(self, month): 61 | assert month >= self.test_period[0] and month <= self.test_period[1], f"Month error, {month} is not in test period {self.test_period}" 62 | 63 | lag_factor, mon_beta = self.cal_delayed_Factor(month), self.calBeta(month) 64 | 65 | assert mon_beta.shape[1] == lag_factor.shape[0], f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}" 66 | 67 | # R_{N*1} = Beta_{N*K} @ lag_F_avg{K*1} 68 | return mon_beta @ lag_factor 69 | 70 | 71 | def refit(self): 72 | # self.train_period[1] += 10000 # method in original paper: increase training size by one year each time refit 73 | self.train_period = (pd.Series(self.train_period) + 10000).to_list() # rolling training 74 | self.valid_period = (pd.Series(self.valid_period) + 10000).to_list() 75 | self.test_period = (pd.Series(self.test_period) + 10000).to_list() 76 | self.refit_cnt += 1 77 | 78 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # # stock-level characteristics with index 
# corresponding to original paper
# annual_chara = {
#     'absacc': 1, 'acc': 2, 'age': 4, 'agr': 5, 'bm': 9,
#     'bm_ia': 10, 'cashdebt': 12, 'cashpr': 13, 'cfp': 14, 'cfp_ia': 15,
#     'chatoia': 16, 'chcsho': 17, 'chempia': 18, 'chinv': 19, 'chpmia': 21,
#     'convind': 24, 'currat': 25, 'depr': 26, 'divi': 27, 'divo': 28,
#     'dy': 30, 'egr': 32, 'ep': 33, 'gma': 34, 'grcapx': 35,
#     'grltnoa': 36, 'herf': 37, 'hire': 38, 'invest': 42, 'lev': 43,
#     'lgr': 44, 'mve_ia': 52, 'operprof': 54, 'orgcap': 55, 'pchcapx_ia': 56,
#     'pchcurrat': 57, 'pchdepr': 58, 'pchgm_pchsale': 59, 'pchquick': 60, 'pchsale_pchinvt': 61,
#     'pchsale_pchrect': 62, 'pchsale_pchxsga': 63, 'pchsaleinv': 64, 'pctacc': 65, 'ps': 67,
#     'quick': 68, 'rd': 69, 'rd_mve': 70, 'rd_sale': 71, 'realestate': 72,
#     'roic': 77, 'salecash': 79, 'saleinv': 80, 'salerec': 81, 'secured': 82,
#     'securedind': 83, 'sgr': 84, 'sin': 85, 'sp': 86, 'tang': 91, 'tb': 92
# }

# quarter_chara = {
#     'aeavol': 3, 'cash': 11, 'chtx': 22, 'cinvest': 23,
#     'ear': 31, 'ms': 50, 'nincr': 53, 'roaq': 74,
#     'roavol': 75, 'roeq': 76, 'rsup': 78, 'stdacc': 89, 'stdcf': 90
# }

# month_chara = {
#     'baspread': 6, 'beta': 7, 'betasq': 8, 'chmom': 20,
#     'dolvol': 29, 'idiovol': 39, 'ill': 40, 'indmom': 41,
#     'maxret': 45, 'mom12m': 46, 'mom1m': 47, 'mom36m': 48,
#     'mom6m': 49, 'mvel1': 51, 'pricedelay': 66, 'retvol': 73,
#     'std_dolvol': 87, 'std_turn': 88, 'turn': 93, 'zerotrade': 94
# }

# The 94 characteristics, ordered annual / quarterly / monthly (see tables above).
CHARAS_LIST = [
    # annual
    'absacc', 'acc', 'age', 'agr', 'bm', 'bm_ia', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia',
    'chatoia', 'chcsho', 'chempia', 'chinv', 'chpmia', 'convind', 'currat', 'depr', 'divi', 'divo',
    'dy', 'egr', 'ep', 'gma', 'grcapx', 'grltnoa', 'herf', 'hire', 'invest', 'lev',
    'lgr', 'mve_ia', 'operprof', 'orgcap', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 'pchgm_pchsale', 'pchquick', 'pchsale_pchinvt',
    'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'ps', 'quick', 'rd', 'rd_mve', 'rd_sale', 'realestate',
    'roic', 'salecash', 'saleinv', 'salerec', 'secured', 'securedind', 'sgr', 'sin', 'sp', 'tang', 'tb',
    # quarterly
    'aeavol', 'cash', 'chtx', 'cinvest', 'ear', 'ms', 'nincr', 'roaq', 'roavol', 'roeq',
    'rsup', 'stdacc', 'stdcf',
    # monthly
    'baspread', 'beta', 'betasq', 'chmom', 'dolvol', 'idiovol', 'ill', 'indmom', 'maxret', 'mom12m',
    'mom1m', 'mom36m', 'mom6m', 'mvel1', 'pricedelay', 'retvol', 'std_dolvol', 'std_turn', 'turn', 'zerotrade',
]


# default hyper-parameters of the CA models
CA_DR = 0.5  # drop out rate
CA_LR = 0.001  # learning rate

# out of sample period
OOS_start = 19870101
OOS_end = 20161231


class HiddenPrints:
    """Context manager that silences stdout while active (when `activated`)."""

    def __init__(self, activated=True):
        # `activated` lets callers toggle suppression without changing call sites
        self.activated = activated
        self.original_stdout = None

    def close(self):
        # stash the real stdout and swap in a devnull handle
        self.original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def open(self):
        # close the devnull handle and restore the stashed stdout
        sys.stdout.close()
        sys.stdout = self.original_stdout

    def __enter__(self):
        if self.activated:
            self.close()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.activated:
            self.open()


def git_push(message):
    """Commit the results directory and push, prefixing the commit message."""
    os.system('git add results')
    os.system(f'git commit -m "no_dropout: {message}"')
    os.system('git push')