├── .gitignore ├── .idea ├── .gitignore ├── Autoencoder-Asset-Pricing-Models.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── README.md ├── R_squares ├── 2023-06-09_05-24-47.json ├── 2023-06-09_06-07-56.json ├── 2023-06-09_09-08-26.json └── 2023-06-11_18-16-38.json ├── analysis.py ├── data_prepare.py ├── imgs ├── R2_pred_table.png ├── R2_total_table.png ├── alpha │ ├── CA0_1_inference_alpha_plot.png │ ├── CA0_2_inference_alpha_plot.png │ ├── CA0_3_inference_alpha_plot.png │ ├── CA0_4_inference_alpha_plot.png │ ├── CA0_5_alpha_plot.png │ ├── CA0_5_inference_alpha_plot.png │ ├── CA0_6_inference_alpha_plot.png │ ├── CA1_1_inference_alpha_plot.png │ ├── CA1_2_inference_alpha_plot.png │ ├── CA1_3_inference_alpha_plot.png │ ├── CA1_4_inference_alpha_plot.png │ ├── CA1_5_inference_alpha_plot.png │ ├── CA1_6_inference_alpha_plot.png │ ├── CA2_1_inference_alpha_plot.png │ ├── CA2_2_inference_alpha_plot.png │ ├── CA2_3_inference_alpha_plot.png │ ├── CA2_4_inference_alpha_plot.png │ ├── CA2_5_inference_alpha_plot.png │ ├── CA2_6_inference_alpha_plot.png │ ├── CA3_1_inference_alpha_plot.png │ ├── CA3_2_inference_alpha_plot.png │ ├── CA3_3_inference_alpha_plot.png │ ├── CA3_4_inference_alpha_plot.png │ ├── CA3_5_inference_alpha_plot.png │ ├── CA3_6_inference_alpha_plot.png │ ├── FF_1_inference_alpha_plot.png │ ├── FF_2_inference_alpha_plot.png │ ├── FF_3_inference_alpha_plot.png │ ├── FF_4_inference_alpha_plot.png │ ├── FF_5_inference_alpha_plot.png │ ├── FF_6_inference_alpha_plot.png │ ├── IPCA_1_inference_alpha_plot.png │ ├── IPCA_2_inference_alpha_plot.png │ ├── IPCA_3_inference_alpha_plot.png │ ├── IPCA_4_inference_alpha_plot.png │ ├── IPCA_5_inference_alpha_plot.png │ ├── IPCA_6_inference_alpha_plot.png │ ├── PCA_1_inference_alpha_plot.png │ ├── PCA_2_inference_alpha_plot.png │ ├── PCA_3_inference_alpha_plot.png │ ├── PCA_4_inference_alpha_plot.png │ ├── PCA_5_inference_alpha_plot.png │ ├── PCA_6_inference_alpha_plot.png │ 
├── seq2seq1_1_inference_alpha_plot.png │ ├── seq2seq1_2_inference_alpha_plot.png │ ├── seq2seq1_3_inference_alpha_plot.png │ ├── seq2seq1_4_inference_alpha_plot.png │ ├── seq2seq1_5_inference_alpha_plot.png │ ├── seq2seq2_1_inference_alpha_plot.png │ ├── seq2seq2_2_inference_alpha_plot.png │ ├── seq2seq2_3_inference_alpha_plot.png │ ├── seq2seq2_4_inference_alpha_plot.png │ ├── seq2seq2_5_inference_alpha_plot.png │ ├── seq2seq3_1_inference_alpha_plot.png │ ├── seq2seq3_2_inference_alpha_plot.png │ ├── seq2seq3_3_inference_alpha_plot.png │ ├── seq2seq3_4_inference_alpha_plot.png │ └── seq2seq3_5_inference_alpha_plot.png ├── omit_char_R2_bias.png ├── pred_R2.png └── total_R2.png ├── main.py ├── models ├── CA.py ├── IPCA.py ├── Seq2Seq ├── modelBase.py └── seq.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # raw data 2 | data/ 3 | data.zip 4 | __MACOSX/ 5 | new_data.zip 6 | __pycache__ 7 | models/__pycache__ 8 | saved_models 9 | *_loss_*.png 10 | *.ipynb 11 | results/ 12 | logs/ 13 | R_squares/ 14 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/Autoencoder-Asset-Pricing-Models.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 
-------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Autoencoder-Asset-Pricing-Models 2 | 3 | 🧐 [Report](https://www.richardsong.space/autoencoder-asset-pricing-models) 4 | ## Set Up 5 | 6 | ```bash 7 | # generate preprocessed data and download portfolio returns 8 | python data_prepare.py 9 | 10 | # train models (ALL together) 11 | python main.py --Model 'FF PCA IPCA CA0 CA1 CA2 CA3' --K '1 2 3 4 5 6' 12 | 13 | # train models (selected models and K, for example) 14 | python main.py --Model 'IPCA CA3' --K '5 6' 15 | 16 | # analyze characteristics' importance (if needed) 17 | python main.py --Model 'IPCA CA0 CA1 CA2 CA3' --K '5' --omit_char 'absacc acc age agr bm bm_ia cashdebt cashpr cfp cfp_ia chatoia chcsho chempia chinv chpmia convind currat depr divi divo dy egr ep gma grcapx grltnoa herf hire invest lev lgr mve_ia operprof orgcap pchcapx_ia pchcurrat pchdepr pchgm_pchsale pchquick pchsale_pchinvt pchsale_pchrect pchsale_pchxsga pchsaleinv pctacc ps quick rd rd_mve rd_sale realestate roic salecash saleinv salerec secured securedind sgr sin sp tang tb aeavol cash chtx cinvest ear ms nincr roaq roavol roeq rsup stdacc stdcf baspread beta betasq chmom dolvol idiovol ill indmom maxret mom12m mom1m mom36m mom6m mvel1 pricedelay retvol 
std_dolvol std_turn turn zerotrade' 18 | 19 | # analyze models (calculate R^2, plot R^2 tables, bars and bias heatmap) 20 | python analysis.py 21 | ``` 22 | ## Results 23 | ### Total R^2 (%) 24 | 25 | 26 | 27 | 28 | ### Predictive R^2 (%) 29 | 30 | 31 | 32 | ### Risk Premia vs. Mispricing 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 | 48 | ### Characteristics Importance (reduced total R^2 (%), K=5) 49 | 50 | -------------------------------------------------------------------------------- /R_squares/2023-06-09_05-24-47.json: -------------------------------------------------------------------------------- 1 | {"models": ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"], "omit_char": [""], "R2_total": [0.08537139414421824, 0.1576019101919831, 0.1986486217133806, 0.20315476596988524, 0.31397093775365037, 0.3616431120471959]} -------------------------------------------------------------------------------- /R_squares/2023-06-09_06-07-56.json: -------------------------------------------------------------------------------- 1 | {"models": ["CA0_1", "CA0_2", "CA0_3", "CA0_4", "CA0_5", "CA0_6", "CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA1_6", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA2_6", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5", "CA3_6"], "omit_char": [""], "R2_total": [ 2 | 0.5677639760646643, 0.6761207104643877, 0.7076066105263652, 0.6913661386379286, 0.662602500272096, 0.7110612627936461, 3 | 0.5517562872860107, 0.7025783407556893, 0.685776051607686, 0.6664443573030849, 0.7006957708196195, 0.7052861947690043, 4 | 0.5967130036325399, 0.6626964974803786, 0.6608531336078073, 0.7070314610106503, 0.6462021917956272, 0.6767568343936613, 5 | 0.5531676704426002, 0.5249032928672436, 0.5642100044551001, 0.5458004779254889, 0.5558832641978944, 0.5235321637890534]} -------------------------------------------------------------------------------- /R_squares/2023-06-09_09-08-26.json: -------------------------------------------------------------------------------- 1 | {"models": ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"], "omit_char": [""], "R2_total": [0.4061160826307103, 0.5300364609271587, 0.5913033228863098, 0.6246396597772854, 0.6467919712825208, 0.6720178863573743]} -------------------------------------------------------------------------------- 
/R_squares/2023-06-11_18-16-38.json: -------------------------------------------------------------------------------- 1 | {"models": ["IPCA_5", "CA0_5", "CA1_5", "CA2_5", "CA3_5"], "omit_char": ["absacc", "acc", "age", "agr", "bm", "bm_ia", "cashdebt", "cashpr", "cfp", "cfp_ia", "chatoia", "chcsho", "chempia", "chinv", "chpmia", "convind", "currat", "depr", "divi", "divo", "dy", "egr", "ep", "gma", "grcapx", "grltnoa", "herf", "hire", "invest", "lev", "lgr", "mve_ia", "operprof", "orgcap", "pchcapx_ia", "pchcurrat", "pchdepr", "pchgm_pchsale", "pchquick", "pchsale_pchinvt", "pchsale_pchrect", "pchsale_pchxsga", "pchsaleinv", "pctacc", "ps", "quick", "rd", "rd_mve", "rd_sale", "realestate", "roic", "salecash", "saleinv", "salerec", "secured", "securedind", "sgr", "sin", "sp", "tang", "tb", "aeavol", "cash", "chtx", "cinvest", "ear", "ms", "nincr", "roaq", "roavol", "roeq", "rsup", "stdacc", "stdcf", "baspread", "beta", "betasq", "chmom", "dolvol", "idiovol", "ill", "indmom", "maxret", "mom12m", "mom1m", "mom36m", "mom6m", "mvel1", "pricedelay", "retvol", "std_dolvol", "std_turn", "turn", "zerotrade"], "R2_total": [1.5442819689237552e-05, -0.0008641544212459884, -0.0001747076111721091, -6.028723743389808e-05, 0.0004881307548909586, -9.453040450568828e-05, 0.0011809606037788134, -0.00047160860320083486, 0.0004497328261676703, 0.00035797708987406196, 0.00046514268643882417, 8.010321267071241e-05, 0.0005184883538837948, -0.00037818109386877907, -5.991642326275137e-05, 0.0002601835988962353, 0.0011439216429843801, 0.0017112133538985663, -4.350496459193387e-06, -0.00019380257770462705, 0.00043948388552850215, 0.00031580941297537635, 0.003338161066369527, 0.0021027676465422696, -0.0003171094148499698, 0.0014555971916409005, -0.0007270330656120594, 0.0016980056916262587, 0.0009553837759342931, 0.0027868308676146647, 0.00026402683391868464, 0.0007850527331111357, 0.0020449984309897085, 0.001054644787377823, 5.184771678434785e-06, 0.002178892159566903, 
-0.00017501728488655832, -0.0002577126732409285, 0.0009543687413273716, 0.0005083110135046809, 7.62809849720325e-05, 8.658720973830913e-05, 0.0004486453095020604, 0.0008906763104503668, 0.0010605476004982295, 0.0018694816639623912, 0.0002959972532902144, 0.0017008605711600344, 0.0007976861126247625, -0.002652802485675565, 0.002662754828368419, 9.917422749694538e-05, 5.964285215753762e-05, 0.0022189518272991426, -7.435817815093504e-05, -0.001698616694105315, 0.001233404502690938, 0.00047054690788606024, 0.0026004651808273493, 0.0011172780845385422, -0.0001482646856509895, -0.00041623695665415905, -0.0015909825974204095, -0.00031927406061749153, -0.00043655338823822554, 0.0001361268145957384, -0.00032109572478289383, 8.887812899915914e-05, 0.002067090875226163, 0.0010982209839661694, -0.0005637330110257466, -0.0005329012820900481, 0.0009084092837141622, 0.0021178568491788674, 0.0033586699296590528, 0.03875439641206746, 0.007293900327323799, 0.004763913685328358, 0.004817433432478202, 0.007084909806944539, 0.0025907095069127584, 0.0080607854611624, 0.0057093251947115675, 0.009105559086416637, 0.011542299391846145, 0.0028401829485293906, 0.02709723337680492, 0.004118952479980065, -0.00025137933247576516, 0.018393986829754794, 0.003372828092497837, 0.0013541844149635995, 0.00793699314016949, 0.0036518023385704312, -0.01840451865415549, 0.0035504742986338655, -0.02124055707570216, -0.003382787260082454, 0.001661945882751592, -8.216386260140318e-05, -0.011643569891665484, -0.0015886009587249283, -0.0060291572912980484, -0.003934251792188981, -0.0003035868273979503, 0.0025248914169603287, -0.0004996588466505969, 0.0009582432785717465, -0.0017444408411588785, -0.004805523268163303, -0.002931825562557, -0.014321167551977099, -0.005731926091473105, -0.001628883333692932, -0.008380269170191412, -0.002495037120210153, -0.013508701388505795, 0.001503549704314544, -0.0007751891041964942, -0.0007030969454133729, -0.0028340425016759596, 0.002562114878683408, 0.0007168131966963642, 
0.0005966917290103346, -0.0009823429746401713, -0.008049996513831648, -0.006935426188849236, -0.013752522526764732, -0.0005069144917650981, 0.0008889578779507357, -0.004511542543265024, 0.000512296750466934, -0.0002943171885458895, -0.00022271179096300386, -0.0021640651213321593, 0.0002623110944273144, -0.00026747487114753277, -0.0027015552337811277, -0.012375056721493305, -0.00010214235568639651, -0.012322425151997773, -0.004802562616962769, -0.0040789335496304036, 0.0004565130977485232, -0.0004572283160489965, 0.011102582602458222, -0.0020968867403851066, -0.0011853937188981423, -0.005591149809828, -0.010832419289181106, -0.001980246263801777, -0.0017540247099745443, 0.0018022912780594202, -0.0005056805241561158, -0.0017686227072655214, -0.0003527295679881526, 0.016529002299418005, -0.0014105862431510463, -0.0005732444581987295, -0.002869230597707273, -0.004965959446656232, 0.0004066440403122096, -0.009874817195565821, -0.030274345848102402, -0.012926682404285739, -0.0008783158084264553, -0.006107566065733372, -0.010184352477880187, -0.025327970485242712, -0.035590091815137614, -0.0303458179931807, -0.004102272911347127, -0.0004503926531160829, -0.029369754046835173, -0.007045255786105931, 0.0016875597888881266, -0.03481944581257812, -0.006678644726203498, -0.014538583275030104, -0.0007153352724629247, -0.011420023006262658, -0.004652763809388283, -0.005798820819408745, -0.024656178382399974, 0.0011578591100704916, -0.005280277401536693, 0.0010356576649497296, 0.003940580861575116, -0.005276868987642791, 0.00770392753313931, -0.007944469488529449, 0.001991397641811221, 0.003759694548383874, 0.0011195084904335184, -0.007110697689415635, 0.0032203225902148747, -0.007578862375151041, -0.0060194039077946515, 0.00024130283639867134, 0.0013235404395363082, 0.00043243599875075756, 0.001123780074328562, 0.00050973691967926, -0.00034757211361624574, -0.002281972957101308, 0.000845475795447137, -0.0020588684564848414, 0.0021372282150149413, -0.00019856545094609768, 
-0.0017913068956328937, 0.0006373160107784326, 0.0012739985965753986, -0.001696571054130902, -0.0002589327068805991, -0.00022288072963450034, -0.0031104722681001284, -0.0008235753576365523, -0.002447807642767752, 0.002004633800402944, -0.003224266216026561, 0.00864564856685679, -0.00554499989692514, -0.00026444194813035615, -0.0010032832870013886, -0.0032213627206253426, -0.000911160024681501, 0.0004076477827085201, 0.0007995060213979999, -0.0005885363844878588, 0.0032955989913380224, 0.0004894614286667931, 0.002261644799368967, -0.0032746021967355876, -0.0005524850007211368, -0.0031971616217488785, 0.0060628232866996035, -0.0018951843317909223, -0.004804670310596504, -0.016865397416521932, 0.0022279632468085175, 0.0004397955748330906, -0.0012895574791834674, -0.0052936056246406515, 0.001047196960554997, 0.0014035810745537391, -0.0021789054250395123, 0.004835618627879845, -0.004423137045597492, 0.0033839427269372058, 0.0004317648131846319, 0.010148684107688322, -0.0005852067203551137, -0.00023673829544612612, -0.0004969772951760598, -0.0007139106069110612, -0.0006781488781816281, -0.004463513032158084, -0.008314803798795345, 0.005664192158806869, -0.0017831527387602852, -0.005528449668342539, -0.00018355344047649158, -0.013367195182721558, 0.010476427486050932, -0.0030995623273649686, -0.0033128122970741414, 0.004656006266778645, -0.010849289595229128, -0.002397579088451285, -0.009958364378056861, -0.0011444352933996926, 0.0021166335185883733, -0.018000165477872865, -0.0032624694006755384, -0.004127065179238776, 0.012949255788881175, -0.0018095543583533935, -0.005592349116810835, 0.0037267653611244844, -0.0038516820526487416, 0.024489734744573943, 0.004683327531111781, 0.001269388150738071, -0.0017407506416220464, 0.011902507395885942, 0.004639839618832409, 0.002506386658879589, 0.0003685885982728232, 0.009029032176506746, 0.003135888070068371, -0.0016520809387076119, 0.003088001303703236, 0.00014727469647846103, 0.0035561159496605432, -2.876814735663924e-05, 
-1.1060450691990908e-06, -0.00046745210145082705, 0.0012870350251698026, 0.0042191397766525585, 0.00655774915823315, 0.0005207348452095362, 0.0023795699587515484, 0.0043810533565454834, 0.004609257547817935, 0.01290056247180027, 0.0014757109716835304, 0.0010271818164879765, 0.0015607194203527408, 0.0004559197274759397, 0.00033707593565324157, 0.002642703381714573, 0.006501328950226926, 0.0008704244332061739, 0.002463581165456641, 0.004364567880446146, -0.0018809110325389566, 0.00017150256113329654, 0.00017711236826822851, -7.174454357028459e-05, 3.2134260813054816e-05, -0.0018074430724653867, 0.0007488238809050252, -0.00021827012815400781, -0.0002562985350954561, 0.00023716735492340657, 0.001413468319080624, 0.006882923410709174, 0.0007931366577244026, -0.0004717657827425503, 0.009100363328674477, 0.005404627376927373, -0.004026623783044081, 0.005351064544848239, -0.0035389768278023537, 0.002068255236817085, -0.002090048742856676, -0.0019271886285645579, 0.0011498135139685894, 0.0018314054258520285, -0.00026283971236906734, 0.0015413975782866407, -0.004150725533632493, -0.001320978589455457, 0.00031339691627352284, -0.0016566925476326766, -0.0006220275171834322, 7.44998579660372e-05, -0.0002999797537633908, 0.0007030142990428478, 0.0007913912947924429, 0.0011556895007599488, 0.011568536396979079, 0.0028225455085252316, 0.00035905603670860486, -0.000841494670420162, 0.009615860981466828, 0.03377755052540243, 0.0789273078045386, 0.05525867670164508, -0.0010166697260519664, 0.00957764643602288, 0.06977357007791107, 0.006012541368369928, 0.002010875608860041, 0.03032162865092458, 0.002846750112344365, -0.00016217114898731122, 0.0060712783132312875, 0.006622682402069313, 0.006219871147256417, 0.0018001652923477218, 0.06992553842572524, 0.006724233973021909, 0.007539747957793552, 0.020538531613926936, 0.007943897075231354, 0.008360812789827254, 0.003290025441256006, 0.020634865090878085, 0.0008080876743085108, -0.0030892486063024416, 0.00041600088024251747, 
0.0070829632445147395, 0.0036536124147609206, -0.0016869382360118479, 0.0034263287960714095, -0.0006554657701027811, 0.0029753478023664126, 0.0006174977901268752, 7.101865790082318e-05, -0.00016257411215181428, -0.0009369820715646737, 0.0005434387564733356, 0.0012271814940856274, 0.0017175994942115747, 0.0005473661944555008, 0.005273938186585614, 0.0027605830798231867, 0.0053267195601733874, -0.0008641356489738072, 0.0003752916592929534, -8.763321084148679e-05, 0.0015245669500787429, -0.00012165228700977693, 0.0007204145703794129, 7.33961001830874e-06, 0.0007303311512011357, 0.002484907097797917, 0.012401690162314405, 0.002021342443194185, -0.00036144987650921223, -0.0012055569846747272, 0.00085115010144865, 2.829758256761572e-05, -3.762854703992513e-05, 0.0006878905429239524, -0.0007663772085586551, 0.0001906678540952722, 0.0001922411569781346, 0.001457809117053288, 0.0061655533145379415, -0.0002077338553436725, 0.004428694252206933, 0.007572691730434067, 0.015522139371950017, -0.0007973496961124482, 0.010913869326709902, -0.0009089608282075723, 0.001704153441209666, -0.001975343179565603, 0.0007767569381108563, 0.0001946668039775057, -0.0003049050619283733, -0.0006077903992949274, -0.0014501145568337481, 0.0024408470743457755, 0.0024867530414642847, -0.00028112355434628533, 0.00029719386409921, -0.0005815474877897131, -9.683347434896739e-05, 0.0015214781874257621, 0.00038445751824878194, -0.0006103274076518783, 0.0016081931094918955, 0.008255889709628095, 0.013087103367242059, 0.0004902027422780675, 0.007683661011998355, 0.0006227685572650632, 0.040959894211064496, 0.04659799783753149, 0.05623489106446333, -0.0011074112724636098, 0.0026292183555661763, 0.06648468153042297, 0.008507328563525984, 0.0012577368022517188, 0.05146040805311081, 0.006280620030596484, 0.0025791825898906495, 0.006777579940425715, 0.009547184931625763, 0.008159048875296504, 0.0008150683146730398, 0.06841724518294723, -0.0003192050956154491, 0.005694154235833304, 0.0024969631810545234, 
def _zero_out_inf(frame):
    """Return *frame* as float64 with every ``+inf`` entry replaced by 0."""
    # A 0/1 mask multiplication cannot be used for this: 0 * inf is NaN, which
    # would turn the whole sum (and therefore R^2) into NaN instead of
    # dropping the outlier.
    return frame.where(frame != np.inf, 0.0).astype(float)


def calculate_R2(model, type, input=None, complete_r=None):
    """Compute the out-of-sample R^2 of a model on the portfolio returns.

    R^2 is ``1 - sum(residual^2) / sum(return^2)`` over every portfolio and
    every date between ``OOS_start`` and ``OOS_end`` (both are expected to be
    module-level names provided by the file's imports).

    Args:
        model: Model name, or an object exposing ``.name``. Only used to
            locate the result file when ``input`` is not an array.
        type: Result kind; it is both the sub-directory of ``results/`` and
            the file suffix (this file calls it with ``'inference'`` and
            ``'predict'``).
        input: Optional ``np.ndarray`` of model outputs with one column per
            entry of ``CHARAS_LIST``. When given, no CSV is read.
        complete_r: Array of outputs of the model fitted with all
            characteristics; only used when ``input`` is an array.

    Returns:
        The R^2 of the model when ``input`` is not an array. Otherwise the
        R^2 obtained from ``complete_r`` minus the R^2 obtained from
        ``input``, i.e. the R^2 lost by omitting a characteristic.
    """
    portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')
    in_oos = (portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)
    oos_ret = portfolio_ret.loc[in_oos]
    actual = oos_ret.set_index('DATE')

    from_array = isinstance(input, np.ndarray)
    if from_array:
        model_output = pd.DataFrame(input, columns=CHARAS_LIST)
        model_output['DATE'] = oos_ret['DATE'].to_list()
    else:
        model_name = model if isinstance(model, str) else model.name
        model_output = pd.read_csv(f'results/{type}/{model_name}_{type}.csv')

    # Some saved outputs are serialised as "[0.123]"; strip the brackets and
    # coerce every cell back to float.
    for col in model_output.columns:
        model_output[col] = model_output[col].apply(
            lambda x: float(str(x).replace('[', '').replace(']', '')))

    total = np.sum(_zero_out_inf(actual ** 2).values)

    def r_squared(predicted):
        # Rows with a missing residual are dropped before Inf outliers are
        # zeroed, matching the order the sums have always been computed in.
        residual_square = ((actual - predicted.set_index('DATE')) ** 2).dropna()
        return 1 - np.sum(_zero_out_inf(residual_square).values) / total

    model_output_R2 = r_squared(model_output)
    if not from_array:
        return model_output_R2

    no_omit_output = pd.DataFrame(complete_r, columns=CHARAS_LIST)
    no_omit_output['DATE'] = oos_ret['DATE'].to_list()
    # The difference of R^2 is the importance of the omitted characteristic.
    return r_squared(no_omit_output) - model_output_R2


def alpha_plot(model, type, save_dir='imgs'):
    """Scatter each portfolio's mean pricing error against its mean raw return.

    Reads ``results/{type}/{model.name}_{type}.csv``, computes for every
    portfolio in ``CHARAS_LIST`` the mean return, the mean error (alpha) and
    the absolute t-statistic of that error, and saves the plot to
    ``{save_dir}/alpha/{model.name}_inference_alpha_plot.png``.

    Args:
        model: Object exposing ``.name``.
        type: Result kind used to locate the CSV (see ``calculate_R2``).
        save_dir: Directory under which the ``alpha`` folder is created.
    """
    import os  # local import: do not depend on a star import to provide it

    # Also creates save_dir itself when it is missing.
    os.makedirs(f'{save_dir}/alpha', exist_ok=True)

    portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')
    in_oos = (portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)
    oos_result = portfolio_ret.loc[in_oos].set_index('DATE')

    inference_result = pd.read_csv(f'results/{type}/{model.name}_{type}.csv').set_index('DATE')

    n_periods = oos_result.shape[0]
    rows = []
    for col in CHARAS_LIST:
        error = oos_result[col] - inference_result[col]
        alpha = error.mean()
        t_stat = abs(alpha / error.std()) * np.sqrt(n_periods)
        rows.append([oos_result[col].mean(), alpha, t_stat])

    pricing_error_analysis = pd.DataFrame(rows, columns=['raw ret', 'alpha', 't_stat'], index=CHARAS_LIST)

    # Common axis range for the 45-degree reference line, with a 15% margin.
    lower_point = min(np.min(pricing_error_analysis['raw ret']), np.min(pricing_error_analysis['alpha'])) * 1.15
    upper_point = max(np.max(pricing_error_analysis['raw ret']), np.max(pricing_error_analysis['alpha'])) * 1.15

    significant_mask = pricing_error_analysis['t_stat'] > 3
    n_significant = np.sum(significant_mask * 1.0)
    # Count the portfolios actually analysed instead of hard-coding 94.
    n_insignificant = len(pricing_error_analysis) - n_significant

    significant = pricing_error_analysis.loc[significant_mask]
    insignificant = pricing_error_analysis.loc[~significant_mask]
    plt.scatter(significant['raw ret'], significant['alpha'], marker='^', color='r', alpha=0.6,
                label=f'#Alphas(|t|>3.0)={n_significant}')
    plt.scatter(insignificant['raw ret'], insignificant['alpha'], marker='o', color='b', alpha=0.6,
                label=f'#Alphas(|t|<3.0)={n_insignificant}')
    diagonal = np.linspace(lower_point, upper_point, 10)
    plt.plot(diagonal, diagonal, color='black')

    plt.ylabel('Alpha (%)')
    plt.xlabel('Raw Return (%)')
    plt.legend()

    plt.title(model.name)
    plt.savefig(f'{save_dir}/alpha/{model.name}_inference_alpha_plot.png')
    plt.close()


def plot_R2_bar(R_df, type):
    """Save a grouped bar chart of R^2 (%) per model family and K.

    Args:
        R_df: Frame whose column ``0`` holds names such as ``'CA2_3'`` and
            whose column ``1`` holds the matching R^2. A ``'Model'`` column
            (the part of the name before the first ``_``) is added to it in
            place.
        type: Label used in the y-axis title and in the output file name
            ``imgs/{type}_R2.png``.
    """
    R_df['Model'] = R_df[0].apply(lambda x: x.split('_')[0])

    labels = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5']
    # (model family, horizontal slot relative to the tick). Other families
    # (FF, PCA, IPCA, CA0, CA1, CA3) can be plotted by adding entries here.
    families = [('CA2', -1), ('seq2seq1', 0), ('seq2seq2', 1), ('seq2seq3', 2)]

    x = np.arange(len(labels))  # label positions
    width = 0.11
    palette = plt.get_cmap('OrRd')(np.linspace(0, 1, 8))

    fig, ax = plt.subplots(figsize=(15, 5))
    for shade, (family, slot) in enumerate(families, start=1):
        heights = (R_df.loc[R_df['Model'] == family][1] * 100).to_list()
        ax.bar(x + width * slot, heights, width, label=family, color=palette[shade])

    ax.set_ylabel(f'Portfolio {type} R^2 (%)')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    fig.tight_layout()

    plt.savefig(f'imgs/{type}_R2.png')
    plt.close()


def plot_R2_table(R_df, type):
    """Render a table of R^2 values (in %) to ``imgs/R2_{type}_table.png``.

    Args:
        R_df: Frame indexed by model family with one column per K, holding
            R^2 as fractions. It is not modified.
        type: Label used in the output file name.
    """
    # Format a copy so the caller's numeric frame is not turned into strings.
    table = R_df.copy()
    for col in table.columns:
        table[col] = table[col].apply(round_number)

    n_factors = table.shape[1]
    table = table.reset_index()
    table.columns = ['Model'] + [f'K={k}' for k in range(1, n_factors + 1)]

    fig_total = ff.create_table(table,
                                colorscale=[[0, 'white'],
                                            [0.01, 'lightgrey'],
                                            [1.0, 'white']],
                                font_colors=['#000000', '#000000',
                                             '#000000'])
    fig_total.update_layout(
        autosize=False,
        width=500,
        height=200,
    )
    fig_total.write_image(f"imgs/R2_{type}_table.png", scale=4)


def round_number(num):
    """Return ``num`` expressed in percent as a string with two decimals."""
    # Fixed-point formatting also covers ints, NaN and very large values,
    # whose str() contains no '.' to split on.
    return f'{num * 100:.2f}'


if __name__ == "__main__":
    # CAs = ["CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5"]
    CAs = ["CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5"]
    Seqs = ["seq2seq1_1", "seq2seq1_2", "seq2seq1_3", "seq2seq1_4", "seq2seq1_5",
            "seq2seq2_1", "seq2seq2_2", "seq2seq2_3", "seq2seq2_4", "seq2seq2_5",
            "seq2seq3_1", "seq2seq3_2", "seq2seq3_3", "seq2seq3_4", "seq2seq3_5"]
    # FFs = ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"]
    # PCAs = ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"]
    # IPCAs = ["IPCA_1", "IPCA_2", "IPCA_3", "IPCA_4", "IPCA_5", "IPCA_6"]
    models = CAs + Seqs

    ## Plot R^2 bars
    total_R2 = [calculate_R2(m, 'inference') for m in models]
    R_total = pd.DataFrame([models, total_R2]).T

    predict_R2 = [calculate_R2(m, 'predict') for m in models]
    R_pred = pd.DataFrame([models, predict_R2]).T

    plot_R2_bar(R_total, 'total')
    plot_R2_bar(R_pred, 'pred')

    ## Save R^2 tables
    k_columns = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5']
    family_index = ['CA2', 'seq2seq1', 'seq2seq2', 'seq2seq3']
    R_total_df = pd.DataFrame(np.array(total_R2).reshape(-1, 5), columns=k_columns, index=family_index)
    R_pred_df = pd.DataFrame(np.array(predict_R2).reshape(-1, 5), columns=k_columns, index=family_index)

    plot_R2_table(R_total_df, 'total')
    plot_R2_table(R_pred_df, 'pred')

    ## Plot characteristics importance heatmap
    # models = ["CA1_5", "CA2_5", "CA3_5"]
    # TODO: paste results from R_squares/
    # R2_omit = []
    # R_minus = pd.DataFrame(np.array(R2_omit).reshape(-1, 94)*100, index=models, columns=CHARAS_LIST).T
    # char_ranks = R_minus.T.sum().argsort().argsort().index.to_list()
    # char_ranks.reverse()

    # plt.figure(figsize=(8, 15), dpi=200)
    # sns.heatmap(R_minus.T[char_ranks].T, cmap='Blues', linewidths=0.6)
    # plt.savefig('imgs/omit_char_R2_bias.png', bbox_inches='tight')
    # plt.close()
# if 'new_data.zip' not in os.listdir():
#     os.system('wget https://cloud.tsinghua.edu.cn/f/07d6a0223d054247af26/?dl=1 -O new_data.zip')

# if 'data' not in os.listdir():
#     os.mkdir('data')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/179082ecf0f147a4840c/?dl=1 -O portfolio_ret.pkl')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/b93c6ae7e2014d3a951e/?dl=1 -O ff5.csv')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/5f077be9eda0428ab7e5/?dl=1 -O UMD.csv')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/a916da12d5a9450eb0df/?dl=1 -O p_charas.pkl')

#     os.system('mv portfolio_ret.pkl data')
#     os.system('mv ff5.csv data')
#     os.system('mv UMD.csv data')
#     os.system('mv p_charas.pkl data')


# NOTE(review): the raw-data paths below are absolute and machine-specific;
# confirm where the raw pickles are expected to live before running elsewhere.
with open('D:/Autoencoder/data/new_data/data/month_ret.pkl', 'rb') as f:
    print('Reading month_ret.pkl', end=' ')
    mon_ret = pd.read_pickle(f)
    mon_ret.to_pickle('data/month_ret.pkl')
    print('Done!')

with open('D:/Autoencoder/data/new_data/data/datashare.pkl', 'rb') as f:
    print('Reading datashare.pkl', end=' ')
    datashare = pd.read_pickle(f)
    # Persist the list of distinct dates found in the characteristics data.
    datashare['DATE'].drop_duplicates().reset_index(drop=True).to_pickle('data/mon_list.pkl')
    # datashare.to_pickle('data/datashare.pkl')
    print('Done!')


def pre_process(date):
    """Rank-normalise every characteristic of one cross-section to [-1, 1].

    Reads the module-level ``datashare`` frame. Missing values are replaced
    by the cross-sectional median of the column, or by 0 when the whole
    column is missing on that date. Each characteristic in
    ``utils.CHARAS_LIST`` is then replaced by its dense rank (equal values
    share a rank), rescaled linearly so the smallest rank maps to -1 and the
    largest to 1. A characteristic that is constant on that date maps to 0.

    Args:
        date: Value of ``DATE`` selecting the cross-section.

    Returns:
        A frame with columns ``permno``, ``DATE`` and ``utils.CHARAS_LIST``,
        one row per row of the selected cross-section, in the same order.
    """
    cross_slice = datashare.loc[datashare.DATE == date]
    # Median first, then 0 for columns whose median is itself NaN.
    cross_slice = cross_slice.fillna(cross_slice.median()).fillna(0)

    normalized = {}
    for col in utils.CHARAS_LIST:
        # Rank directly on the column so each rank stays on the row of its
        # own stock. Merging ranks back from the de-duplicated values with
        # how='right' orders the result by unique value instead, which
        # misaligns ranks and permnos as soon as a value is repeated.
        rank = cross_slice[col].rank(method='dense')
        spread = rank.max() - rank.min()
        # spread == 0 (constant column) yields NaN here, zero-filled below.
        normalized[col] = ((rank - rank.min()) / spread * 2 - 1).to_numpy()

    re_df = pd.DataFrame(normalized).fillna(0)
    re_df['permno'] = list(cross_slice['permno'].astype(int))
    re_df['DATE'] = list(cross_slice['DATE'].astype(int))

    return re_df[['permno', 'DATE'] + utils.CHARAS_LIST]


def _decile_legs(cross_section, chara):
    """Return (top-decile, bottom-decile) permno lists when sorting by *chara*."""
    ordered = cross_section.sort_values(by=chara, ascending=False)['permno'].to_list()
    size = len(ordered) // 10
    # Slice from an explicit start index: ``[-n // 10:]`` parses as
    # ``(-n) // 10`` and rounds away from zero, which makes the bottom leg
    # one name larger than the top leg whenever n is not a multiple of 10.
    return ordered[:size], ordered[len(ordered) - size:]


def cal_portfolio_ret(it, df):
    """Return the long-short return of one characteristic-sorted portfolio.

    Args:
        it: Pair ``(date, characteristic name)``.
        df: Frame with ``DATE``, ``permno`` and the characteristic columns.

    Returns:
        Half the difference between the mean ``ret-rf`` of the top decile
        and of the bottom decile of stocks sorted by the characteristic on
        that date, using the module-level ``mon_ret`` frame. Stocks without
        a return are ignored; the result is NaN when a leg has no return.
    """
    d, f = it[0], it[1]
    # long portfolio: highest decile of the characteristic; short: lowest
    long_portfolio, short_portfolio = _decile_legs(df.loc[df.DATE == d][['permno', f]], f)

    # long-short portfolio return
    returns = mon_ret.loc[mon_ret.date == d].drop_duplicates('permno').set_index('permno')
    long_ret = returns.reindex(long_portfolio)['ret-rf'].dropna().mean()
    short_ret = returns.reindex(short_portfolio)['ret-rf'].dropna().mean()
    chara_ret = 0.5 * (long_ret - short_ret)

    return chara_ret


def cal_portfolio_charas(month, df):
    """Return the characteristics of every long-short portfolio for one month.

    For each sorting characteristic in ``utils.CHARAS_LIST`` the top and
    bottom deciles of stocks are selected, and the portfolio's value for a
    characteristic is half the difference between the two legs' means.

    Args:
        month: Value of ``DATE`` selecting the cross-section.
        df: Frame with ``DATE``, ``permno`` and the characteristic columns.

    Returns:
        A frame indexed by ``'p_' + characteristic`` (one row per sorting
        characteristic) with columns ``DATE`` and ``utils.CHARAS_LIST``.
    """
    cross_section = df.loc[df.DATE == month]
    by_permno = cross_section.set_index('permno')
    p_name = ['p_' + chara for chara in utils.CHARAS_LIST]

    mon_portfolio_chara = []
    for chara in utils.CHARAS_LIST:
        long_portfolio, short_portfolio = _decile_legs(cross_section, chara)

        long_charas = by_permno.loc[long_portfolio][utils.CHARAS_LIST]
        short_charas = by_permno.loc[short_portfolio][utils.CHARAS_LIST]

        mon_portfolio_chara.append([month] + (0.5 * (long_charas.mean() - short_charas.mean())).to_list())

    return pd.DataFrame(mon_portfolio_chara, index=p_name, columns=['DATE'] + utils.CHARAS_LIST)


if __name__ == '__main__':
    # pre-process share data
    processed_df = Parallel(n_jobs=-1)(delayed(pre_process)(d) for d in
                                       tqdm(datashare.DATE.drop_duplicates().to_list(), colour='green',
                                            desc='Processing'))
    processed_df = pd.concat(processed_df)

    ##TODO: calculate portfolio returns (or download preprocessed data)
    # iter_list = list(product(datashare.DATE.drop_duplicates(), CHARAS_LIST))
    # portfolio_rets = Parallel(n_jobs=-1)(delayed(cal_portfolio_ret)(it, df=processed_df) for it in tqdm(iter_list, colour='green', desc='Calculating'))
    # portfolio_rets = pd.DataFrame(np.array(portfolio_rets).reshape(-1, 94), index=datashare.DATE.drop_duplicates(), columns=CHARAS_LIST).reset_index()
    # portfolio_rets[CHARAS_LIST] = portfolio_rets[CHARAS_LIST].astype(np.float16)

    ##TODO: calculate portfolio characteristics (or download preprocessed data)
| # mon_list = pd.read_pickle('data/mon_list.pkl') 126 | # _portfolio_chara_set = Parallel(n_jobs=-1)(delayed(cal_portfolio_charas)(mon, df=processed_df) for mon in tqdm(mon_list, colour='yellow', desc='Calculating P characteristics')) 127 | # p_charas = _portfolio_chara_set[0].copy(deep=False) 128 | # for tdf in _portfolio_chara_set[1:]: 129 | # p_charas = pd.concat([p_charas, tdf]) 130 | 131 | processed_df.to_pickle('data/datashare_re.pkl') 132 | # portfolio_rets.to_pickle('data/portfolio_rets.pkl') 133 | # p_charas.to_pickle('data/p_charas.pkl') 134 | -------------------------------------------------------------------------------- /imgs/R2_pred_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/R2_pred_table.png -------------------------------------------------------------------------------- /imgs/R2_total_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/R2_total_table.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_5_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_5_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq1_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq1_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq1_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq1_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq1_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq2_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq2_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq2_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq2_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq2_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq3_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq3_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq3_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq3_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/seq2seq3_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/omit_char_R2_bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/omit_char_R2_bias.png -------------------------------------------------------------------------------- /imgs/pred_R2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/pred_R2.png -------------------------------------------------------------------------------- /imgs/total_R2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/total_R2.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | #from models.PCA import PCA 3 | #from models.FF import FF 4 | #from models.IPCA import IPCA 5 | from models.CA import CA1, CA2, CA3 6 | from models.seq import seq2seq1, seq2seq2, seq2seq3 7 | 8 | import gc 9 | import argparse 10 | 
def model_inference_and_predict_CA(model):
    """Run the yearly rolling train / inference / prediction loop for a CA model.

    The test months (every month in ``data/mon_list.pkl`` on or after
    ``model.test_period[0]``) are grouped by calendar year (``month // 10000``).
    For each year the model weights are reset, the model is retrained, the
    loss curves are saved to ``results/train_loss/{model.name}_loss_{year}.png``
    and ``model.inference`` / ``model.predict`` are evaluated month by month;
    ``model.refit()`` is then called before moving on to the next year.

    When ``model.omit_char`` is empty the results are written to
    ``results/inference/{model.name}_inference.csv`` and
    ``results/predict/{model.name}_predict.csv``.

    Args:
        model: model object exposing ``name``, ``test_period``, ``omit_char``,
            ``reset_weight``, ``release_gpu``, ``train_model``, ``inference``,
            ``predict`` and ``refit``.

    Returns:
        With no omitted characteristics: the inference ``DataFrame`` (rows are
        test months, columns are ``CHARAS_LIST``).  Otherwise: a list holding
        the raw ``model.inference(month)`` result of every test month.
    """
    # Follow the device the model was configured with; 'cuda' (the previous
    # hard-coded value) remains the fallback for models without the attribute.
    model = model.to(getattr(model, 'device', 'cuda'))
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[mon_list >= model.test_period[0]]
    has_omit = bool(len(model.omit_char))

    # One single-column frame per month; concatenated once after the loop
    # instead of re-concatenating the growing result on every month.
    inference_cols = []
    predict_cols = []
    omit_results = []  # used only when characteristics are omitted

    yearly_groups = test_mons.groupby(test_mons.apply(lambda x: x // 10000))
    T_bar = tqdm(yearly_groups, colour='red', desc=f'{model.name} Inferencing & Predicting')

    for year, months in T_bar:  # rolling train, refit once a year
        T_bar.set_postfix({'Year': year})

        model.reset_weight()
        model.release_gpu()
        for _ in range(6):  # call multiple times to clear the cuda cache
            torch.cuda.empty_cache()

        train_loss, val_loss = model.train_model()
        plt.plot(train_loss, label='train_loss')
        plt.plot(val_loss, label='val_loss')
        plt.legend()
        plt.savefig(f'results/train_loss/{model.name}_loss_{year}.png')
        plt.close()

        for m in months.to_list():
            if has_omit:
                # (N, m) per month, m being the number of omitted characteristics
                omit_results.append(model.inference(m))
                continue

            inference_R = model.inference(m).cpu().detach().numpy()  # (N, 1)
            inference_cols.append(pd.DataFrame(inference_R, columns=[m]))

            predict_R = model.predict(m).cpu().detach().numpy()  # (N, 1)
            predict_cols.append(pd.DataFrame(predict_R, columns=[m]))

        # refit: change train period and valid period
        model.refit()

    if has_omit:
        inference_result = omit_results
    else:
        inference_wide = pd.concat(inference_cols, axis=1) if inference_cols else pd.DataFrame()
        predict_wide = pd.concat(predict_cols, axis=1) if predict_cols else pd.DataFrame()

        # (N, T) -> (T, N): one row per test month
        inference_result = pd.DataFrame(inference_wide.values.T, index=test_mons, columns=CHARAS_LIST)
        inference_result.to_csv(f'results/inference/{model.name}_inference.csv')

        predict_result = pd.DataFrame(predict_wide.values.T, index=test_mons, columns=CHARAS_LIST)
        predict_result.to_csv(f'results/predict/{model.name}_predict.csv')

    # GC: release RAM held by the model
    del model
    gc.collect()
    return inference_result
def model_inference_and_predict_seq2seq(model):
    """Run the yearly rolling train / inference / prediction loop for a seq2seq model.

    Same procedure as the CA runner, except that the model weights are NOT
    reset before each yearly retraining: test months from
    ``data/mon_list.pkl`` are grouped by calendar year, the model is retrained
    for each year, the loss curves are saved under ``results/train_loss`` and
    ``model.inference`` / ``model.predict`` are evaluated month by month,
    followed by ``model.refit()``.

    When ``model.omit_char`` is empty the results are written to
    ``results/inference/{model.name}_inference.csv`` and
    ``results/predict/{model.name}_predict.csv``.

    Args:
        model: model object exposing ``name``, ``test_period``, ``omit_char``,
            ``release_gpu``, ``train_model``, ``inference``, ``predict`` and
            ``refit``.

    Returns:
        With no omitted characteristics: the inference ``DataFrame`` (rows are
        test months, columns are ``CHARAS_LIST``).  Otherwise: a list holding
        the raw ``model.inference(month)`` result of every test month.
    """
    # Follow the device the model was configured with; 'cuda' (the previous
    # hard-coded value) remains the fallback for models without the attribute.
    model = model.to(getattr(model, 'device', 'cuda'))
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[mon_list >= model.test_period[0]]
    has_omit = bool(len(model.omit_char))

    # One single-column frame per month; concatenated once after the loop
    # instead of re-concatenating the growing result on every month.
    inference_cols = []
    predict_cols = []
    omit_results = []  # used only when characteristics are omitted

    yearly_groups = test_mons.groupby(test_mons.apply(lambda x: x // 10000))
    T_bar = tqdm(yearly_groups, colour='red', desc=f'{model.name} Inferencing & Predicting')

    for year, months in T_bar:  # rolling train, refit once a year
        T_bar.set_postfix({'Year': year})

        model.release_gpu()
        for _ in range(6):  # call multiple times to clear the cuda cache
            torch.cuda.empty_cache()

        train_loss, val_loss = model.train_model()
        plt.plot(train_loss, label='train_loss')
        plt.plot(val_loss, label='val_loss')
        plt.legend()
        plt.savefig(f'results/train_loss/{model.name}_loss_{year}.png')
        plt.close()

        for m in months.to_list():
            if has_omit:
                # (N, m) per month, m being the number of omitted characteristics
                omit_results.append(model.inference(m))
                continue

            inference_R = model.inference(m).cpu().detach().numpy()  # (N, 1)
            inference_cols.append(pd.DataFrame(inference_R, columns=[m]))

            predict_R = model.predict(m).cpu().detach().numpy()  # (N, 1)
            predict_cols.append(pd.DataFrame(predict_R, columns=[m]))

        # refit: change train period and valid period
        model.refit()

    if has_omit:
        inference_result = omit_results
    else:
        inference_wide = pd.concat(inference_cols, axis=1) if inference_cols else pd.DataFrame()
        predict_wide = pd.concat(predict_cols, axis=1) if predict_cols else pd.DataFrame()

        # (N, T) -> (T, N): one row per test month
        inference_result = pd.DataFrame(inference_wide.values.T, index=test_mons, columns=CHARAS_LIST)
        inference_result.to_csv(f'results/inference/{model.name}_inference.csv')

        predict_result = pd.DataFrame(predict_wide.values.T, index=test_mons, columns=CHARAS_LIST)
        predict_result.to_csv(f'results/predict/{model.name}_predict.csv')

    # GC: release RAM held by the model
    del model
    gc.collect()
    return inference_result
def model_selection(model_type, model_K, omit_char=None):
    """Build the requested model and return it together with its metadata.

    Supported ``model_type`` values are ``'CA2'``, ``'seq2seq1'`` and
    ``'seq2seq3'``.  ``CA2`` and ``seq2seq1`` are built with ``lr=CA_LR``,
    ``seq2seq3`` with a fixed ``lr=0.01``; all use ``dropout=CA_DR``.

    Args:
        model_type: name of the model family to instantiate.
        model_K: passed to the model constructor as ``hidden_size``.
        omit_char: optional list of characteristics to omit; ``None`` means
            an empty list.

    Returns:
        dict with keys ``'name'`` (``f'{model_type}_{model_K}'``),
        ``'omit_char'`` and ``'model'`` (the constructed model).

    Raises:
        AssertionError: if ``model_type`` is not one of the supported names.
    """
    # A fresh list per call: the list is handed to the model constructor, so a
    # shared mutable default would be aliased between unrelated models.
    omit_char = [] if omit_char is None else omit_char

    # Constructors are wrapped in lambdas so only the selected model is built.
    builders = {
        'CA2': lambda: CA2(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char),
        'seq2seq1': lambda: seq2seq1(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char),
        'seq2seq3': lambda: seq2seq3(hidden_size=model_K, dropout=CA_DR, lr=0.01, omit_char=omit_char),
    }

    # Raised explicitly (instead of via `assert`) so the check survives `python -O`
    # while callers still see the same exception type and message.
    if model_type not in builders:
        raise AssertionError(f'No Such Model: {model_type}')

    return {
        'name': f'{model_type}_{model_K}',
        'omit_char': omit_char,
        'model': builders[model_type](),
    }
if __name__ == "__main__":
    # Command line: space-separated model types, factor counts K and
    # (optionally) characteristics to omit.
    parser = argparse.ArgumentParser()
    parser.add_argument('--Model', type=str, default='seq2seq1 seq2seq3 CA2')
    parser.add_argument('--K', type=str, default='1 2 3 4 5')
    parser.add_argument('--omit_char', type=str, default='')

    args = parser.parse_args()

    # Every directory written to below; R_squares is included because the
    # final JSON dump fails if it does not exist.
    for out_dir in ('results/train_loss', 'results/inference', 'results/predict', 'imgs', 'R_squares'):
        os.makedirs(out_dir, exist_ok=True)

    # Loop-invariant: parse the omitted characteristics once.
    omit_chars = args.omit_char.split(' ') if args.omit_char else []

    models_name = []
    R_square = []
    for model_type, k in product(args.Model.split(), args.K.split()):
        # Each model receives its own copy of the list.
        model = model_selection(model_type, int(k), list(omit_chars))

        # Local time with its real UTC offset (previously UTC was printed with a
        # hard-coded "+0800" suffix).
        stamp = time.strftime('%a, %d %b %Y %H:%M:%S %z', time.localtime())
        print(f"{stamp} | Model: {model['name']} | {omit_chars}")
        print('name : ', model['name'])
        models_name.append(model['name'])

        if model_type.startswith('CA'):
            print('model_inference_and_predict_CA')
            # if have omit char, inf_ret (T, N, m)
            inf_ret = model_inference_and_predict_CA(model['model'])
        else:
            print('model_inference_and_predict_seq')
            inf_ret = model_inference_and_predict_seq2seq(model['model'])

        gc.collect()

        # Save total R^2
        if not len(model['omit_char']):
            R_square.append(calculate_R2(model['model'], 'inference'))
            alpha_plot(model['model'], 'inference', save_dir='imgs')
        else:
            inf_ret = np.array(inf_ret)
            complete_r = inf_ret[:, :, -1]  # last slice is the run with nothing omitted
            for i in range(len(model['omit_char'])):
                inference_r = inf_ret[:, :, i]  # T * N
                R_square.append(calculate_R2(None, None, inference_r, complete_r))

        del model

    # save R_square to json, file named after the local timestamp
    time_str = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime())
    filename = f"R_squares/{time_str}.json"
    obj = {
        "models": models_name,
        'omit_char': args.omit_char.split(' '),  # kept as-is for compatibility with existing JSON files
        "R2_total": R_square,
    }

    with open(filename, "w") as out_file:
        json.dump(obj, out_file)
class CA_base(nn.Module, modelBase):
    """Shared data handling, training loop and inference for the CA models.

    Concrete subclasses are expected to assign ``beta_nn``, ``factor_nn``,
    ``optimizer`` and ``criterion`` after calling ``CA_base.__init__``.
    ``train_period`` / ``valid_period`` / ``test_period`` / ``refit_cnt`` /
    ``name`` are read here but not defined in this class; presumably they are
    provided by ``modelBase`` (not visible in this file).
    """

    def __init__(self, name, omit_char=None, device='cuda'):
        """Load the pickled data sets and initialise empty model slots.

        Args:
            name: model name, forwarded to ``modelBase.__init__``.
            omit_char: optional list of characteristics to omit at inference
                time; ``None`` means an empty list.
            device: torch device on which input tensors are placed.
        """
        nn.Module.__init__(self)
        modelBase.__init__(self, name)
        self.beta_nn = None
        self.factor_nn = None
        self.optimizer = None
        self.criterion = None
        # Fresh list per instance (avoids a shared mutable default argument).
        self.omit_char = [] if omit_char is None else omit_char

        # Every factor estimate returned by calFactor, in call order.
        self.factor_nn_pred = []

        self.device = device

        self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64)
        self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index()
        self.portfolio_ret = pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64)
        self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64)

        self.train_dataloader = None
        self.valid_dataloader = None
        self.test_dataloader = None

    def debug(self, month):
        """Print the ``CHARAS_LIST`` columns of the ``p_charas`` rows for ``month``."""
        print(self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST])

    def _get_item(self, month):
        """Assemble the network inputs and labels for one month.

        If ``month`` is not present in ``p_charas['DATE']`` the numerically
        closest available date is used instead.

        Returns:
            Tuple ``(index, beta_input, factor_input, labels)``: the row index
            of the rows kept after dropping NaNs, the characteristics
            transposed to (P, N), the portfolio returns transposed to (P, 1)
            and the label vector (N,).  Shapes follow the original comments
            (P = N = 94) and depend on the pickled data.
        """
        dates = self.p_charas['DATE'].values
        if month not in dates:
            # find the closest month in p_charas to month
            month = dates[np.argmin(np.abs(dates - month))]

        month_ret = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST]
        factor_nn_input = month_ret.T.values

        # Explicit copy so adding the label column never writes through to p_charas.
        beta_nn_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST].copy()
        beta_nn_input['ret-rf'] = month_ret.T.values
        # dropna() already guarantees the frame is NaN-free, so no extra check is needed.
        align_df = beta_nn_input.dropna()

        return align_df.index, align_df.values[:, :-1].T, factor_nn_input, align_df.values[:, -1].T

    def dataloader(self, period):
        """Build a shuffled, batch-size-1 ``DataLoader`` over the months in ``period``.

        Args:
            period: ``(first_month, last_month)`` bounds, both inclusive.
        """
        mon_list = pd.read_pickle('data/mon_list.pkl')
        mon_list = mon_list.loc[(mon_list >= period[0]) & (mon_list <= period[1])]

        beta_nn_input_set = []
        factor_nn_input_set = []
        label_set = []
        for mon in mon_list:
            _, _beta_input, _factor_input, label = self._get_item(mon)
            beta_nn_input_set.append(_beta_input)
            factor_nn_input_set.append(_factor_input)
            label_set.append(label)

        def to_tensor(arrays):
            # Stack in NumPy first: torch.tensor on a list of ndarrays is very slow.
            return torch.tensor(np.array(arrays), dtype=torch.float32).to(self.device)

        dataset = TensorDataset(
            to_tensor(beta_nn_input_set),
            to_tensor(factor_nn_input_set),
            to_tensor(label_set),
        )
        return DataLoader(dataset, batch_size=1, shuffle=True)

    def forward(self, char, pfret):
        """Return the row-wise sum of ``beta_nn(char) * factor_nn(pfret)``."""
        processed_char = self.beta_nn(char)
        processed_pfret = self.factor_nn(pfret)
        return torch.sum(processed_char * processed_pfret, dim=1)

    @staticmethod
    def _unbatch(beta_nn_input, factor_nn_input, labels):
        """Strip the batch dimension of one DataLoader batch (batch size 1).

        Per the original comments: (1, P, N) -> (N, P), (1, P, 1) -> (1, P)
        and (1, N) -> (N,).
        """
        return beta_nn_input.squeeze(0).T, factor_nn_input.squeeze(0).T, labels.squeeze(0)

    def __train_one_epoch(self):
        """Run one optimisation pass over the training loader; return the mean batch loss."""
        epoch_loss = 0.0
        for batch in self.train_dataloader:
            self.optimizer.zero_grad()
            beta_nn_input, factor_nn_input, labels = self._unbatch(*batch)
            output = self.forward(beta_nn_input, factor_nn_input)
            loss = self.criterion(output, labels)

            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()

        return epoch_loss / len(self.train_dataloader)

    def __valid_one_epoch(self):
        """Return the mean batch loss over the validation loader (no weight update)."""
        epoch_loss = 0.0
        for batch in self.valid_dataloader:
            beta_nn_input, factor_nn_input, labels = self._unbatch(*batch)
            output = self.forward(beta_nn_input, factor_nn_input)
            loss = self.criterion(output, labels)
            epoch_loss += loss.item()

        return epoch_loss / len(self.valid_dataloader)

    def train_model(self):
        """Train with early stopping and restore the best checkpoint.

        Runs at most ``MAX_EPOCH`` epochs, saving the weights to
        ``./saved_models/{self.name}.pt`` whenever the validation loss
        improves and stopping after three consecutive epochs without
        improvement.

        Returns:
            ``(train_loss, valid_loss)``: per-epoch loss lists.
        """
        os.makedirs('saved_models', exist_ok=True)
        checkpoint = f'./saved_models/{self.name}.pt'

        self.train_dataloader = self.dataloader(self.train_period)
        self.valid_dataloader = self.dataloader(self.valid_period)
        self.test_dataloader = self.dataloader(self.test_period)

        min_error = np.inf  # np.Inf was removed in NumPy 2.0
        no_update_steps = 0
        checkpoint_written = False
        valid_loss = []
        train_loss = []
        for i in range(MAX_EPOCH):
            self.train()
            train_error = self.__train_one_epoch()
            train_loss.append(train_error)

            self.eval()
            # valid and early stop
            with torch.no_grad():
                valid_error = self.__valid_one_epoch()

            valid_loss.append(valid_error)
            if valid_error < min_error:
                min_error = valid_error
                no_update_steps = 0
                torch.save(self.state_dict(), checkpoint)
                checkpoint_written = True
            else:
                no_update_steps += 1

            if no_update_steps > 2:  # early stop after 3 consecutive epochs without improvement
                print(f'Early stop at epoch {i}')
                break

        # Reload the best weights of THIS run only; if no epoch ever improved
        # (e.g. NaN losses) a checkpoint left by an earlier run must not be loaded.
        if checkpoint_written:
            self.load_state_dict(torch.load(checkpoint))
        return train_loss, valid_loss

    def test_model(self):
        """Evaluate one batch from the test loader and print the result.

        Returns:
            ``(output, labels)`` for that batch.

        Raises:
            RuntimeError: if the test loader has not been built or is empty.
        """
        if self.test_dataloader is None:
            raise RuntimeError('test_dataloader is not built; call train_model() first')

        # Each batch is a (beta, factor, labels) triple; the previous
        # `for i, a, b, c in enumerate(...)` unpacking raised ValueError.
        batch = next(iter(self.test_dataloader), None)
        if batch is None:
            raise RuntimeError('test_dataloader is empty')

        # Same reshaping as the train / validation paths.
        beta_nn_input, factor_nn_input, labels = self._unbatch(*batch)
        with torch.no_grad():
            output = self.forward(beta_nn_input, factor_nn_input)
            loss = self.criterion(output, labels)

        print(f'Test loss: {loss.item()}')
        print(f'Predicted: {output}')
        print(f'Ground truth: {labels}')
        return output, labels

    def calBeta(self, month, skip_char=None):
        """Return ``beta_nn`` applied to the month's characteristics (N*K per the original comments).

        Columns listed in ``skip_char`` are zeroed before the forward pass.
        """
        skip_char = [] if skip_char is None else skip_char
        _, beta_nn_input, _, _ = self._get_item(month)  # beta input: P*N

        # if some variables need be omitted
        if len(skip_char):
            beta_nn_input = pd.DataFrame(beta_nn_input.T, columns=CHARAS_LIST)  # N*P
            beta_nn_input[skip_char] = beta_nn_input[skip_char] * 0.0
            beta_nn_input = beta_nn_input.values.T  # P*N

        beta_nn_input = torch.tensor(beta_nn_input, dtype=torch.float32).T.to(self.device)  # N*P
        return self.beta_nn(beta_nn_input)

    def calFactor(self, month, skip_char=None):
        """Return ``factor_nn`` applied to the month's portfolio returns (K*1 per the original comments).

        Columns listed in ``skip_char`` are zeroed before the forward pass.
        The result is also appended to ``self.factor_nn_pred``.
        """
        skip_char = [] if skip_char is None else skip_char
        _, _, factor_nn_input, _ = self._get_item(month)  # factor input: P*1

        # if some variables need be omitted
        if len(skip_char):
            factor_nn_input = pd.DataFrame(factor_nn_input.T, columns=CHARAS_LIST)  # 1*P
            factor_nn_input[skip_char] = factor_nn_input[skip_char] * 0.0
            factor_nn_input = factor_nn_input.values.T  # P*1

        factor_nn_input = torch.tensor(factor_nn_input, dtype=torch.float32).T.to(self.device)  # 1*P
        factor_pred = self.factor_nn(factor_nn_input).T

        self.factor_nn_pred.append(factor_pred)

        return factor_pred

    def inference(self, month):
        """Compute ``beta @ factor`` for ``month``.

        Returns:
            With no omitted characteristics: the tensor ``mon_beta @ mon_factor``
            (N*1).  Otherwise: a NumPy array with one column per omitted
            characteristic plus a final column for the complete model.
        """
        if len(self.omit_char) == 0:
            assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"

            mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)

            assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"

            # R_{N*1} = Beta_{N*K} @ F_{K*1}
            return mon_beta @ mon_factor

        ret_R = []
        for char in self.omit_char:
            mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char])
            ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy())  # N*1

        # also add the complete (nothing omitted) result as the last column
        mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
        ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy())

        return np.array(ret_R).squeeze(2).T  # N*m

    def cal_delayed_Factor(self, month):
        """Return the factor estimate used for prediction.

        With ``refit_cnt == 0`` this is the first stored factor estimate;
        otherwise it is the mean of the first ``refit_cnt`` stored estimates.
        ``month`` is currently unused.
        """
        if self.refit_cnt == 0:
            return self.factor_nn_pred[0]  # input of the first predict takes hat{f}_t
        return torch.mean(torch.stack(self.factor_nn_pred[:self.refit_cnt]), dim=0)

    def reset_weight(self):
        """Re-initialise every resettable layer of both networks and clear the optimizer state."""
        for network in (self.beta_nn, self.factor_nn):
            for layer in network:
                if hasattr(layer, 'reset_parameters'):
                    layer.reset_parameters()

        self.optimizer.state = collections.defaultdict(dict)  # reset optimizer state

    def release_gpu(self):
        """Drop the references to the data loaders and empty the CUDA cache.

        The attributes are reset to ``None`` rather than deleted, so the
        method can safely be called repeatedly.
        """
        self.train_dataloader = None
        self.valid_dataloader = None
        self.test_dataloader = None
        torch.cuda.empty_cache()
class CA2(CA_base):
    """CA model whose beta network has two hidden layers (94 -> 32 -> 16 -> K).

    The factor network is a single linear layer (94 -> K).
    """

    def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=None, device='cuda'):
        """Build the networks, the Adam optimizer and the MSE loss.

        Args:
            hidden_size: output width K of both networks.
            dropout: dropout probability after each hidden layer.
            lr: Adam learning rate.
            omit_char: optional list of characteristics to omit; ``None``
                means an empty list.
            device: device forwarded to ``CA_base`` and used for the loss.
        """
        # Resolve the default here so no list object is shared between instances.
        omit_char = [] if omit_char is None else omit_char
        CA_base.__init__(self, name=f'CA2_{hidden_size}', omit_char=omit_char, device=device)
        self.dropout = dropout
        # P -> 32 -> 16 -> K
        self.beta_nn = nn.Sequential(
            # hidden layer 1
            nn.Linear(94, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            # hidden layer 2
            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            # output layer
            nn.Linear(16, hidden_size),
        )
        self.factor_nn = nn.Sequential(
            nn.Linear(94, hidden_size),
        )

        # Created after both networks are registered so it sees all parameters.
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.MSELoss().to(device)
class IPCA(modelBase):
    """IPCA model fitted by an alternating least-squares iteration on ``gamma``.

    ``train_period`` / ``valid_period`` / ``refit_cnt`` are read here but not
    defined in this class; presumably they are provided by ``modelBase`` (not
    visible in this file).
    """

    def __init__(self, K, omit_char=None):
        """Initialise ``gamma`` randomly (seed 10) and load the pickled data.

        Args:
            K: number of latent factors.
            omit_char: optional list of characteristics to omit at inference
                time; ``None`` means an empty list.
        """
        super(IPCA, self).__init__(f'IPCA_{K}')
        self.K = K
        # Fresh list per instance (avoids a shared mutable default argument).
        self.omit_char = [] if omit_char is None else omit_char
        np.random.seed(10)
        # One row per characteristic: gamma must match the columns of Z.
        self.gamma = np.random.random([len(CHARAS_LIST), self.K])
        self.valid_error = []
        self.__prepare_data()

    def __prepare_data(self):
        """Load portfolio returns, characteristics and the month list from ``data/``."""
        self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')
        self.p_charas = pd.read_pickle('data/p_charas.pkl')
        self.mon_list = pd.read_pickle('data/mon_list.pkl')

    def _month_panel(self, month):
        """Return ``(Z, y)`` for ``month``: characteristics (N * P) and returns (N * 1)."""
        Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].values
        y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].values.T
        return Z, y

    @staticmethod
    def _ols_factor(beta, y):
        """Least-squares factor estimate ``(beta' beta)^-1 beta' y`` (K * 1).

        Uses ``np.linalg.inv`` on plain arrays instead of the deprecated
        ``np.matrix(...).I``; the sequence of operations is unchanged.
        """
        return np.linalg.inv(beta.T @ beta) @ beta.T @ y

    def __months_between(self, period):
        """Months of ``mon_list`` within ``period`` (both bounds inclusive)."""
        return self.mon_list[(self.mon_list >= period[0]) & (self.mon_list <= period[1])]

    def __valid(self):
        """Return the summed squared residual over the validation period and record it."""
        MSE_set = []
        for mon in self.__months_between(self.valid_period):
            Z, y = self._month_panel(mon)
            beta = Z @ self.gamma  # N * K
            f_hat = self._ols_factor(beta, y)  # K * 1
            residual = y - beta @ f_hat
            MSE_set.append(np.sum(residual ** 2))

        valid_error = sum(MSE_set)
        self.valid_error.append(valid_error)
        return valid_error

    def __gamma_iter(self, gamma_old):
        """Perform one update of ``gamma`` over the training period.

        Args:
            gamma_old: current (P, K) loading matrix.

        Returns:
            The updated (P, K) matrix.
        """
        n_chars, n_factors = gamma_old.shape
        size = n_chars * n_factors
        numer = np.zeros((size, 1))
        denom = np.zeros((size, size))
        for mon in self.__months_between(self.train_period):
            Z, y = self._month_panel(mon)
            f_hat = self._ols_factor(Z @ gamma_old, y)  # K * 1
            kron_fz = np.kron(f_hat, Z.T)  # computed once, used in both sums
            numer += kron_fz @ y
            denom += kron_fz @ np.kron(f_hat.T, Z)

        gamma_new = (np.linalg.pinv(denom) @ numer).reshape(n_factors, n_chars)
        return gamma_new.T

    def train_model(self):
        """Iterate ``gamma`` until validation error fails to improve five times in a row.

        The ``gamma`` with the lowest validation error is kept.
        """
        update_cnt = 0
        min_valid_err = np.inf  # np.Inf was removed in NumPy 2.0
        best_gamma = np.zeros((len(CHARAS_LIST), self.K))
        while update_cnt < 5:
            self.gamma = self.__gamma_iter(self.gamma)
            valid_error = self.__valid()
            if valid_error < min_valid_err:
                min_valid_err = valid_error
                best_gamma = self.gamma
                update_cnt = 0
            else:
                update_cnt += 1

        self.gamma = best_gamma

    def inference(self, month):
        """Return fitted returns ``beta @ f_hat`` for ``month``.

        Returns:
            With no omitted characteristics: a flat array of length N.
            Otherwise: an (N, m) array with one column per omitted
            characteristic and a final column for the complete model.
        """
        if not len(self.omit_char):
            Z, y = self._month_panel(month)
            beta = Z @ self.gamma  # N * K
            return (beta @ self._ols_factor(beta, y)).flatten()

        inference_R = []
        Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST]
        y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST]

        for char in self.omit_char:
            # Deep copies: a shallow copy may share its buffers with Z / y, so
            # zeroing a column could leak into later iterations (behaviour
            # depends on the pandas version).
            Z_input = Z.copy()
            y_input = y.copy()
            Z_input[[char]] = Z_input[[char]] * 0.0
            y_input[[char]] = y_input[[char]] * 0.0
            Z_input = Z_input.values
            y_input = y_input.values.T
            beta = Z_input @ self.gamma
            inference_R.append((beta @ self._ols_factor(beta, y_input)).flatten())

        # complete model (nothing omitted) goes last
        beta = Z.values @ self.gamma
        inference_R.append((beta @ self._ols_factor(beta, y.values.T)).flatten())

        return np.array(inference_R).T  # N * m

    def predict(self, month):
        """Predict returns for ``month`` from the average of past factor estimates.

        With ``refit_cnt == 0`` this falls back to ``inference(month)``.
        Otherwise ``f_hat`` is estimated for every month from 19870101 up to
        (but excluding) ``month`` and ``beta @ mean(f_hat)`` is returned.
        """
        if self.refit_cnt == 0:
            return self.inference(month)

        lag_f_hat = []
        for mon in self.mon_list[(self.mon_list >= 19870101) & (self.mon_list < month)]:
            Z, y = self._month_panel(mon)
            lag_f_hat.append(self._ols_factor(Z @ self.gamma, y))  # K * 1

        Z, _ = self._month_panel(month)
        beta = Z @ self.gamma  # N * K

        # average of prevailing sample hat{f} (from 198701) up to t-1
        avg_lag_f = np.mean(lag_f_hat, axis=0)
        return beta @ avg_lag_f
name, omit_char=[], device='cuda'): 26 | nn.Module.__init__(self) 27 | modelBase.__init__(self, name) 28 | self.beta_seq = None 29 | self.factor_seq = None 30 | self.optimizer = None 31 | self.criterion = None 32 | self.omit_char = omit_char 33 | 34 | self.factor_seq_pred = [] 35 | 36 | self.device = device 37 | 38 | self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64) 39 | self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index() 40 | self.portfolio_ret= pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64) 41 | self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64) 42 | 43 | self.train_dataloader = None 44 | self.valid_dataloader = None 45 | self.test_dataloader = None 46 | 47 | 48 | def debug(self, month): 49 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST] 50 | # beta_seq_input = self.datashare_chara.loc[self.datashare_chara['DATE'] == month].set_index('permno')[charas] 51 | print(beta_seq_input) 52 | 53 | 54 | def _get_item(self, month): 55 | if month not in self.p_charas['DATE'].values: 56 | # find the closest month in p_charas to month 57 | month = self.p_charas['DATE'].values[np.argmin(np.abs(self.p_charas['DATE'].values - month))] 58 | 59 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST] # (94, 94) 60 | labels = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST].T.values # (94, 1) 61 | beta_seq_input['ret-rf'] = labels 62 | align_df = beta_seq_input.copy(deep=False).dropna() 63 | 64 | factor_seq_input = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST] 65 | 66 | # exit(0) if there is any nan in align_df 67 | if align_df.isnull().values.any(): 68 | assert False, f'There is nan in align_df of : {month}' 69 | # return stock index (L), beta_seq_input (94*94=P*N), factor_seq_input (94*1=P*1), labels (94, = N,) 70 | return align_df.index, align_df.values[:, :-1].T, 
factor_seq_input.T.values , align_df.values[:, -1].T 71 | 72 | 73 | def dataloader(self, period): 74 | mon_list = pd.read_pickle('data/mon_list.pkl') 75 | mon_list = mon_list.loc[(mon_list >= period[0]) & (mon_list <= period[1])] 76 | beta_seq_input_set = [] 77 | factor_seq_input_set = [] 78 | label_set = [] 79 | for mon in mon_list: 80 | _, _beta_input, _factor_input, label = self._get_item(mon) 81 | beta_seq_input_set.append(_beta_input) 82 | factor_seq_input_set.append(_factor_input) 83 | label_set.append(label) 84 | 85 | beta_seq_input_set = torch.tensor(beta_seq_input_set, dtype=torch.float32).to(self.device) 86 | factor_seq_input_set = torch.tensor(factor_seq_input_set, dtype=torch.float32).to(self.device) 87 | label_set = torch.tensor(label_set, dtype=torch.float32).to(self.device) 88 | 89 | dataset = TensorDataset(beta_seq_input_set, factor_seq_input_set, label_set) 90 | return DataLoader(dataset, batch_size=1, shuffle=True) 91 | 92 | 93 | def forward(self, char, pfret): 94 | processed_char = self.beta_seq(char) 95 | 96 | encoder_hidden = self.encoder_factor_seq.initHidden() 97 | input_length = char.size(0) 98 | encoder_outputs = torch.zeros(input_length, self.hidden_size, device=device) 99 | 100 | for ei in range(input_length): 101 | encoder_output, encoder_hidden = self.encoder_factor_seq(char[ei], encoder_hidden) 102 | encoder_outputs[ei] = encoder_output[0, 0] 103 | 104 | # Factor Seq: Decoding 105 | decoder_input = torch.tensor([[0]], device=device) # Replace 0 with whatever start token you use 106 | decoder_hidden = encoder_hidden 107 | decoded_sequence = torch.zeros(input_length, 94, device=device) # Assuming output size is 94 108 | 109 | for di in range(input_length): 110 | decoder_output, decoder_hidden = self.decoder_factor_seq(decoder_input, decoder_hidden) 111 | decoded_sequence[di] = decoder_output 112 | decoder_input = decoder_output.argmax(1) 113 | 114 | # Now, 'decoded_sequence' can be used as 'factor_seq' 115 | processed_pfret = 
decoded_sequence 116 | return torch.sum(processed_char * processed_pfret, dim=1) 117 | 118 | 119 | # train_one_epoch 120 | def __train_one_epoch(self): 121 | epoch_loss = 0.0 122 | for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.train_dataloader): 123 | self.optimizer.zero_grad() 124 | # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P) 125 | # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P) 126 | # labels reshape: (1, 94) -> (94, ) (1*N => N,) 127 | beta_seq_input = beta_seq_input.squeeze(0).T 128 | factor_seq_input = factor_seq_input.squeeze(0).T 129 | labels = labels.squeeze(0) 130 | output = self.forward(beta_seq_input, factor_seq_input) 131 | loss = self.criterion(output, labels) 132 | 133 | loss.backward() 134 | self.optimizer.step() 135 | epoch_loss += loss.item() 136 | 137 | if i % 100 == 0: 138 | # print(f'Batches: {i}, loss: {loss.item()}') 139 | pass 140 | 141 | return epoch_loss / len(self.train_dataloader) 142 | 143 | 144 | def __valid_one_epoch(self): 145 | epoch_loss = 0.0 146 | for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.valid_dataloader): 147 | # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P) 148 | # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P) 149 | # labels reshape: (1, 94) -> (94, ) (1*N => N,) 150 | beta_seq_input = beta_seq_input.squeeze(0).T 151 | factor_seq_input = factor_seq_input.squeeze(0).T 152 | labels = labels.squeeze(0) 153 | 154 | output = self.forward(beta_seq_input, factor_seq_input) 155 | loss = self.criterion(output, labels) 156 | epoch_loss += loss.item() 157 | 158 | return epoch_loss / len(self.valid_dataloader) 159 | 160 | 161 | def train_model(self): 162 | if 'saved_models' not in os.listdir('./'): 163 | os.mkdir('saved_models') 164 | 165 | self.train_dataloader = self.dataloader(self.train_period) 166 | self.valid_dataloader = self.dataloader(self.valid_period) 167 | self.test_dataloader = self.dataloader(self.test_period) 
def test_model(self):
    """Run the model on one batch of the test dataloader and report the loss.

    The first batch yielded by ``self.test_dataloader`` is reshaped the same
    way the training and validation loops reshape their batches (drop the
    leading batch axis, transpose the two inputs) before it is passed to
    ``self.forward``.

    Returns:
        tuple: ``(output, labels)`` for the evaluated batch.

    Raises:
        RuntimeError: if the test dataloader is missing or yields no batch.
    """
    if self.test_dataloader is None:
        raise RuntimeError('test_dataloader is not initialised; call train_model() first')

    output = None
    labels = None
    # The loader yields 3-tuples; the previous `enumerate` + 4-name unpacking
    # raised ValueError on the very first batch.
    for beta_seq_input, factor_seq_input, labels in self.test_dataloader:
        # Same reshaping as the train/valid loops: remove the batch axis of
        # size 1 and transpose the two network inputs.
        beta_seq_input = beta_seq_input.squeeze(0).T
        factor_seq_input = factor_seq_input.squeeze(0).T
        labels = labels.squeeze(0)
        output = self.forward(beta_seq_input, factor_seq_input)
        break  # only a single batch is evaluated

    if output is None:
        raise RuntimeError('test_dataloader produced no batches')

    loss = self.criterion(output, labels)
    print(f'Test loss: {loss.item()}')
    print(f'Predicted: {output}')
    print(f'Ground truth: {labels}')
    return output, labels
P*N 226 | 227 | # if some variables need be omitted 228 | if len(skip_char): 229 | beta_seq_input = pd.DataFrame(beta_seq_input.T, columns=CHARAS_LIST) # N*P 230 | beta_seq_input[skip_char] = beta_seq_input[skip_char] * 0.0 231 | beta_seq_input = beta_seq_input.values.T # P*N 232 | 233 | beta_seq_input = torch.tensor(beta_seq_input, dtype=torch.float32).T.to(self.device) # N*P 234 | return self.beta_seq(beta_seq_input) # N*K 235 | 236 | 237 | def calFactor(self, month, skip_char=[]): 238 | _, _, factor_seq_input, _ = self._get_item(month) # factor input: P*1 239 | 240 | # if some variables need be omitted 241 | if len(skip_char): 242 | factor_seq_input = pd.DataFrame(factor_seq_input.T, columns=CHARAS_LIST) # 1*P 243 | factor_seq_input[skip_char] = factor_seq_input[skip_char] * 0.0 244 | factor_seq_input = factor_seq_input.values.T # P*1 245 | 246 | factor_seq_input = torch.tensor(factor_seq_input, dtype=torch.float32).T.to(self.device) # 1*P 247 | factor_pred = self.factor_seq(factor_seq_input).T # K*1 248 | 249 | self.factor_seq_pred.append(factor_pred) 250 | 251 | return factor_pred # K*1 252 | 253 | 254 | def inference(self, month): 255 | if len(self.omit_char) == 0: 256 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}" 257 | 258 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 259 | 260 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}" 261 | 262 | # R_{N*1} = Beta_{N*K} @ F_{K*1} 263 | return mon_beta @ mon_factor 264 | else: 265 | ret_R = [] 266 | for char in self.omit_char: 267 | mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char]) 268 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # N*1 269 | 270 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 271 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # also add complete 
def reset_weight(self):
    """Re-initialise the learnable layers and clear the optimizer state.

    Every sub-module of the beta network and of the factor network(s) that
    exposes ``reset_parameters`` is reset. Networks that are not set (``None``
    or absent) are skipped, and the optimizer state is cleared only when an
    optimizer exists.
    """
    # `modules()` walks nested sub-modules, so this works for containers such
    # as nn.Sequential as well as for composite modules that cannot be
    # iterated directly (iterating those, or None, raised TypeError before).
    for attr in ('beta_seq', 'factor_seq', 'encoder_factor_seq', 'decoder_factor_seq'):
        network = getattr(self, attr, None)
        if network is None:
            continue
        for layer in network.modules():
            if hasattr(layer, 'reset_parameters'):
                layer.reset_parameters()

    if self.optimizer is not None:
        # drop per-parameter optimizer buffers accumulated so far
        self.optimizer.state = collections.defaultdict(dict)
334 | 335 | self.embedding = nn.Embedding(output_size, hidden_size) 336 | self.gru = nn.GRU(hidden_size, hidden_size) 337 | self.out = nn.Linear(hidden_size, output_size) 338 | self.softmax = nn.LogSoftmax(dim=1) 339 | 340 | def forward(self, input, hidden): 341 | output = self.embedding(input).view(1, 1, -1) 342 | output = F.relu(output) 343 | output, hidden = self.gru(output, hidden) 344 | output = self.softmax(self.out(output[0])) 345 | return output, hidden 346 | 347 | def initHidden(self): 348 | result = Variable(torch.zeros(1, 1, self.hidden_size)) 349 | if use_cuda: 350 | return result.cuda() 351 | else: 352 | return result 353 | 354 | 355 | 356 | 357 | class seq2seq0(seq2seq_base): 358 | def __init__(self, hidden_size, lr=0.001, omit_char=[], device='cuda'): 359 | seq2seq_base.__init__(self, name=f'seq2seq0_{hidden_size}', omit_char=omit_char, device=device) 360 | # P -> K 361 | self.beta_seq = nn.Sequential( 362 | # output layer 363 | nn.Linear(94, hidden_size) 364 | ) 365 | # Initialize the encoder and decoder for factor_seq 366 | self.encoder_factor_seq = EncoderRNN(94, hidden_size).to(device) 367 | self.decoder_factor_seq = DecoderRNN(hidden_size, 94).to(device) # output size is 94 368 | 369 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 370 | self.criterion = nn.MSELoss().to(device) 371 | 372 | 373 | 374 | class seq2seq1(seq2seq_base): 375 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'): 376 | seq2seq_base.__init__(self, name=f'seq2seq1_{hidden_size}', omit_char=omit_char, device=device) 377 | self.dropout = dropout 378 | # P -> 32 -> K 379 | self.beta_seq = nn.Sequential( 380 | # hidden layer 1 381 | nn.Linear(94, 32), 382 | nn.BatchNorm1d(32), 383 | nn.ReLU(), 384 | nn.Dropout(self.dropout), 385 | # output layer 386 | nn.Linear(32, hidden_size) 387 | ) 388 | self.encoder_factor_seq = EncoderRNN(94, hidden_size).to(device) 389 | self.decoder_factor_seq = DecoderRNN(hidden_size, 94).to(device) 390 | 
class seq2seq2(seq2seq_base):
    """Sequence model whose beta network has two hidden layers (94 -> 32 -> 16 -> K)."""

    def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
        """Build the beta network, the factor encoder/decoder, optimizer and loss.

        Args:
            hidden_size: output width of the beta network and hidden width of
                the encoder/decoder; also used in the model name.
            dropout: dropout probability applied after each hidden layer.
            lr: learning rate of the Adam optimizer.
            omit_char: forwarded unchanged to the base class.
            device: device the encoder, decoder and loss are moved to.
        """
        seq2seq_base.__init__(self, name=f'seq2seq2_{hidden_size}', omit_char=omit_char, device=device)
        self.dropout = dropout
        # P -> 32 -> 16 -> K
        # The network must be stored as `beta_seq`: the base class initialises
        # that attribute to None and reads it in forward/calBeta/reset_weight.
        # Storing it under `beta_nn` left `beta_seq` as None.
        self.beta_seq = nn.Sequential(
            # hidden layer 1
            nn.Linear(94, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            # hidden layer 2
            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            # output layer
            nn.Linear(16, hidden_size)
        )
        self.encoder_factor_seq = EncoderRNN(94, hidden_size).to(device)
        self.decoder_factor_seq = DecoderRNN(hidden_size, 94).to(device)

        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.MSELoss().to(device)
class modelBase:
    """Shared interface and bookkeeping for the asset-pricing models.

    Subclasses supply ``calBeta``, ``calFactor`` and ``cal_delayed_Factor``;
    this base class combines their results in ``inference`` / ``predict`` and
    keeps track of the train / validation / test windows.
    """

    def __init__(self, name):
        """Store the model name and set the default sample windows."""
        self.name = name
        self.train_idx = 0
        self.refit_cnt = 0

        # Default windows follow the original paper; each bound is an integer
        # date in YYYYMMDD form.
        self.train_period = [19570101, 19741231]
        self.valid_period = [19750101, 19861231]
        self.test_period = [19870101, 19871231]

    def train_model(self):
        """Train the model. No-op here; subclasses override it."""

    def calBeta(self, month):
        """
        Calculate specific month's beta. Should be specified by different models
        -> return np.array, dim = (N, K)
        """

    def calFactor(self, month):
        """
        Calculate specific month's factor. Should be specified by different models
        -> return np.array, dim = (K, 1)
        """

    def cal_delayed_Factor(self, month):
        """
        Calculate delayed month's factor, i.e. mean average of factors up to t-1. Should be specified by different models
        -> return np.array, dim = (K, 1)
        """

    def inference(self, month):
        """Return ``beta @ factor`` for ``month`` (N*1 = N*K @ K*1).

        Asserts that ``month`` is not earlier than the start of the test
        window and that the inner dimensions of beta and factor agree.
        """
        assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"

        # factor is evaluated before beta, as subclasses may record state
        mon_factor = self.calFactor(month)
        mon_beta = self.calBeta(month)

        assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"

        return mon_beta @ mon_factor

    def predict(self, month):
        """Return ``beta @ lagged_factor`` for ``month`` (N*1 = N*K @ K*1).

        Asserts that ``month`` lies inside the test window and that the inner
        dimensions of beta and the delayed factor agree.
        """
        window_start, window_end = self.test_period[0], self.test_period[1]
        assert window_start <= month <= window_end, f"Month error, {month} is not in test period {self.test_period}"

        lag_factor = self.cal_delayed_Factor(month)
        mon_beta = self.calBeta(month)

        assert mon_beta.shape[1] == lag_factor.shape[0], f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}"

        return mon_beta @ lag_factor

    def refit(self):
        """Roll all three windows forward and count the refit.

        Adding 10000 to a YYYYMMDD integer advances it by one year. Both
        bounds of every window move, i.e. the training window rolls rather
        than expands.
        """
        for attr in ('train_period', 'valid_period', 'test_period'):
            shifted = pd.Series(getattr(self, attr)) + 10000
            setattr(self, attr, shifted.to_list())
        self.refit_cnt += 1
torch.utils.data import Dataset, DataLoader, TensorDataset 15 | 16 | use_cuda = torch.cuda.is_available() 17 | 18 | MAX_EPOCH = 200 19 | 20 | 21 | class seq2seq_base(nn.Module, modelBase): 22 | def __init__(self, name, omit_char=[], device='cuda'): 23 | nn.Module.__init__(self) 24 | modelBase.__init__(self, name) 25 | self.beta_seq = None 26 | self.factor_seq = None 27 | self.optimizer = None 28 | self.criterion = None 29 | self.omit_char = omit_char 30 | 31 | self.factor_seq_pred = [] 32 | 33 | self.device = device 34 | 35 | self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64) 36 | self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index() 37 | self.portfolio_ret = pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64) 38 | self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64) 39 | 40 | self.train_dataloader = None 41 | self.valid_dataloader = None 42 | self.test_dataloader = None 43 | 44 | def debug(self, month): 45 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST] 46 | # beta_seq_input = self.datashare_chara.loc[self.datashare_chara['DATE'] == month].set_index('permno')[charas] 47 | print(beta_seq_input) 48 | 49 | def _get_item(self, month): 50 | if month not in self.p_charas['DATE'].values: 51 | # find the closest month in p_charas to month 52 | month = self.p_charas['DATE'].values[np.argmin(np.abs(self.p_charas['DATE'].values - month))] 53 | 54 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST] # (94, 94) 55 | labels = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST].T.values # (94, 1) 56 | beta_seq_input['ret-rf'] = labels 57 | align_df = beta_seq_input.copy(deep=False).dropna() 58 | 59 | factor_seq_input = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST] 60 | 61 | # exit(0) if there is any nan in align_df 62 | if align_df.isnull().values.any(): 63 | assert False, 
def forward(self, char, pfret):
    """Combine the beta network and the factor network into fitted returns.

    Args:
        char: input passed to ``self.beta_seq``.
        pfret: input passed to ``self.factor_seq``.

    Returns:
        The matrix product ``torch.mm(beta, factor)`` of the two network
        outputs; both outputs must therefore be 2-D with matching inner size.
    """
    # NOTE(review): the result of torch.mm is 2-D, while the training loops
    # in this file squeeze `labels` to 1-D before computing the loss; if the
    # factor output is a single column the loss may broadcast - confirm.
    loadings = self.beta_seq(char)
    factors = self.factor_seq(pfret)
    return torch.mm(loadings, factors)
def train_model(self):
    """Train with early stopping and restore the best checkpoint.

    Builds the train / validation / test dataloaders from the current
    periods, runs at most ``MAX_EPOCH`` epochs, writes the state dict to
    ``./saved_models/{self.name}.pt`` whenever the validation loss improves,
    stops once three consecutive epochs bring no improvement, and finally
    reloads the saved checkpoint.

    Returns:
        tuple: ``(train_loss, valid_loss)``, the per-epoch loss lists.
    """
    # exist_ok makes this a no-op when the directory is already there
    os.makedirs('saved_models', exist_ok=True)

    self.train_dataloader = self.dataloader(self.train_period)
    self.valid_dataloader = self.dataloader(self.valid_period)
    self.test_dataloader = self.dataloader(self.test_period)

    # `np.Inf` was removed in NumPy 2.0; `np.inf` is the supported spelling
    min_error = np.inf
    no_update_steps = 0
    valid_loss = []
    train_loss = []
    for i in range(MAX_EPOCH):
        train_error = self.__train_one_epoch()
        train_loss.append(train_error)

        self.eval()
        # validation pass, no gradients needed
        with torch.no_grad():
            valid_error = self.__valid_one_epoch()

        valid_loss.append(valid_error)
        if valid_error < min_error:
            min_error = valid_error
            no_update_steps = 0
            # keep the best model seen so far
            torch.save(self.state_dict(), f'./saved_models/{self.name}.pt')
        else:
            no_update_steps += 1

        if no_update_steps > 2:  # early stop after 3 consecutive epochs without improvement
            print(f'Early stop at epoch {i}')
            break

    # load from (best) saved model
    self.load_state_dict(torch.load(f'./saved_models/{self.name}.pt'))
    return train_loss, valid_loss
213 | 214 | # if some variables need be omitted 215 | if len(skip_char): 216 | factor_seq_input = pd.DataFrame(factor_seq_input.T, columns=CHARAS_LIST) # 1*P 217 | factor_seq_input[skip_char] = factor_seq_input[skip_char] * 0.0 218 | factor_seq_input = factor_seq_input.values.T # P*1 219 | 220 | factor_seq_input = torch.tensor(factor_seq_input, dtype=torch.float32).T.to(self.device) # 1*P 221 | factor_pred = self.factor_seq(factor_seq_input) # K*1 222 | 223 | self.factor_seq_pred.append(factor_pred) 224 | 225 | return factor_pred # K*1 226 | 227 | def inference(self, month): 228 | if len(self.omit_char) == 0: 229 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}" 230 | 231 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 232 | 233 | assert mon_beta.shape[1] == mon_factor.shape[ 234 | 0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}" 235 | 236 | # R_{N*1} = Beta_{N*K} @ F_{K*1} 237 | return mon_beta @ mon_factor 238 | else: 239 | ret_R = [] 240 | for char in self.omit_char: 241 | mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char]) 242 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # N*1 243 | 244 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 245 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # also add complete result 246 | 247 | return np.array(ret_R).squeeze(2).T # N*m 248 | 249 | def cal_delayed_Factor(self, month): 250 | # calculate the last day of the previous month 251 | if self.refit_cnt == 0: 252 | avg_f_pred = self.factor_seq_pred[0] # input of the first predict take hat{f}_t 253 | # print(avg_f_pred.shape) 254 | else: 255 | avg_f_pred = torch.mean(torch.stack(self.factor_seq_pred[:self.refit_cnt]), dim=0) 256 | 257 | return avg_f_pred 258 | 259 | def reset_weight(self): 260 | for layer in self.beta_seq: # reset beta_nn parameters 261 | if hasattr(layer, 
class EncoderRNN(nn.Module):
    """GRU encoder: per-step linear embedding + ReLU followed by a GRU."""

    def __init__(self, input_size, hidden_size):
        """Create the embedding, activation and batch-first GRU layers.

        Args:
            input_size: input width of the embedding layer. ``forward``
                reshapes its input to a trailing dimension of 1, so the
                embedding only accepts that input when ``input_size`` is 1.
            hidden_size: width of the embedding output and of the GRU state.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.embedding = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input, hidden):
        """Encode a 2-D ``(batch, length)`` input.

        Args:
            input: 2-D tensor; each scalar entry becomes one sequence step.
            hidden: initial GRU state, e.g. from ``initHidden``.

        Returns:
            tuple: ``(output, hidden)`` as returned by the GRU.
        """
        b, l = input.shape
        # every scalar becomes a 1-feature time step: (b, l) -> (b, l, 1)
        input = input.reshape(b, l, 1)
        embedded = self.embedding(input)
        embedded = self.relu(embedded)
        output, hidden = self.gru(embedded, hidden)

        return output, hidden

    def initHidden(self, batch_size=1):
        """Return a zero initial state of shape ``(1, batch_size, hidden_size)``.

        The state is allocated on the device that holds this module's
        parameters, so it matches the module wherever it has been moved
        (previously it followed the global CUDA availability flag instead).
        """
        device = next(self.parameters()).device
        return torch.zeros(1, batch_size, self.hidden_size, device=device)
class seq2seq0(seq2seq_base):
    """Sequence model whose beta network is a single linear layer (94 -> K)."""

    def __init__(self, hidden_size, lr=0.001, omit_char=[], device='cuda'):
        """Build the beta network, the factor network, optimizer and loss.

        Args:
            hidden_size: hidden width of the factor network; also used in the
                model name.
            lr: learning rate of the Adam optimizer.
            omit_char: forwarded unchanged to the base class.
            device: device passed to the base class and used for the loss.
        """
        seq2seq_base.__init__(self, name=f'seq2seq0_{hidden_size}', omit_char=omit_char, device=device)
        self.hidden_size = hidden_size
        # P -> K; K is fixed to 94 as in the deeper seq2seq1..3 variants
        K = 94
        self.beta_seq = nn.Sequential(
            # output layer
            nn.Linear(94, K)
        )
        # The base class calls `self.factor_seq` in forward/calFactor and
        # initialises it to None, so the factor network has to be stored
        # under that name (the separate encoder/decoder attributes used
        # before were never reached by forward).
        self.factor_seq = FactorSeq(input_size=1, hidden_size=hidden_size)

        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.MSELoss().to(device)
class seq2seq2(seq2seq_base):
    """Sequence model whose beta network has two hidden layers (94 -> 32 -> 16 -> K)."""

    def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
        """Build the beta network, the factor network, optimizer and loss.

        Args:
            hidden_size: hidden width of the factor network; also used in the
                model name.
            dropout: dropout probability applied after each hidden layer.
            lr: learning rate of the Adam optimizer.
            omit_char: forwarded unchanged to the base class.
            device: device passed to the base class and used for the loss.
        """
        model_name = f'seq2seq2_{hidden_size}'
        seq2seq_base.__init__(self, name=model_name, omit_char=omit_char, device=device)
        self.hidden_size = hidden_size
        self.dropout = dropout

        # P -> 32 -> 16 -> K
        K = 94
        widths = (94, 32, 16)
        layers = []
        # one Linear/BatchNorm/ReLU/Dropout group per hidden layer
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.BatchNorm1d(fan_out))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(self.dropout))
        # output layer
        layers.append(nn.Linear(widths[-1], K))
        self.beta_seq = nn.Sequential(*layers)

        self.factor_seq = FactorSeq(input_size=1, hidden_size=hidden_size)

        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.MSELoss().to(device)
nn.BatchNorm1d(16), 438 | nn.ReLU(), 439 | nn.Dropout(self.dropout), 440 | # hidden layer 3 441 | nn.Linear(16, 8), 442 | nn.BatchNorm1d(8), 443 | nn.ReLU(), 444 | nn.Dropout(self.dropout), 445 | # output layer 446 | nn.Linear(8, K) 447 | ) 448 | self.factor_seq = FactorSeq(input_size=1, hidden_size=hidden_size) 449 | 450 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=0.01) 451 | self.criterion = nn.MSELoss().to(device) 452 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # # stock-level characteristics with index corresponding to original paper 5 | # annual_chara = { 6 | # 'absacc': 1, 'acc': 2, 'age': 4, 'agr': 5, 'bm': 9, 7 | # 'bm_ia': 10, 'cashdebt': 12, 'cashpr': 13, 'cfp': 14, 'cfp_ia': 15, 8 | # 'chatoia': 16, 'chcsho': 17, 'chempia': 18, 'chinv': 19, 'chpmia': 21, 9 | # 'convind': 24, 'currat': 25, 'depr': 26, 'divi': 27, 'divo': 28, 10 | # 'dy': 30, 'egr': 32, 'ep': 33, 'gma': 34, 'grcapx': 35, 11 | # 'grltnoa': 36, 'herf': 37, 'hire': 38, 'invest': 42, 'lev': 43, 12 | # 'lgr': 44, 'mve_ia': 52, 'operprof': 54, 'orgcap': 55, 'pchcapx_ia': 56, 13 | # 'pchcurrat': 57, 'pchdepr': 58, 'pchgm_pchsale': 59, 'pchquick': 60, 'pchsale_pchinvt': 61, 14 | # 'pchsale_pchrect': 62, 'pchsale_pchxsga': 63, 'pchsaleinv': 64, 'pctacc': 65, 'ps': 67, 15 | # 'quick': 68, 'rd': 69, 'rd_mve': 70, 'rd_sale': 71, 'realestate': 72, 16 | # 'roic': 77, 'salecash': 79, 'saleinv': 80, 'salerec': 81, 'secured': 82, 17 | # 'securedind': 83, 'sgr': 84, 'sin': 85, 'sp': 86, 'tang': 91, 'tb': 92 18 | # } 19 | 20 | # quarter_chara = { 21 | # 'aeavol': 3, 'cash': 11, 'chtx': 22, 'cinvest': 23, 22 | # 'ear': 31, 'ms': 50, 'nincr': 53, 'roaq': 74, 23 | # 'roavol': 75, 'roeq': 76, 'rsup': 78, 'stdacc': 89, 'stdcf': 90 24 | # } 25 | 26 | # month_chara = { 27 | # 'baspread': 6, 'beta': 7, 'betasq': 8, 
#     'chmom': 20,
#     'dolvol': 29, 'idiovol': 39, 'ill': 40, 'indmom': 41,
#     'maxret': 45, 'mom12m': 46, 'mom1m': 47, 'mom36m': 48,
#     'mom6m': 49, 'mvel1': 51, 'pricedelay': 66, 'retvol': 73,
#     'std_dolvol': 87, 'std_turn': 88, 'turn': 93, 'zerotrade': 94
# }

# Names of the 94 stock-level characteristics.
CHARAS_LIST = [
    'absacc', 'acc', 'age', 'agr', 'bm',
    'bm_ia', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia',
    'chatoia', 'chcsho', 'chempia', 'chinv', 'chpmia',
    'convind', 'currat', 'depr', 'divi', 'divo',
    'dy', 'egr', 'ep', 'gma', 'grcapx',
    'grltnoa', 'herf', 'hire', 'invest', 'lev',
    'lgr', 'mve_ia', 'operprof', 'orgcap', 'pchcapx_ia',
    'pchcurrat', 'pchdepr', 'pchgm_pchsale', 'pchquick', 'pchsale_pchinvt',
    'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'ps',
    'quick', 'rd', 'rd_mve', 'rd_sale', 'realestate',
    'roic', 'salecash', 'saleinv', 'salerec', 'secured',
    'securedind', 'sgr', 'sin', 'sp', 'tang', 'tb',
    'aeavol', 'cash', 'chtx', 'cinvest',
    'ear', 'ms', 'nincr', 'roaq',
    'roavol', 'roeq', 'rsup', 'stdacc', 'stdcf',
    'baspread', 'beta', 'betasq', 'chmom',
    'dolvol', 'idiovol', 'ill', 'indmom',
    'maxret', 'mom12m', 'mom1m', 'mom36m',
    'mom6m', 'mvel1', 'pricedelay', 'retvol',
    'std_dolvol', 'std_turn', 'turn', 'zerotrade',
]


# default hyper-parameters of the CA model
CA_DR = 0.5  # drop out rate
CA_LR = 0.001  # learning rate

# out of sample period (integers in YYYYMMDD form)
OOS_start = 19870101
OOS_end = 20161231


class HiddenPrints:
    """Context manager that silences ``print`` by pointing ``sys.stdout`` at ``os.devnull``.

    Usage::

        with HiddenPrints():
            noisy_call()

    Args:
        activated: when false, entering and leaving the ``with`` block does
            nothing, so output stays visible.

    ``close()`` hides output and ``open()`` brings it back; both may also be
    called directly. Calling either one twice in a row, or ``open()`` before
    any ``close()``, is a no-op, so the real stdout is never closed or lost.
    """

    def __init__(self, activated=True):
        self.activated = activated
        self.original_stdout = None
        # Handle on os.devnull while output is hidden, otherwise None.
        self._devnull = None

    def open(self):
        """Restore the stream saved by ``close()`` and release the devnull handle."""
        if self._devnull is None:
            # Nothing is hidden: closing sys.stdout here would destroy the
            # real stream.
            return
        self._devnull.close()
        self._devnull = None
        sys.stdout = self.original_stdout

    def close(self):
        """Remember the current ``sys.stdout`` and replace it with ``os.devnull``."""
        if self._devnull is not None:
            # Already hidden: saving sys.stdout again would overwrite the
            # real stream with the devnull handle.
            return
        self.original_stdout = sys.stdout
        # ``open`` here is the builtin; the method of the same name is only
        # reachable through ``self``.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull

    def __enter__(self):
        if self.activated:
            self.close()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised inside the block propagates
        # after stdout has been restored.
        if self.activated:
            self.open()


def git_push(message):
    """Stage ``results``, commit it and push, best effort.

    Runs ``git add results``, ``git commit -m "no_dropout: <message>"`` and
    ``git push`` in that order. A failing git command does not stop the ones
    after it and exit codes are ignored.

    Args:
        message: text appended to the ``no_dropout: `` commit-message prefix.
            It is passed to git as a single argument, so quotes, ``$`` or
            backticks in it are taken literally instead of being interpreted
            by a shell.
    """
    import subprocess

    commit_message = f'no_dropout: {message}'
    commands = (
        ['git', 'add', 'results'],
        ['git', 'commit', '-m', commit_message],
        ['git', 'push'],
    )
    for argv in commands:
        try:
            subprocess.run(argv, check=False)
        except OSError as err:
            # git is missing or not executable: report and give up rather
            # than crash the caller.
            print(f'git_push: could not run {argv[0]}: {err}', file=sys.stderr)
            return