├── .gitignore ├── DL_Finance_Project_2.pdf ├── README.md ├── R_squares ├── 2023-06-09_05-24-47.json ├── 2023-06-09_06-07-56.json ├── 2023-06-09_09-08-26.json └── 2023-06-11_18-16-38.json ├── analysis.py ├── data_prepare.py ├── imgs ├── R2_pred_table.png ├── R2_total_table.png ├── alpha │ ├── CA0_1_inference_alpha_plot.png │ ├── CA0_2_inference_alpha_plot.png │ ├── CA0_3_inference_alpha_plot.png │ ├── CA0_4_inference_alpha_plot.png │ ├── CA0_5_alpha_plot.png │ ├── CA0_5_inference_alpha_plot.png │ ├── CA0_6_inference_alpha_plot.png │ ├── CA1_1_inference_alpha_plot.png │ ├── CA1_2_inference_alpha_plot.png │ ├── CA1_3_inference_alpha_plot.png │ ├── CA1_4_inference_alpha_plot.png │ ├── CA1_5_inference_alpha_plot.png │ ├── CA1_6_inference_alpha_plot.png │ ├── CA2_1_inference_alpha_plot.png │ ├── CA2_2_inference_alpha_plot.png │ ├── CA2_3_inference_alpha_plot.png │ ├── CA2_4_inference_alpha_plot.png │ ├── CA2_5_inference_alpha_plot.png │ ├── CA2_6_inference_alpha_plot.png │ ├── CA3_1_inference_alpha_plot.png │ ├── CA3_2_inference_alpha_plot.png │ ├── CA3_3_inference_alpha_plot.png │ ├── CA3_4_inference_alpha_plot.png │ ├── CA3_5_inference_alpha_plot.png │ ├── CA3_6_inference_alpha_plot.png │ ├── FF_1_inference_alpha_plot.png │ ├── FF_2_inference_alpha_plot.png │ ├── FF_3_inference_alpha_plot.png │ ├── FF_4_inference_alpha_plot.png │ ├── FF_5_inference_alpha_plot.png │ ├── FF_6_inference_alpha_plot.png │ ├── IPCA_1_inference_alpha_plot.png │ ├── IPCA_2_inference_alpha_plot.png │ ├── IPCA_3_inference_alpha_plot.png │ ├── IPCA_4_inference_alpha_plot.png │ ├── IPCA_5_inference_alpha_plot.png │ ├── IPCA_6_inference_alpha_plot.png │ ├── PCA_1_inference_alpha_plot.png │ ├── PCA_2_inference_alpha_plot.png │ ├── PCA_3_inference_alpha_plot.png │ ├── PCA_4_inference_alpha_plot.png │ ├── PCA_5_inference_alpha_plot.png │ └── PCA_6_inference_alpha_plot.png ├── omit_char_R2_bias.png ├── pred_R2.png └── total_R2.png ├── main.py ├── models ├── CA.py ├── FF.py ├── IPCA.py ├── PCA.py └── 
modelBase.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # raw data 2 | data/ 3 | data.zip 4 | __MACOSX/ 5 | new_data.zip 6 | __pycache__ 7 | models/__pycache__ 8 | saved_models 9 | *_loss_*.png 10 | *.ipynb 11 | results/ 12 | logs/ 13 | R_squares/ 14 | -------------------------------------------------------------------------------- /DL_Finance_Project_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/DL_Finance_Project_2.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Autoencoder-Asset-Pricing-Models 2 | 3 | 🧐 [**Report**](https://www.richardsong.live/autoencoder-asset-pricing-models) | [**Report PDF file**](https://cloud.tsinghua.edu.cn/f/c02804bed00b4083bcb7/?dl=1) 4 | ## Set Up 5 | 6 | ```bash 7 | # generate preprocessed data and download portfolio returns 8 | python data_prepare.py 9 | 10 | # train models (ALL together) 11 | python main.py --Model 'FF PCA IPCA CA0 CA1 CA2 CA3' --K '1 2 3 4 5 6' 12 | 13 | # train models (selected models and K, for example) 14 | python main.py --Model 'IPCA CA3' --K '5 6' 15 | 16 | # analyze characteristics' importance (if needed) 17 | python main.py --Model 'IPCA CA0 CA1 CA2 CA3' --K '5' --omit_char 'absacc acc age agr bm bm_ia cashdebt cashpr cfp cfp_ia chatoia chcsho chempia chinv chpmia convind currat depr divi divo dy egr ep gma grcapx grltnoa herf hire invest lev lgr mve_ia operprof orgcap pchcapx_ia pchcurrat pchdepr pchgm_pchsale pchquick pchsale_pchinvt pchsale_pchrect pchsale_pchxsga pchsaleinv pctacc ps quick rd rd_mve rd_sale realestate roic salecash saleinv salerec secured securedind sgr sin sp tang tb aeavol cash chtx cinvest ear ms nincr roaq roavol roeq rsup 
stdacc stdcf baspread beta betasq chmom dolvol idiovol ill indmom maxret mom12m mom1m mom36m mom6m mvel1 pricedelay retvol std_dolvol std_turn turn zerotrade' 18 | 19 | # analyze models (calculate R^2, plot R^2 tables, bars and bias heatmap) 20 | python analysis.py 21 | ``` 22 | ## Results 23 | ### Total R^2 (%) 24 | 25 | 26 | 27 | 28 | ### Predict R^2 (%) 29 | 30 | 31 | 32 | ### Risk Premia v.s. Mispricing 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 | 48 | ### Characteristics Importance (reduced total R^2 (%), K=5) 49 | 50 | -------------------------------------------------------------------------------- /R_squares/2023-06-09_05-24-47.json: -------------------------------------------------------------------------------- 1 | {"models": ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"], "omit_char": [""], "R2_total": [0.08537139414421824, 0.1576019101919831, 0.1986486217133806, 0.20315476596988524, 0.31397093775365037, 0.3616431120471959]} -------------------------------------------------------------------------------- /R_squares/2023-06-09_06-07-56.json: -------------------------------------------------------------------------------- 1 | {"models": ["CA0_1", "CA0_2", "CA0_3", "CA0_4", "CA0_5", "CA0_6", "CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA1_6", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA2_6", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5", "CA3_6"], "omit_char": [""], "R2_total": [ 2 | 0.5677639760646643, 0.6761207104643877, 0.7076066105263652, 0.6913661386379286, 0.662602500272096, 0.7110612627936461, 3 | 0.5517562872860107, 0.7025783407556893, 0.685776051607686, 0.6664443573030849, 0.7006957708196195, 0.7052861947690043, 4 | 0.5967130036325399, 0.6626964974803786, 0.6608531336078073, 0.7070314610106503, 0.6462021917956272, 0.6767568343936613, 5 | 0.5531676704426002, 0.5249032928672436, 0.5642100044551001, 0.5458004779254889, 0.5558832641978944, 0.5235321637890534]} -------------------------------------------------------------------------------- /R_squares/2023-06-09_09-08-26.json: -------------------------------------------------------------------------------- 1 | {"models": ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"], "omit_char": [""], "R2_total": [0.4061160826307103, 0.5300364609271587, 0.5913033228863098, 0.6246396597772854, 0.6467919712825208, 0.6720178863573743]} -------------------------------------------------------------------------------- 
/R_squares/2023-06-11_18-16-38.json: -------------------------------------------------------------------------------- 1 | {"models": ["IPCA_5", "CA0_5", "CA1_5", "CA2_5", "CA3_5"], "omit_char": ["absacc", "acc", "age", "agr", "bm", "bm_ia", "cashdebt", "cashpr", "cfp", "cfp_ia", "chatoia", "chcsho", "chempia", "chinv", "chpmia", "convind", "currat", "depr", "divi", "divo", "dy", "egr", "ep", "gma", "grcapx", "grltnoa", "herf", "hire", "invest", "lev", "lgr", "mve_ia", "operprof", "orgcap", "pchcapx_ia", "pchcurrat", "pchdepr", "pchgm_pchsale", "pchquick", "pchsale_pchinvt", "pchsale_pchrect", "pchsale_pchxsga", "pchsaleinv", "pctacc", "ps", "quick", "rd", "rd_mve", "rd_sale", "realestate", "roic", "salecash", "saleinv", "salerec", "secured", "securedind", "sgr", "sin", "sp", "tang", "tb", "aeavol", "cash", "chtx", "cinvest", "ear", "ms", "nincr", "roaq", "roavol", "roeq", "rsup", "stdacc", "stdcf", "baspread", "beta", "betasq", "chmom", "dolvol", "idiovol", "ill", "indmom", "maxret", "mom12m", "mom1m", "mom36m", "mom6m", "mvel1", "pricedelay", "retvol", "std_dolvol", "std_turn", "turn", "zerotrade"], "R2_total": [1.5442819689237552e-05, -0.0008641544212459884, -0.0001747076111721091, -6.028723743389808e-05, 0.0004881307548909586, -9.453040450568828e-05, 0.0011809606037788134, -0.00047160860320083486, 0.0004497328261676703, 0.00035797708987406196, 0.00046514268643882417, 8.010321267071241e-05, 0.0005184883538837948, -0.00037818109386877907, -5.991642326275137e-05, 0.0002601835988962353, 0.0011439216429843801, 0.0017112133538985663, -4.350496459193387e-06, -0.00019380257770462705, 0.00043948388552850215, 0.00031580941297537635, 0.003338161066369527, 0.0021027676465422696, -0.0003171094148499698, 0.0014555971916409005, -0.0007270330656120594, 0.0016980056916262587, 0.0009553837759342931, 0.0027868308676146647, 0.00026402683391868464, 0.0007850527331111357, 0.0020449984309897085, 0.001054644787377823, 5.184771678434785e-06, 0.002178892159566903, 
-0.00017501728488655832, -0.0002577126732409285, 0.0009543687413273716, 0.0005083110135046809, 7.62809849720325e-05, 8.658720973830913e-05, 0.0004486453095020604, 0.0008906763104503668, 0.0010605476004982295, 0.0018694816639623912, 0.0002959972532902144, 0.0017008605711600344, 0.0007976861126247625, -0.002652802485675565, 0.002662754828368419, 9.917422749694538e-05, 5.964285215753762e-05, 0.0022189518272991426, -7.435817815093504e-05, -0.001698616694105315, 0.001233404502690938, 0.00047054690788606024, 0.0026004651808273493, 0.0011172780845385422, -0.0001482646856509895, -0.00041623695665415905, -0.0015909825974204095, -0.00031927406061749153, -0.00043655338823822554, 0.0001361268145957384, -0.00032109572478289383, 8.887812899915914e-05, 0.002067090875226163, 0.0010982209839661694, -0.0005637330110257466, -0.0005329012820900481, 0.0009084092837141622, 0.0021178568491788674, 0.0033586699296590528, 0.03875439641206746, 0.007293900327323799, 0.004763913685328358, 0.004817433432478202, 0.007084909806944539, 0.0025907095069127584, 0.0080607854611624, 0.0057093251947115675, 0.009105559086416637, 0.011542299391846145, 0.0028401829485293906, 0.02709723337680492, 0.004118952479980065, -0.00025137933247576516, 0.018393986829754794, 0.003372828092497837, 0.0013541844149635995, 0.00793699314016949, 0.0036518023385704312, -0.01840451865415549, 0.0035504742986338655, -0.02124055707570216, -0.003382787260082454, 0.001661945882751592, -8.216386260140318e-05, -0.011643569891665484, -0.0015886009587249283, -0.0060291572912980484, -0.003934251792188981, -0.0003035868273979503, 0.0025248914169603287, -0.0004996588466505969, 0.0009582432785717465, -0.0017444408411588785, -0.004805523268163303, -0.002931825562557, -0.014321167551977099, -0.005731926091473105, -0.001628883333692932, -0.008380269170191412, -0.002495037120210153, -0.013508701388505795, 0.001503549704314544, -0.0007751891041964942, -0.0007030969454133729, -0.0028340425016759596, 0.002562114878683408, 0.0007168131966963642, 
0.0005966917290103346, -0.0009823429746401713, -0.008049996513831648, -0.006935426188849236, -0.013752522526764732, -0.0005069144917650981, 0.0008889578779507357, -0.004511542543265024, 0.000512296750466934, -0.0002943171885458895, -0.00022271179096300386, -0.0021640651213321593, 0.0002623110944273144, -0.00026747487114753277, -0.0027015552337811277, -0.012375056721493305, -0.00010214235568639651, -0.012322425151997773, -0.004802562616962769, -0.0040789335496304036, 0.0004565130977485232, -0.0004572283160489965, 0.011102582602458222, -0.0020968867403851066, -0.0011853937188981423, -0.005591149809828, -0.010832419289181106, -0.001980246263801777, -0.0017540247099745443, 0.0018022912780594202, -0.0005056805241561158, -0.0017686227072655214, -0.0003527295679881526, 0.016529002299418005, -0.0014105862431510463, -0.0005732444581987295, -0.002869230597707273, -0.004965959446656232, 0.0004066440403122096, -0.009874817195565821, -0.030274345848102402, -0.012926682404285739, -0.0008783158084264553, -0.006107566065733372, -0.010184352477880187, -0.025327970485242712, -0.035590091815137614, -0.0303458179931807, -0.004102272911347127, -0.0004503926531160829, -0.029369754046835173, -0.007045255786105931, 0.0016875597888881266, -0.03481944581257812, -0.006678644726203498, -0.014538583275030104, -0.0007153352724629247, -0.011420023006262658, -0.004652763809388283, -0.005798820819408745, -0.024656178382399974, 0.0011578591100704916, -0.005280277401536693, 0.0010356576649497296, 0.003940580861575116, -0.005276868987642791, 0.00770392753313931, -0.007944469488529449, 0.001991397641811221, 0.003759694548383874, 0.0011195084904335184, -0.007110697689415635, 0.0032203225902148747, -0.007578862375151041, -0.0060194039077946515, 0.00024130283639867134, 0.0013235404395363082, 0.00043243599875075756, 0.001123780074328562, 0.00050973691967926, -0.00034757211361624574, -0.002281972957101308, 0.000845475795447137, -0.0020588684564848414, 0.0021372282150149413, -0.00019856545094609768, 
-0.0017913068956328937, 0.0006373160107784326, 0.0012739985965753986, -0.001696571054130902, -0.0002589327068805991, -0.00022288072963450034, -0.0031104722681001284, -0.0008235753576365523, -0.002447807642767752, 0.002004633800402944, -0.003224266216026561, 0.00864564856685679, -0.00554499989692514, -0.00026444194813035615, -0.0010032832870013886, -0.0032213627206253426, -0.000911160024681501, 0.0004076477827085201, 0.0007995060213979999, -0.0005885363844878588, 0.0032955989913380224, 0.0004894614286667931, 0.002261644799368967, -0.0032746021967355876, -0.0005524850007211368, -0.0031971616217488785, 0.0060628232866996035, -0.0018951843317909223, -0.004804670310596504, -0.016865397416521932, 0.0022279632468085175, 0.0004397955748330906, -0.0012895574791834674, -0.0052936056246406515, 0.001047196960554997, 0.0014035810745537391, -0.0021789054250395123, 0.004835618627879845, -0.004423137045597492, 0.0033839427269372058, 0.0004317648131846319, 0.010148684107688322, -0.0005852067203551137, -0.00023673829544612612, -0.0004969772951760598, -0.0007139106069110612, -0.0006781488781816281, -0.004463513032158084, -0.008314803798795345, 0.005664192158806869, -0.0017831527387602852, -0.005528449668342539, -0.00018355344047649158, -0.013367195182721558, 0.010476427486050932, -0.0030995623273649686, -0.0033128122970741414, 0.004656006266778645, -0.010849289595229128, -0.002397579088451285, -0.009958364378056861, -0.0011444352933996926, 0.0021166335185883733, -0.018000165477872865, -0.0032624694006755384, -0.004127065179238776, 0.012949255788881175, -0.0018095543583533935, -0.005592349116810835, 0.0037267653611244844, -0.0038516820526487416, 0.024489734744573943, 0.004683327531111781, 0.001269388150738071, -0.0017407506416220464, 0.011902507395885942, 0.004639839618832409, 0.002506386658879589, 0.0003685885982728232, 0.009029032176506746, 0.003135888070068371, -0.0016520809387076119, 0.003088001303703236, 0.00014727469647846103, 0.0035561159496605432, -2.876814735663924e-05, 
-1.1060450691990908e-06, -0.00046745210145082705, 0.0012870350251698026, 0.0042191397766525585, 0.00655774915823315, 0.0005207348452095362, 0.0023795699587515484, 0.0043810533565454834, 0.004609257547817935, 0.01290056247180027, 0.0014757109716835304, 0.0010271818164879765, 0.0015607194203527408, 0.0004559197274759397, 0.00033707593565324157, 0.002642703381714573, 0.006501328950226926, 0.0008704244332061739, 0.002463581165456641, 0.004364567880446146, -0.0018809110325389566, 0.00017150256113329654, 0.00017711236826822851, -7.174454357028459e-05, 3.2134260813054816e-05, -0.0018074430724653867, 0.0007488238809050252, -0.00021827012815400781, -0.0002562985350954561, 0.00023716735492340657, 0.001413468319080624, 0.006882923410709174, 0.0007931366577244026, -0.0004717657827425503, 0.009100363328674477, 0.005404627376927373, -0.004026623783044081, 0.005351064544848239, -0.0035389768278023537, 0.002068255236817085, -0.002090048742856676, -0.0019271886285645579, 0.0011498135139685894, 0.0018314054258520285, -0.00026283971236906734, 0.0015413975782866407, -0.004150725533632493, -0.001320978589455457, 0.00031339691627352284, -0.0016566925476326766, -0.0006220275171834322, 7.44998579660372e-05, -0.0002999797537633908, 0.0007030142990428478, 0.0007913912947924429, 0.0011556895007599488, 0.011568536396979079, 0.0028225455085252316, 0.00035905603670860486, -0.000841494670420162, 0.009615860981466828, 0.03377755052540243, 0.0789273078045386, 0.05525867670164508, -0.0010166697260519664, 0.00957764643602288, 0.06977357007791107, 0.006012541368369928, 0.002010875608860041, 0.03032162865092458, 0.002846750112344365, -0.00016217114898731122, 0.0060712783132312875, 0.006622682402069313, 0.006219871147256417, 0.0018001652923477218, 0.06992553842572524, 0.006724233973021909, 0.007539747957793552, 0.020538531613926936, 0.007943897075231354, 0.008360812789827254, 0.003290025441256006, 0.020634865090878085, 0.0008080876743085108, -0.0030892486063024416, 0.00041600088024251747, 
0.0070829632445147395, 0.0036536124147609206, -0.0016869382360118479, 0.0034263287960714095, -0.0006554657701027811, 0.0029753478023664126, 0.0006174977901268752, 7.101865790082318e-05, -0.00016257411215181428, -0.0009369820715646737, 0.0005434387564733356, 0.0012271814940856274, 0.0017175994942115747, 0.0005473661944555008, 0.005273938186585614, 0.0027605830798231867, 0.0053267195601733874, -0.0008641356489738072, 0.0003752916592929534, -8.763321084148679e-05, 0.0015245669500787429, -0.00012165228700977693, 0.0007204145703794129, 7.33961001830874e-06, 0.0007303311512011357, 0.002484907097797917, 0.012401690162314405, 0.002021342443194185, -0.00036144987650921223, -0.0012055569846747272, 0.00085115010144865, 2.829758256761572e-05, -3.762854703992513e-05, 0.0006878905429239524, -0.0007663772085586551, 0.0001906678540952722, 0.0001922411569781346, 0.001457809117053288, 0.0061655533145379415, -0.0002077338553436725, 0.004428694252206933, 0.007572691730434067, 0.015522139371950017, -0.0007973496961124482, 0.010913869326709902, -0.0009089608282075723, 0.001704153441209666, -0.001975343179565603, 0.0007767569381108563, 0.0001946668039775057, -0.0003049050619283733, -0.0006077903992949274, -0.0014501145568337481, 0.0024408470743457755, 0.0024867530414642847, -0.00028112355434628533, 0.00029719386409921, -0.0005815474877897131, -9.683347434896739e-05, 0.0015214781874257621, 0.00038445751824878194, -0.0006103274076518783, 0.0016081931094918955, 0.008255889709628095, 0.013087103367242059, 0.0004902027422780675, 0.007683661011998355, 0.0006227685572650632, 0.040959894211064496, 0.04659799783753149, 0.05623489106446333, -0.0011074112724636098, 0.0026292183555661763, 0.06648468153042297, 0.008507328563525984, 0.0012577368022517188, 0.05146040805311081, 0.006280620030596484, 0.0025791825898906495, 0.006777579940425715, 0.009547184931625763, 0.008159048875296504, 0.0008150683146730398, 0.06841724518294723, -0.0003192050956154491, 0.005694154235833304, 0.0024969631810545234, 
-0.001106586510389418]} -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from utils import * 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import plotly.figure_factory as ff 7 | 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | 12 | def calculate_R2(model, type, input=None, complete_r=None): 13 | portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 14 | oos_ret = portfolio_ret.loc[(portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)] 15 | 16 | if not isinstance(input, np.ndarray): 17 | # print('type: ', type) 18 | if isinstance(model, str): 19 | output_path = f'results/{type}/{model}_{type}.csv' 20 | else: 21 | output_path = f'results/{type}/{model.name}_{type}.csv' 22 | # print('path : ', output_path) 23 | model_output = pd.read_csv(output_path) 24 | else: 25 | model_output = input 26 | model_output = pd.DataFrame(model_output, columns=CHARAS_LIST) 27 | model_output['DATE'] = oos_ret['DATE'].to_list() 28 | 29 | for col in model_output.columns: # hard code for format error 30 | model_output[col] = model_output[col].apply(lambda x: float(str(x).replace('[', '').replace(']', ''))) 31 | 32 | residual_square = ((oos_ret.set_index('DATE') - model_output.set_index('DATE'))**2).dropna() 33 | residual_square = (1 - (residual_square == np.inf) * 1.0) * residual_square # drop Inf outliers 34 | 35 | total_square = oos_ret.set_index('DATE')**2 36 | total_square = (1 - (total_square == np.inf) * 1.0) * total_square # drop Inf outliers 37 | 38 | model_output_R2 = 1 - np.sum(residual_square.values)/np.sum(total_square.values) 39 | 40 | if not isinstance(input, np.ndarray): 41 | return model_output_R2 42 | 43 | else: 44 | no_omit_output = complete_r 45 | no_omit_output = pd.DataFrame(no_omit_output, columns=CHARAS_LIST) 46 | 
no_omit_output['DATE'] = oos_ret['DATE'].to_list() 47 | 48 | no_omit_residual_square = ((oos_ret.set_index('DATE') - no_omit_output.set_index('DATE'))**2).dropna() 49 | no_omit_residual_square = (1 - (no_omit_residual_square == np.inf) * 1.0) * no_omit_residual_square # drop Inf outliers 50 | 51 | no_omit_model_output_R2 = 1 - np.sum(no_omit_residual_square.values)/np.sum(total_square.values) 52 | 53 | return no_omit_model_output_R2 - model_output_R2 # the difference of R^2, i.e. the importance of characteristics 54 | 55 | 56 | 57 | def alpha_plot(model, type, save_dir='imgs'): 58 | if 'alpha' not in os.listdir(save_dir): 59 | os.mkdir(f'{save_dir}/alpha') 60 | 61 | portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 62 | oos_result = portfolio_ret.loc[(portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)].set_index('DATE') 63 | 64 | output_path = f'results/{type}/{model.name}_{type}.csv' 65 | inference_result = pd.read_csv(output_path) 66 | inference_result = inference_result.set_index('DATE') 67 | 68 | pricing_error_analysis = [] 69 | for col in CHARAS_LIST: 70 | raw_return = oos_result[col].mean() 71 | error = oos_result[col] - inference_result[col] 72 | alpha = error.mean() 73 | t_stat = abs(error.mean()/error.std()) * np.sqrt(oos_result.shape[0]) 74 | pricing_error_analysis.append([raw_return, alpha, t_stat]) 75 | 76 | pricing_error_analysis = pd.DataFrame(pricing_error_analysis, columns = ['raw ret', 'alpha', 't_stat'], index=CHARAS_LIST) 77 | 78 | lower_point = min(np.min(pricing_error_analysis['raw ret']), np.min(pricing_error_analysis['alpha'])) * 1.15 79 | upper_point = max(np.max(pricing_error_analysis['raw ret']), np.max(pricing_error_analysis['alpha'])) * 1.15 80 | 81 | significant_mask = pricing_error_analysis['t_stat'] > 3 82 | 83 | plt.scatter(pricing_error_analysis.loc[significant_mask]['raw ret'], pricing_error_analysis.loc[significant_mask]['alpha'], marker='^', color='r', alpha=0.6, 
label=f'#Alphas(|t|>3.0)={np.sum(significant_mask*1.0)}') 84 | plt.scatter(pricing_error_analysis.loc[~significant_mask]['raw ret'], pricing_error_analysis.loc[~significant_mask]['alpha'], marker='o', color='b', alpha=0.6, label=f'#Alphas(|t|<3.0)={94-np.sum(significant_mask*1.0)}') 85 | plt.plot(np.linspace(lower_point, upper_point, 10), np.linspace(lower_point, upper_point, 10), color='black') 86 | 87 | plt.ylabel('Alpha (%)') 88 | plt.xlabel('Raw Return (%)') 89 | plt.legend() 90 | 91 | plt.title(model.name) 92 | plt.savefig(f'{save_dir}/alpha/{model.name}_inference_alpha_plot.png') 93 | plt.close() 94 | 95 | 96 | def plot_R2_bar(R_df, type): 97 | 98 | R_df['Model'] = R_df[0].apply(lambda x: x.split('_')[0]) 99 | 100 | labels = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'] 101 | FF = (R_df.loc[R_df['Model']=='FF'][1]*100).to_list() 102 | PCA = (R_df.loc[R_df['Model']=='PCA'][1]*100).to_list() 103 | IPCA = (R_df.loc[R_df['Model']=='IPCA'][1]*100).to_list() 104 | CA0 = (R_df.loc[R_df['Model']=='CA0'][1]*100).to_list() 105 | CA1 = (R_df.loc[R_df['Model']=='CA1'][1]*100).to_list() 106 | CA2 = (R_df.loc[R_df['Model']=='CA2'][1]*100).to_list() 107 | CA3 = (R_df.loc[R_df['Model']=='CA3'][1]*100).to_list() 108 | 109 | 110 | x = np.arange(len(labels)) # x positions of the K-group labels 111 | width = 0.11 112 | 113 | fig, ax = plt.subplots(figsize=(15, 5)) 114 | ax.bar(x - width*3 , FF, width, label='FF', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[1])) 115 | ax.bar(x - width*2 , PCA, width, label='PCA', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[2])) 116 | ax.bar(x - width , IPCA, width, label='IPCA', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[3])) 117 | ax.bar(x + 0.00, CA0, width, label='CA0', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[4])) 118 | ax.bar(x + width , CA1, width, label='CA1', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[5])) 119 | ax.bar(x + width*2 , CA2, width, label='CA2', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[6])) 120 | ax.bar(x + width*3 , CA3, 
width, label='CA3', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[7])) 121 | 122 | 123 | ax.set_ylabel(f'Portfolio {type} R^2 (%)') 124 | ax.set_xticks(x) 125 | ax.set_xticklabels(labels) 126 | ax.legend() 127 | 128 | fig.tight_layout() 129 | 130 | plt.savefig(f'imgs/{type}_R2.png') 131 | plt.close() 132 | 133 | 134 | 135 | def plot_R2_table(R_df, type): 136 | plt.figure(dpi=200) 137 | 138 | for col in R_df.columns: 139 | R_df[col] = R_df[col].apply(lambda x: round_number(x)) 140 | 141 | R_df = R_df.reset_index() 142 | R_df.columns = ['Model', 'K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'] 143 | 144 | 145 | fig_total = ff.create_table(R_df, 146 | colorscale=[[0, 'white'], 147 | [0.01, 'lightgrey'], 148 | [1.0, 'white']], 149 | font_colors=['#000000', '#000000', 150 | '#000000']) 151 | fig_total.update_layout( 152 | autosize=False, 153 | width=500, 154 | height=200, 155 | ) 156 | fig_total.write_image(f"imgs/R2_{type}_table.png", scale=4) 157 | 158 | 159 | 160 | def round_number(num): 161 | num = str(round(num*100, 2)) 162 | while len(num.split('.')[1]) < 2: 163 | num = num + '0' 164 | return num 165 | 166 | 167 | 168 | if __name__=="__main__": 169 | CAs = ["CA0_1", "CA0_2", "CA0_3", "CA0_4", "CA0_5", "CA0_6", "CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA1_6", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA2_6", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5", "CA3_6"] 170 | FFs = ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"] 171 | PCAs = ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"] 172 | IPCAs = ["IPCA_1", "IPCA_2", "IPCA_3", "IPCA_4", "IPCA_5", "IPCA_6"] 173 | models = FFs + PCAs + IPCAs + CAs 174 | 175 | ## Plot R^2 bars 176 | total_R2 = [] 177 | for m in models: 178 | total_R2.append(calculate_R2(m, 'inference')) 179 | R_total = pd.DataFrame([models, total_R2]).T 180 | 181 | predict_R2 = [] 182 | for m in models: 183 | predict_R2.append(calculate_R2(m, 'predict')) 184 | R_pred = pd.DataFrame([models, predict_R2]).T 185 | 186 | plot_R2_bar(R_total, 
'total') 187 | plot_R2_bar(R_pred, 'pred') 188 | 189 | ## Save R^2 tables 190 | R_total_df = pd.DataFrame(np.array(total_R2).reshape(-1, 6), columns = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'], index=['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3']) 191 | R_pred_df = pd.DataFrame(np.array(predict_R2).reshape(-1, 6), columns = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5', 'K=6'], index=['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3']) 192 | 193 | plot_R2_table(R_total_df, 'total') 194 | plot_R2_table(R_pred_df, 'pred') 195 | 196 | 197 | ## Plot characteristics importance heatmap 198 | # models = ["IPCA", "CA0_5", "CA1_5", "CA2_5", "CA3_5"] 199 | # #TODO: paste results from R_squares/ 200 | # R2_omit = [] 201 | # R_minus = pd.DataFrame(np.array(R2_omit).reshape(-1, 94)*100, index=models, columns=CHARAS_LIST).T 202 | # char_ranks = R_minus.T.sum().argsort().argsort().index.to_list() 203 | # char_ranks.reverse() 204 | 205 | # plt.figure(figsize=(8, 15), dpi=200) 206 | # sns.heatmap(R_minus.T[char_ranks].T, cmap='Blues', linewidths=0.6) 207 | # plt.savefig('imgs/omit_char_R2_bias.png', bbox_inches='tight') 208 | # plt.close() 209 | -------------------------------------------------------------------------------- /data_prepare.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | pd.options.mode.chained_assignment = None 4 | from tqdm import tqdm 5 | 6 | import os 7 | import zipfile 8 | from joblib import delayed, Parallel 9 | from itertools import product 10 | from utils import CHARAS_LIST 11 | 12 | import warnings 13 | warnings.filterwarnings('ignore') 14 | 15 | if 'data.zip' not in os.listdir(): 16 | os.system('wget https://cloud.tsinghua.edu.cn/f/07d6a0223d054247af26/?dl=1 -O data.zip') 17 | 18 | if 'data' not in os.listdir(): 19 | os.mkdir('data') 20 | os.system('wget https://cloud.tsinghua.edu.cn/f/179082ecf0f147a4840c/?dl=1 -O portfolio_ret.pkl') 21 | os.system('wget 
https://cloud.tsinghua.edu.cn/f/b93c6ae7e2014d3a951e/?dl=1 -O ff5.csv') 22 | os.system('wget https://cloud.tsinghua.edu.cn/f/5f077be9eda0428ab7e5/?dl=1 -O UMD.csv') 23 | os.system('wget https://cloud.tsinghua.edu.cn/f/a916da12d5a9450eb0df/?dl=1 -O p_charas.pkl') 24 | 25 | os.system('mv portfolio_ret.pkl data') 26 | os.system('mv ff5.csv data') 27 | os.system('mv UMD.csv data') 28 | os.system('mv p_charas.pkl data') 29 | 30 | 31 | with zipfile.ZipFile('data.zip', 'r') as z: 32 | with z.open('data/month_ret.pkl') as f: 33 | print('Reading month_ret.pkl', end=' ') 34 | mon_ret = pd.read_pickle(f) 35 | mon_ret.to_pickle('data/month_ret.pkl') 36 | print('Done!') 37 | 38 | with z.open('data/datashare.pkl') as f: 39 | print('Reading datashare.pkl', end=' ') 40 | datashare = pd.read_pickle(f) 41 | datashare['DATE'].drop_duplicates().reset_index(drop=True).to_pickle('data/mon_list.pkl') 42 | # datashare.to_pickle('data/datashare.pkl') 43 | print('Done!') 44 | 45 | 46 | 47 | def pre_process(date): 48 | cross_slice = datashare.loc[datashare.DATE == date].copy(deep=False) 49 | omitted_mask = 1.0 * np.isnan(cross_slice.loc[cross_slice['DATE'] == date]) 50 | # fill nan values with each factor's median 51 | cross_slice.loc[cross_slice.DATE == date] = cross_slice.fillna(0) + omitted_mask * cross_slice.median() 52 | # if all stocks' factor is nan, fill by zero 53 | cross_slice.loc[cross_slice.DATE == date] = cross_slice.fillna(0) 54 | 55 | re_df = [] 56 | # rank normalization 57 | for col in CHARAS_LIST: 58 | series = cross_slice[col] 59 | de_duplicate_slice = pd.DataFrame(series.drop_duplicates().to_list(), columns=['chara']) 60 | series = pd.DataFrame(series.to_list(), columns=['chara']) 61 | # sort and assign rank, the same value should have the same rank 62 | de_duplicate_slice['sort_rank'] = de_duplicate_slice['chara'].argsort().argsort() 63 | rank = pd.merge(series, de_duplicate_slice, left_on='chara', right_on='chara', how='right')['sort_rank'] 64 | # if all values are zero, 
the results will contain nan 65 | rank_normal = ((rank - rank.min())/(rank.max() - rank.min())*2 - 1) 66 | re_df.append(rank_normal) 67 | re_df = pd.DataFrame(re_df, index=CHARAS_LIST).T.fillna(0) 68 | re_df['permno'] = list(cross_slice['permno'].astype(int)) 69 | re_df['DATE'] = list(cross_slice['DATE'].astype(int)) 70 | 71 | return re_df[['permno', 'DATE'] + CHARAS_LIST] 72 | 73 | 74 | 75 | def cal_portfolio_ret(it, df): 76 | d, f = it[0], it[1] 77 | # long portfolio, quantile 0.0~0.1; short portfolio, quantile 0.9~1.0 78 | long_portfolio = df.loc[df.DATE == d][['permno', f]].sort_values(by=f, ascending=False)[:df.loc[df.DATE == d].shape[0]//10]['permno'].to_list() 79 | short_portfolio = df.loc[df.DATE == d][['permno', f]].sort_values(by=f, ascending=False)[-df.loc[df.DATE == d].shape[0]//10:]['permno'].to_list() 80 | # long-short portfolio return 81 | long_ret = mon_ret.loc[mon_ret.date == d].drop_duplicates('permno').set_index('permno').reindex(long_portfolio)['ret-rf'].dropna().mean() 82 | short_ret = mon_ret.loc[mon_ret.date == d].drop_duplicates('permno').set_index('permno').reindex(short_portfolio)['ret-rf'].dropna().mean() 83 | chara_ret = 0.5*(long_ret - short_ret) 84 | 85 | return chara_ret 86 | 87 | 88 | def cal_portfolio_charas(month, df): 89 | mon_portfolio_chara = [] 90 | p_name = ['p_' + chr for chr in CHARAS_LIST] 91 | for chr in CHARAS_LIST: 92 | long_portfolio = df.loc[df.DATE == month].sort_values(by=chr, ascending=False).reset_index(drop=True)[:df.loc[df.DATE == month].shape[0]//10]['permno'].to_list() 93 | short_portfolio = df.loc[df.DATE == month].sort_values(by=chr, ascending=False).reset_index(drop=True)[-df.loc[df.DATE == month].shape[0]//10:]['permno'].to_list() 94 | 95 | long_charas = df.loc[df.DATE == month].set_index('permno').loc[long_portfolio][CHARAS_LIST] 96 | short_charas = df.loc[df.DATE == month].set_index('permno').loc[short_portfolio][CHARAS_LIST] 97 | 98 | mon_portfolio_chara.append([month] + (0.5*(long_charas.mean() - 
short_charas.mean())).to_list()) 99 | 100 | return pd.DataFrame(mon_portfolio_chara, index=p_name, columns=['DATE']+CHARAS_LIST) 101 | 102 | 103 | 104 | if __name__ == '__main__': 105 | # pre-process share data 106 | processed_df = Parallel(n_jobs=-1)(delayed(pre_process)(d) for d in tqdm(datashare.DATE.drop_duplicates().to_list(), colour='green', desc='Processing')) 107 | processed_df = pd.concat(processed_df) 108 | 109 | ##TODO: calculate portfolio returns (or download preprocessed data) 110 | # iter_list = list(product(datashare.DATE.drop_duplicates(), CHARAS_LIST)) 111 | # portfolio_rets = Parallel(n_jobs=-1)(delayed(cal_portfolio_ret)(it, df=processed_df) for it in tqdm(iter_list, colour='green', desc='Calculating')) 112 | # portfolio_rets = pd.DataFrame(np.array(portfolio_rets).reshape(-1, 94), index=datashare.DATE.drop_duplicates(), columns=CHARAS_LIST).reset_index() 113 | # portfolio_rets[CHARAS_LIST] = portfolio_rets[CHARAS_LIST].astype(np.float16) 114 | 115 | 116 | ##TODO: calculate portfolio characteristics (or download preprocessed data) 117 | # mon_list = pd.read_pickle('data/mon_list.pkl') 118 | # _portfolio_chara_set = Parallel(n_jobs=-1)(delayed(cal_portfolio_charas)(mon, df=processed_df) for mon in tqdm(mon_list, colour='yellow', desc='Calculating P characteristics')) 119 | # p_charas = _portfolio_chara_set[0].copy(deep=False) 120 | # for tdf in _portfolio_chara_set[1:]: 121 | # p_charas = pd.concat([p_charas, tdf]) 122 | 123 | 124 | processed_df.to_pickle('data/datashare_re.pkl') 125 | # portfolio_rets.to_pickle('data/portfolio_rets.pkl') 126 | # p_charas.to_pickle('data/p_charas.pkl') -------------------------------------------------------------------------------- /imgs/R2_pred_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/R2_pred_table.png 
-------------------------------------------------------------------------------- /imgs/R2_total_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/R2_total_table.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_5_alpha_plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_5_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA0_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA0_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA1_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA1_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA2_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA2_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/CA3_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/CA3_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/FF_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_5_inference_alpha_plot.png 
-------------------------------------------------------------------------------- /imgs/alpha/FF_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/FF_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_5_inference_alpha_plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/IPCA_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/IPCA_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_1_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_1_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_2_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_2_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_3_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_3_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_4_inference_alpha_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_4_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_5_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_5_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/alpha/PCA_6_inference_alpha_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/alpha/PCA_6_inference_alpha_plot.png -------------------------------------------------------------------------------- /imgs/omit_char_R2_bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/omit_char_R2_bias.png -------------------------------------------------------------------------------- /imgs/pred_R2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/pred_R2.png -------------------------------------------------------------------------------- /imgs/total_R2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RichardS0268/Autoencoder-Asset-Pricing-Models/5aa57e0a1519f1fb23bfef48cdd5cf9cb29e467a/imgs/total_R2.png -------------------------------------------------------------------------------- /main.py: 
# ---------------------------------------------------------------------------
# main.py: train / infer / predict every requested factor model configuration
# and store the resulting R^2 statistics under R_squares/.
# ---------------------------------------------------------------------------
import torch
from models.PCA import PCA
from models.FF import FF
from models.IPCA import IPCA
from models.CA import CA0, CA1, CA2, CA3

import gc
import argparse
import pandas as pd
import numpy as np
import time
import json
from tqdm import tqdm
from utils import *
from analysis import *
import matplotlib.pyplot as plt
from itertools import product
import os

import warnings
warnings.filterwarnings('ignore')



def model_inference_and_predict(model):
    """
    Rolling inference and prediction of non-NN models (FF/PCA/IPCA).

    The model is re-trained once per calendar year and evaluated on every
    month of that year, then refit (train/valid windows shifted).

    Returns: the inference DataFrame; also saves
    {model.name}_inference.csv and {model.name}_predict.csv in path 'results'
    (the predict file only when no characteristics are omitted).
    """
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[mon_list >= model.test_period[0]]
    inference_result = []
    predict_result = []
    # group the test months by year (dates are ints shaped yyyymmdd)
    T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')

    for g in T_bar:  # rolling train
        T_bar.set_postfix({'Year': g[0]})
        model.train_model()

        for m in g[1].to_list():
            inference_result.append(model.inference(m))  # T * N * m
            if not len(model.omit_char):
                predict_result.append(model.predict(m))
        # model refit (shift train period and valid period)
        model.refit()

    if not len(model.omit_char):
        inference_result = pd.DataFrame(inference_result, index=test_mons, columns=CHARAS_LIST)
        inference_result.to_csv(f'results/inference/{model.name}_inference.csv')
        predict_result = pd.DataFrame(predict_result, index=test_mons, columns=CHARAS_LIST)
        predict_result.to_csv(f'results/predict/{model.name}_predict.csv')

    return inference_result



def model_inference_and_predict_CA(model):
    """
    Rolling inference and prediction of NN (conditional autoencoder) models.

    Weights are reset and the model re-trained once per calendar year; a loss
    curve is saved per yearly fit, then each month of the year is inferred.

    Returns: the inference result; also saves
    {model.name}_inference.csv and {model.name}_predict.csv in path 'results'
    (only when no characteristics are omitted).
    """
    model = model.to('cuda')
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[(mon_list >= model.test_period[0])]

    if not len(model.omit_char):  # no omitted characteristics
        inference_result = pd.DataFrame()
        predict_result = pd.DataFrame()
    else:
        inference_result = []

    T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')

    stock_index = pd.Series(dtype=np.int64)
    for g in T_bar:  # rolling train, refit once a year
        T_bar.set_postfix({'Year': g[0]})

        model.reset_weight()
        model.release_gpu()
        # release GPU memory; empty_cache is called repeatedly to flush the cache
        for _ in range(6):
            torch.cuda.empty_cache()

        train_loss, val_loss = model.train_model()
        # plot the loss curves of this yearly fit
        plt.plot(train_loss, label='train_loss')
        plt.plot(val_loss, label='val_loss')
        plt.legend()
        plt.savefig(f'results/train_loss/{model.name}_loss_{g[0]}.png')
        plt.close()

        for m in g[1].to_list():
            m_stock_index, _, _, _ = model._get_item(m)
            stock_index = pd.concat([stock_index, pd.Series(m_stock_index)]).drop_duplicates().astype(int)

            if not len(model.omit_char):  # no omitted characteristics
                # move inference_R and predict_R to cpu
                inference_R = model.inference(m)  # returns (N, 1)
                inference_R = inference_R.cpu().detach().numpy()
                inference_R = pd.DataFrame(inference_R, index=m_stock_index, columns=[m])
                inference_result = pd.concat([inference_result.reset_index(drop=True), inference_R.reset_index(drop=True)], axis=1)  # (N, T)

                predict_R = model.predict(m)  # returns (N, 1)
                predict_R = predict_R.cpu().detach().numpy()
                predict_R = pd.DataFrame(predict_R, index=m_stock_index, columns=[m])
                predict_result = pd.concat([predict_result.reset_index(drop=True), predict_R.reset_index(drop=True)], axis=1)  # (N, T)

            else:
                inference_R = model.inference(m)  # (N, m), m = len(omit_char)
                inference_result.append(inference_R)  # (T, N, m)

        # refit: shift train period and valid period forward
        model.refit()

    if not len(model.omit_char):
        inference_result = pd.DataFrame(inference_result.values.T, index=test_mons, columns=CHARAS_LIST)
        inference_result.to_csv(f'results/inference/{model.name}_inference.csv')

        predict_result = pd.DataFrame(predict_result.values.T, index=test_mons, columns=CHARAS_LIST)
        predict_result.to_csv(f'results/predict/{model.name}_predict.csv')

    # GC: release RAM memory (model) before the next configuration
    del model
    gc.collect()
    return inference_result



def git_push(msg):
    """Commit the R_squares directory and push (best-effort, via the shell)."""
    os.system('git add R_squares')
    os.system(f'git commit -m "{msg}"')
    os.system('git push')



def model_selection(model_type, model_K, omit_char=[]):
    """
    Instantiate one model configuration.

    Returns a dict {'name', 'omit_char', 'model'}. FF ignores omit_char since
    it does not consume stock characteristics.
    """
    assert model_type in ['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3'], f'No Such Model: {model_type}'

    if model_type == 'FF':
        return {
            'name': f'FF_{model_K}',
            'omit_char': [],
            'model': FF(K=model_K)
        }

    elif model_type == 'PCA':
        return {
            'name': f'PCA_{model_K}',
            'omit_char': omit_char,
            'model': PCA(K=model_K, omit_char=omit_char)
        }

    elif model_type == 'IPCA':
        return {
            'name': f'IPCA_{model_K}',
            'omit_char': omit_char,
            'model': IPCA(K=model_K, omit_char=omit_char)
        }

    elif model_type == 'CA0':
        return {
            'name': f'CA0_{model_K}',
            'omit_char': omit_char,
            'model': CA0(hidden_size=model_K, lr=CA_LR, omit_char=omit_char)
        }

    elif model_type == 'CA1':
        return {
            'name': f'CA1_{model_K}',
            'omit_char': omit_char,
            'model': CA1(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
        }

    elif model_type == 'CA2':
        return {
            'name': f'CA2_{model_K}',
            'omit_char': omit_char,
            'model': CA2(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
        }

    else:
        return {
            'name': f'CA3_{model_K}',
            'omit_char': omit_char,
            'model': CA3(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
        }



if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--Model', type=str, default='FF PCA IPCA CA0 CA1 CA2 CA3')
    parser.add_argument('--K', type=str, default='1 2 3 4 5 6')
    parser.add_argument('--omit_char', type=str, default='')

    args = parser.parse_args()

    # ensure all output directories exist (idempotent)
    for d in ['results', 'results/train_loss', 'results/inference', 'results/predict', 'imgs']:
        os.makedirs(d, exist_ok=True)

    models_name = []
    R_square = []
    for g in product(args.Model.split(' '), args.K.split(' ')):
        omit_chars = args.omit_char.split(' ') if len(args.omit_char) > 0 else []

        model = model_selection(g[0], int(g[1]), omit_chars)

        # BUG FIX: time.gmtime() is UTC, so the offset label is +0000
        # (the original labelled UTC output as +0800)
        print(f"{time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())} | Model: {model['name']} | {omit_chars}")
        print('name : ', model['name'])
        models_name.append(model['name'])

        if model['name'].split('_')[0][:-1] == 'CA':  # 'CA0'..'CA3' -> 'CA'
            print('model_inference_and_predict_CA')
            # if omit_char is non-empty, inf_ret has shape (T, N, m)
            inf_ret = model_inference_and_predict_CA(model['model'])
        else:
            inf_ret = model_inference_and_predict(model['model'])

        gc.collect()

        # Save total R^2
        if not len(model['omit_char']):
            R_square.append(calculate_R2(model['model'], 'inference'))
            alpha_plot(model['model'], 'inference', save_dir='imgs')
            # alpha_plot(model['model'], 'predict', save_dir='alpha_imgs')
        else:
            inf_ret = np.array(inf_ret)
            for i in range(len(model['omit_char'])):
                inference_r = inf_ret[:, :, i]   # T * N, characteristic i omitted
                complete_r = inf_ret[:, :, -1]   # T * N, nothing omitted (last slice)
                R_square.append(calculate_R2(None, None, inference_r, complete_r))

        del model

    # save R_square to a timestamped json file
    time_str = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime())
    filename = f"R_squares/{time_str}.json"
    obj = {
        "models": models_name,
        'omit_char': args.omit_char.split(' '),
        "R2_total": R_square,
    }

    with open(filename, "w") as out_file:
        json.dump(obj, out_file)

    # git push
    # git_push(f"Run main.py")


# ---------------------------------------------------------------------------
# models/CA.py (visible portion): conditional autoencoder base class.
# The `inference` method is truncated in this chunk and therefore omitted.
# ---------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
import collections
from .modelBase import modelBase
from utils import CHARAS_LIST

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset


MAX_EPOCH = 200

class CA_base(nn.Module, modelBase):
    """Shared machinery for the CA0..CA3 conditional autoencoders:
    data loading, training loop with early stopping, and beta/factor heads
    (the concrete networks are assigned by the subclasses)."""

    def __init__(self, name, omit_char=None, device='cuda'):
        nn.Module.__init__(self)
        modelBase.__init__(self, name)
        self.beta_nn = None
        self.factor_nn = None
        self.optimizer = None
        self.criterion = None
        # BUG FIX: avoid the mutable default argument [] shared across instances
        self.omit_char = [] if omit_char is None else omit_char

        self.factor_nn_pred = []

        self.device = device

        self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64)
        self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index()
        self.portfolio_ret = pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64)
        self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64)

        self.train_dataloader = None
        self.valid_dataloader = None
        self.test_dataloader = None


    def debug(self, month):
        """Print the beta-network input of one month (debug helper)."""
        beta_nn_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST]
        # beta_nn_input = self.datashare_chara.loc[self.datashare_chara['DATE'] == month].set_index('permno')[charas]
        print(beta_nn_input)


    def _get_item(self, month):
        """Fetch one month of aligned model inputs.

        Returns (portfolio index L, beta_nn_input 94*94 = P*N,
        factor_nn_input 94*1 = P*1, labels (94,) = (N,)).
        """
        if month not in self.p_charas['DATE'].values:
            # fall back to the closest available month in p_charas
            month = self.p_charas['DATE'].values[np.argmin(np.abs(self.p_charas['DATE'].values - month))]

        beta_nn_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST]  # (94, 94)
        labels = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST].T.values  # (94, 1)
        beta_nn_input['ret-rf'] = labels
        align_df = beta_nn_input.copy(deep=False).dropna()

        factor_nn_input = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST]

        # abort if there is any nan left in align_df
        if align_df.isnull().values.any():
            assert False, f'There is nan in align_df of : {month}'
        return align_df.index, align_df.values[:, :-1].T, factor_nn_input.T.values, align_df.values[:, -1].T


    def dataloader(self, period):
        """Build a shuffled batch-size-1 DataLoader over the months in `period`."""
        mon_list = pd.read_pickle('data/mon_list.pkl')
        mon_list = mon_list.loc[(mon_list >= period[0]) & (mon_list <= period[1])]
        beta_nn_input_set = []
        factor_nn_input_set = []
        label_set = []
        for mon in mon_list:
            _, _beta_input, _factor_input, label = self._get_item(mon)
            beta_nn_input_set.append(_beta_input)
            factor_nn_input_set.append(_factor_input)
            label_set.append(label)

        # stack to one ndarray first: torch.tensor(list-of-ndarrays) is very slow
        beta_nn_input_set = torch.tensor(np.array(beta_nn_input_set), dtype=torch.float32).to(self.device)
        factor_nn_input_set = torch.tensor(np.array(factor_nn_input_set), dtype=torch.float32).to(self.device)
        label_set = torch.tensor(np.array(label_set), dtype=torch.float32).to(self.device)

        dataset = TensorDataset(beta_nn_input_set, factor_nn_input_set, label_set)
        return DataLoader(dataset, batch_size=1, shuffle=True)


    def forward(self, char, pfret):
        """r_hat = sum_k beta_nn(char)_k * factor_nn(pfret)_k along the factor dim."""
        processed_char = self.beta_nn(char)
        processed_pfret = self.factor_nn(pfret)
        return torch.sum(processed_char * processed_pfret, dim=1)


    # train_one_epoch
    def __train_one_epoch(self):
        epoch_loss = 0.0
        for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.train_dataloader):
            self.optimizer.zero_grad()
            # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
            # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
            # labels reshape: (1, 94) -> (94, ) (1*N => N,)
            beta_nn_input = beta_nn_input.squeeze(0).T
            factor_nn_input = factor_nn_input.squeeze(0).T
            labels = labels.squeeze(0)
            output = self.forward(beta_nn_input, factor_nn_input)
            loss = self.criterion(output, labels)

            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()

        return epoch_loss / len(self.train_dataloader)


    def __valid_one_epoch(self):
        epoch_loss = 0.0
        for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.valid_dataloader):
            # same reshapes as training (see __train_one_epoch)
            beta_nn_input = beta_nn_input.squeeze(0).T
            factor_nn_input = factor_nn_input.squeeze(0).T
            labels = labels.squeeze(0)

            output = self.forward(beta_nn_input, factor_nn_input)
            loss = self.criterion(output, labels)
            epoch_loss += loss.item()

        return epoch_loss / len(self.valid_dataloader)


    def train_model(self):
        """Train with early stopping on validation loss.

        Saves the best checkpoint to saved_models/<name>.pt, reloads it at the
        end, and returns (train_loss_history, valid_loss_history).
        """
        if 'saved_models' not in os.listdir('./'):
            os.mkdir('saved_models')

        self.train_dataloader = self.dataloader(self.train_period)
        self.valid_dataloader = self.dataloader(self.valid_period)
        self.test_dataloader = self.dataloader(self.test_period)

        min_error = np.inf  # np.Inf was removed in NumPy 2.0
        no_update_steps = 0
        valid_loss = []
        train_loss = []
        for i in range(MAX_EPOCH):
            self.train()
            train_error = self.__train_one_epoch()
            train_loss.append(train_error)

            self.eval()
            # valid and early stop
            with torch.no_grad():
                valid_error = self.__valid_one_epoch()

            valid_loss.append(valid_error)
            if valid_error < min_error:
                min_error = valid_error
                no_update_steps = 0
                # save model
                torch.save(self.state_dict(), f'./saved_models/{self.name}.pt')
            else:
                no_update_steps += 1

            if no_update_steps > 2:  # early stop: 3 consecutive epochs without improvement
                print(f'Early stop at epoch {i}')
                break
        # load from (best) saved model
        self.load_state_dict(torch.load(f'./saved_models/{self.name}.pt'))
        return train_loss, valid_loss


    def test_model(self):
        """Run a single batch from the test dataloader and report its loss.

        Returns (output, labels) of that batch.
        """
        output = None
        labels = None
        # BUG FIX: the original iterated `for i, a, b, c in enumerate(...)`,
        # which raises ValueError (enumerate yields (i, batch) pairs), and it
        # skipped the reshape applied everywhere else.
        for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.test_dataloader):
            beta_nn_input = beta_nn_input.squeeze(0).T
            factor_nn_input = factor_nn_input.squeeze(0).T
            labels = labels.squeeze(0)
            output = self.forward(beta_nn_input, factor_nn_input)
            break

        loss = self.criterion(output, labels)
        print(f'Test loss: {loss.item()}')
        print(f'Predicted: {output}')
        print(f'Ground truth: {labels}')
        return output, labels


    def calBeta(self, month, skip_char=[]):
        """Return the beta network output N*K for one month; characteristics in
        skip_char are zeroed out before the forward pass."""
        _, beta_nn_input, _, _ = self._get_item(month)  # beta input: 94*94 = P*N

        # if some variables need to be omitted
        if len(skip_char):
            beta_nn_input = pd.DataFrame(beta_nn_input.T, columns=CHARAS_LIST)  # N*P
            beta_nn_input[skip_char] = beta_nn_input[skip_char] * 0.0
            beta_nn_input = beta_nn_input.values.T  # P*N

        beta_nn_input = torch.tensor(beta_nn_input, dtype=torch.float32).T.to(self.device)  # N*P
        return self.beta_nn(beta_nn_input)  # N*K


    def calFactor(self, month, skip_char=[]):
        """Return the factor network output K*1 for one month; characteristics
        in skip_char are zeroed out. The prediction is also recorded in
        self.factor_nn_pred for later use."""
        _, _, factor_nn_input, _ = self._get_item(month)  # factor input: P*1

        # if some variables need to be omitted
        if len(skip_char):
            factor_nn_input = pd.DataFrame(factor_nn_input.T, columns=CHARAS_LIST)  # 1*P
            factor_nn_input[skip_char] = factor_nn_input[skip_char] * 0.0
            factor_nn_input = factor_nn_input.values.T  # P*1

        factor_nn_input = torch.tensor(factor_nn_input, dtype=torch.float32).T.to(self.device)  # 1*P
        factor_pred = self.factor_nn(factor_nn_input).T  # K*1

        self.factor_nn_pred.append(factor_pred)

        return factor_pred  # K*1
228 | 229 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 230 | 231 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}" 232 | 233 | # R_{N*1} = Beta_{N*K} @ F_{K*1} 234 | return mon_beta @ mon_factor 235 | else: 236 | ret_R = [] 237 | for char in self.omit_char: 238 | mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char]) 239 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # N*1 240 | 241 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 242 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # also add complete result 243 | 244 | return np.array(ret_R).squeeze(2).T # N*m 245 | 246 | 247 | def cal_delayed_Factor(self, month): 248 | # calculate the last day of the previous month 249 | if self.refit_cnt == 0: 250 | avg_f_pred = self.factor_nn_pred[0] # input of the first predict take hat{f}_t 251 | # print(avg_f_pred.shape) 252 | else: 253 | avg_f_pred = torch.mean(torch.stack(self.factor_nn_pred[:self.refit_cnt]), dim=0) 254 | 255 | return avg_f_pred 256 | 257 | 258 | def reset_weight(self): 259 | for layer in self.beta_nn: # reset beta_nn parameters 260 | if hasattr(layer, 'reset_parameters'): 261 | layer.reset_parameters() 262 | 263 | for layer in self.factor_nn: # reset factor_nn parameters 264 | if hasattr(layer, 'reset_parameters'): 265 | layer.reset_parameters() 266 | 267 | self.optimizer.state = collections.defaultdict(dict) # reset optimizer state 268 | 269 | 270 | def release_gpu(self): 271 | if self.train_dataloader is not None: 272 | del self.train_dataloader 273 | if self.valid_dataloader is not None: 274 | del self.valid_dataloader 275 | if self.test_dataloader is not None: 276 | del self.test_dataloader 277 | torch.cuda.empty_cache() 278 | 279 | 280 | 281 | class CA0(CA_base): 282 | def __init__(self, hidden_size, lr=0.001, omit_char=[], device='cuda'): 283 | 
CA_base.__init__(self, name=f'CA0_{hidden_size}', omit_char=omit_char, device=device) 284 | # P -> K 285 | self.beta_nn = nn.Sequential( 286 | # output layer 287 | nn.Linear(94, hidden_size) 288 | ) 289 | self.factor_nn = nn.Sequential( 290 | nn.Linear(94, hidden_size) 291 | ) 292 | 293 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 294 | self.criterion = nn.MSELoss().to(device) 295 | 296 | 297 | 298 | class CA1(CA_base): 299 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'): 300 | CA_base.__init__(self, name=f'CA1_{hidden_size}', omit_char=omit_char, device=device) 301 | self.dropout = dropout 302 | # P -> 32 -> K 303 | self.beta_nn = nn.Sequential( 304 | # hidden layer 1 305 | nn.Linear(94, 32), 306 | nn.BatchNorm1d(32), 307 | nn.ReLU(), 308 | nn.Dropout(self.dropout), 309 | # output layer 310 | nn.Linear(32, hidden_size) 311 | ) 312 | self.factor_nn = nn.Sequential( 313 | nn.Linear(94, hidden_size) 314 | ) 315 | 316 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 317 | self.criterion = nn.MSELoss().to(device) 318 | 319 | 320 | 321 | class CA2(CA_base): 322 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'): 323 | CA_base.__init__(self, name=f'CA2_{hidden_size}', omit_char=omit_char, device=device) 324 | self.dropout = dropout 325 | # P -> 32 -> 16 -> K 326 | self.beta_nn = nn.Sequential( 327 | # hidden layer 1 328 | nn.Linear(94, 32), 329 | nn.BatchNorm1d(32), 330 | nn.ReLU(), 331 | nn.Dropout(self.dropout), 332 | # hidden layer 2 333 | nn.Linear(32, 16), 334 | nn.BatchNorm1d(16), 335 | nn.ReLU(), 336 | nn.Dropout(self.dropout), 337 | # output layer 338 | nn.Linear(16, hidden_size) 339 | ) 340 | self.factor_nn = nn.Sequential( 341 | nn.Linear(94, hidden_size) 342 | ) 343 | 344 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 345 | self.criterion = nn.MSELoss().to(device) 346 | 347 | 348 | 349 | class CA3(CA_base): 350 | def __init__(self, hidden_size, 
dropout=0.5, lr=0.001, omit_char=[], device='cuda'): 351 | CA_base.__init__(self, name=f'CA3_{hidden_size}', omit_char=omit_char, device=device) 352 | self.dropout = dropout 353 | # P -> 32 -> 16 -> 8 -> K 354 | self.beta_nn = nn.Sequential( 355 | # hidden layer 1 356 | nn.Linear(94, 32), 357 | nn.BatchNorm1d(32), 358 | nn.ReLU(), 359 | nn.Dropout(self.dropout), 360 | # hidden layer 2 361 | nn.Linear(32, 16), 362 | nn.BatchNorm1d(16), 363 | nn.ReLU(), 364 | nn.Dropout(self.dropout), 365 | # hidden layer 3 366 | nn.Linear(16, 8), 367 | nn.BatchNorm1d(8), 368 | nn.ReLU(), 369 | nn.Dropout(self.dropout), 370 | # output layer 371 | nn.Linear(8, hidden_size) 372 | ) 373 | self.factor_nn = nn.Sequential( 374 | nn.Linear(94, hidden_size) 375 | ) 376 | 377 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=0.01) 378 | self.criterion = nn.MSELoss().to(device) -------------------------------------------------------------------------------- /models/FF.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | from utils import CHARAS_LIST 5 | from .modelBase import modelBase 6 | 7 | import pandas as pd 8 | import statsmodels.api as sm 9 | from dateutil.relativedelta import relativedelta 10 | 11 | 12 | class FF(modelBase): 13 | def __init__(self, K): 14 | super(FF, self).__init__(f'FF_{K}') 15 | self.K = K 16 | self.train_period[0] = 19630731 # ff5 data from FF website is only available from 196307 17 | self.omit_char = [] 18 | self.__prepare_FFf() 19 | 20 | 21 | def __prepare_FFf(self): 22 | ff5 = pd.read_csv('data/ff5.csv', index_col=0) 23 | UMD = pd.read_csv('data/UMD.csv', index_col=0) 24 | UMD.columns = ['UMD'] 25 | FFf = pd.concat([ff5, UMD.loc[196307:]], axis=1) 26 | self.fname = ['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD'] 27 | self.FFf = FFf[self.fname] 28 | self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 29 | self.portfolio_ret['DATE'] = 
self.portfolio_ret['DATE'].apply(lambda x: x//100) 30 | 31 | 32 | def train_model(self): 33 | self.beta_matrix = [] 34 | X = self.FFf[self.fname[:self.K]].loc[self.train_period[0]//100:self.train_period[1]//100] 35 | for col in CHARAS_LIST: 36 | y = self.portfolio_ret.set_index('DATE')[col].loc[self.train_period[0]//100:self.train_period[1]//100] 37 | model = sm.OLS(y.values, X.values).fit() 38 | self.beta_matrix.append(model.params) 39 | self.beta_matrix = pd.DataFrame(self.beta_matrix, columns=self.fname[:self.K], index=CHARAS_LIST) 40 | 41 | 42 | def calBeta(self, month): # beta is time invariant 43 | return self.beta_matrix # N * K 44 | 45 | 46 | def calFactor(self, month): 47 | return self.FFf[self.fname[:self.K]].loc[month//100] # K * 1 48 | 49 | 50 | def cal_delayed_Factor(self, month): 51 | last_mon = int(str(pd.to_datetime(str(month)) - relativedelta(months=1)).split(' ')[0].replace('-', '')[:-2]) 52 | # return average of prevailing sample hat{f} (from 198701) up to t-1 53 | return self.FFf[self.fname[:self.K]].loc[198701:last_mon].mean() 54 | 55 | -------------------------------------------------------------------------------- /models/IPCA.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import sys 5 | sys.path.append('../') 6 | 7 | from utils import * 8 | from .modelBase import modelBase 9 | 10 | 11 | class IPCA(modelBase): 12 | def __init__(self, K, omit_char=[]): 13 | super(IPCA, self).__init__(f'IPCA_{K}') 14 | self.K = K 15 | self.omit_char = omit_char 16 | np.random.seed(10) 17 | self.gamma = np.random.random([94, self.K]) # P = 94, we have total 94 characteristics 18 | self.valid_error = [] 19 | self.__prepare_data() 20 | 21 | 22 | def __prepare_data(self): 23 | self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 24 | self.p_charas = pd.read_pickle('data/p_charas.pkl') 25 | self.mon_list = pd.read_pickle('data/mon_list.pkl') 26 | 27 | 28 | def 
__valid(self): 29 | MSE_set = [] 30 | for mon in self.mon_list[(self.mon_list >= self.valid_period[0]) & (self.mon_list <= self.valid_period[1])]: 31 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P 32 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1 33 | beta = Z @ self.gamma # N * K 34 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 35 | residual = y - beta @ f_hat 36 | MSE = np.sum(residual**2) 37 | MSE_set.append(MSE) 38 | 39 | valid_error = sum(MSE_set) 40 | self.valid_error.append(valid_error) 41 | 42 | return valid_error 43 | 44 | 45 | def __gamma_iter(self, gamma_old): 46 | numer = np.zeros((94*self.K, 1)) 47 | denom = np.zeros((94*self.K, 94*self.K)) 48 | for mon in self.mon_list[(self.mon_list >= self.train_period[0]) & (self.mon_list <= self.train_period[1])]: 49 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P 50 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1 51 | beta = Z @ gamma_old # N * K 52 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 53 | numer += (np.kron(f_hat, Z.T) @ y) 54 | denom += (np.kron(f_hat, Z.T) @ np.kron(f_hat.T, Z)) 55 | 56 | gamma_new = (np.linalg.pinv(denom) @ numer).reshape(self.K, 94) 57 | gamma_new = gamma_new.T 58 | 59 | return gamma_new 60 | 61 | 62 | def train_model(self): 63 | update_cnt = 0 64 | min_valid_err = np.Inf 65 | best_gamma = np.zeros((94, self.K)) 66 | while update_cnt < 5: 67 | self.gamma = self.__gamma_iter(self.gamma) 68 | valid_error = self.__valid() 69 | if valid_error < min_valid_err: 70 | min_valid_err = valid_error 71 | best_gamma = self.gamma 72 | update_cnt = 0 73 | else: 74 | update_cnt += 1 75 | 76 | self.gamma = best_gamma 77 | 78 | 79 | def inference(self, month): 80 | if not len(self.omit_char): 81 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].values # N * P 82 | y = 
self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].values.T # N * 1 83 | beta = Z @ self.gamma # N * K 84 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 85 | return (beta @ f_hat).flatten() # N, 1 86 | else: 87 | inference_R = [] 88 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].copy(deep=False) 89 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].copy(deep=False) 90 | 91 | for char in self.omit_char: 92 | Z_input = Z.copy(deep=False) 93 | y_input = y.copy(deep=False) 94 | Z_input[[char]] = Z_input[[char]] * 0.0 95 | y_input[[char]] = y_input[[char]] * 0.0 96 | Z_input = Z_input.values 97 | y_input = y_input.values.T 98 | beta = Z_input @ self.gamma 99 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y_input) # K * 1 100 | inference_R.append((beta @ f_hat).flatten()) # m * N 101 | 102 | Z_input = Z.values 103 | y_input = y.values.T 104 | beta = Z_input @ self.gamma 105 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y_input) # K * 1 106 | inference_R.append((beta @ f_hat).flatten()) # m * N 107 | 108 | return np.array(inference_R).T # N * m 109 | 110 | 111 | def predict(self, month): 112 | if self.refit_cnt == 0: 113 | return self.inference(month) 114 | 115 | lag_f_hat = [] 116 | for mon in self.mon_list[(self.mon_list >= 19870101) & (self.mon_list < month)]: 117 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P 118 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1 119 | beta = Z @ self.gamma # N * K 120 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1 121 | lag_f_hat.append(f_hat) 122 | 123 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].values # N * P 124 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].values.T # N * 1 125 | beta = Z @ self.gamma # N * K 126 | 127 | # return average of prevailing sample hat{f} (from 198701) up to 
t-1 128 | avg_lag_f = np.mean(lag_f_hat, axis=0) 129 | return beta @ avg_lag_f -------------------------------------------------------------------------------- /models/PCA.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from utils import * 8 | from .modelBase import modelBase 9 | 10 | 11 | def stock_R_matrix(start_date, end_date): 12 | R_matrix = pd.read_pickle('data/stock_R_matrix.pkl') 13 | return R_matrix.T.loc[start_date: end_date].T 14 | 15 | def portfolio_R_matrix(start_date, end_date): 16 | portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 17 | return portfolio_ret.loc[(portfolio_ret['DATE'] >= start_date) & (portfolio_ret['DATE'] <= end_date)].set_index('DATE').T 18 | 19 | 20 | 21 | class PCA(modelBase): 22 | def __init__(self, K, omit_char=[]): 23 | super(PCA, self).__init__(f'PCA_{K}') 24 | self.K = K 25 | self.omit_char = omit_char 26 | self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl') 27 | 28 | def train_model(self): 29 | pr = self.portfolio_ret.loc[(self.portfolio_ret.DATE >= self.train_period[0]) & (self.portfolio_ret.DATE <= self.train_period[1])][CHARAS_LIST].values 30 | pr = pr - np.mean(pr, axis=0) # col demean 31 | ret_cov_matrix = np.zeros((pr.shape[1], pr.shape[1])) 32 | 33 | for i in range(pr.shape[0]): # Sum of y^t @ y^t.T 34 | ret_cov_matrix += (pr[i, :].reshape(-1, 1) @ pr[i, :].reshape(-1, 1).T) 35 | ret_cov_matrix = ret_cov_matrix/(pr.shape[0]*pr.shape[1]) # N * N 36 | 37 | eigVal, eigVec = np.linalg.eig(ret_cov_matrix) 38 | sorted_indices = np.argsort(eigVal) 39 | self.beta = eigVec[:,sorted_indices[:-self.K-1:-1]] # Beta: N * K 40 | 41 | 42 | def calBeta(self, month): 43 | return np.real(self.beta) 44 | 45 | 46 | def calFactor(self, month): 47 | tr = self.portfolio_ret.loc[self.portfolio_ret.DATE <= month].iloc[-1][CHARAS_LIST].values 48 | tr = tr - np.mean(tr, axis=0) # col demean 
49 | # print(tr) 50 | factor = np.array((np.matrix(self.beta.T @ self.beta).I @ self.beta.T) @ tr.T).T # K * 1 51 | return np.real(factor.flatten()) 52 | 53 | 54 | def cal_delayed_Factor(self, month): 55 | if self.refit_cnt == 0: 56 | return self.calFactor(month) 57 | 58 | tr = self.portfolio_ret.loc[(self.portfolio_ret.DATE >= 19870101) & (self.portfolio_ret.DATE < month)][CHARAS_LIST].values 59 | tr = tr - np.mean(tr, axis=0) # col demean 60 | 61 | # return average of prevailing sample hat{f} (from 198701) up to t-1 62 | factors = [] 63 | for i in range(tr.shape[0]): 64 | factors.append(np.array(np.matrix(self.beta.T @ self.beta).I @ self.beta.T @ tr[i, :])) 65 | 66 | factors = np.array(factors).squeeze(1).T # K * T 67 | avg_delay_f = np.mean(factors, axis=1).reshape(-1, 1) # K * 1 68 | 69 | return np.real(avg_delay_f.flatten()) 70 | -------------------------------------------------------------------------------- /models/modelBase.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime 4 | from dateutil.relativedelta import relativedelta 5 | 6 | class modelBase: 7 | def __init__(self, name): 8 | self.name = name 9 | self.train_idx = 0 10 | self.refit_cnt = 0 11 | 12 | # initial train, valid and test periods are default accroding to original paper 13 | self.train_period = [19570101, 19741231] 14 | self.valid_period = [19750101, 19861231] 15 | self.test_period = [19870101, 19871231] 16 | 17 | 18 | def train_model(self): 19 | # print('trained') 20 | pass 21 | 22 | 23 | def calBeta(self, month): 24 | """ 25 | Calculate specific month's beta. Should be specified by different models 26 | -> return np.array, dim = (N, K) 27 | """ 28 | # return np.zeros([13000, 3]) 29 | pass 30 | 31 | 32 | def calFactor(self, month): 33 | """ 34 | Calculate specific month's factor. 
Should be specified by different models 35 | -> return np.array, dim = (K, 1) 36 | """ 37 | # return np.zeros([3, 1]) 38 | pass 39 | 40 | 41 | def cal_delayed_Factor(self, month): 42 | """ 43 | Calculate delayed month's factor, i.e. mean average of factors up to t-1. Should be specified by different models 44 | -> return np.array, dim = (K, 1) 45 | """ 46 | pass 47 | 48 | 49 | def inference(self, month): 50 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}" 51 | 52 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month) 53 | 54 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}" 55 | 56 | # R_{N*1} = Beta_{N*K} @ F_{K*1} 57 | return mon_beta @ mon_factor 58 | 59 | 60 | def predict(self, month): 61 | assert month >= self.test_period[0] and month <= self.test_period[1], f"Month error, {month} is not in test period {self.test_period}" 62 | 63 | lag_factor, mon_beta = self.cal_delayed_Factor(month), self.calBeta(month) 64 | 65 | assert mon_beta.shape[1] == lag_factor.shape[0], f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}" 66 | 67 | # R_{N*1} = Beta_{N*K} @ lag_F_avg{K*1} 68 | return mon_beta @ lag_factor 69 | 70 | 71 | def refit(self): 72 | # self.train_period[1] += 10000 # method in original paper: increase training size by one year each time refit 73 | self.train_period = (pd.Series(self.train_period) + 10000).to_list() # rolling training 74 | self.valid_period = (pd.Series(self.valid_period) + 10000).to_list() 75 | self.test_period = (pd.Series(self.test_period) + 10000).to_list() 76 | self.refit_cnt += 1 77 | 78 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # # stock-level characteristics with index 
# corresponding to original paper
# annual_chara = {
#     'absacc': 1, 'acc': 2, 'age': 4, 'agr': 5, 'bm': 9,
#     'bm_ia': 10, 'cashdebt': 12, 'cashpr': 13, 'cfp': 14, 'cfp_ia': 15,
#     'chatoia': 16, 'chcsho': 17, 'chempia': 18, 'chinv': 19, 'chpmia': 21,
#     'convind': 24, 'currat': 25, 'depr': 26, 'divi': 27, 'divo': 28,
#     'dy': 30, 'egr': 32, 'ep': 33, 'gma': 34, 'grcapx': 35,
#     'grltnoa': 36, 'herf': 37, 'hire': 38, 'invest': 42, 'lev': 43,
#     'lgr': 44, 'mve_ia': 52, 'operprof': 54, 'orgcap': 55, 'pchcapx_ia': 56,
#     'pchcurrat': 57, 'pchdepr': 58, 'pchgm_pchsale': 59, 'pchquick': 60, 'pchsale_pchinvt': 61,
#     'pchsale_pchrect': 62, 'pchsale_pchxsga': 63, 'pchsaleinv': 64, 'pctacc': 65, 'ps': 67,
#     'quick': 68, 'rd': 69, 'rd_mve': 70, 'rd_sale': 71, 'realestate': 72,
#     'roic': 77, 'salecash': 79, 'saleinv': 80, 'salerec': 81, 'secured': 82,
#     'securedind': 83, 'sgr': 84, 'sin': 85, 'sp': 86, 'tang': 91, 'tb': 92
# }

# quarter_chara = {
#     'aeavol': 3, 'cash': 11, 'chtx': 22, 'cinvest': 23,
#     'ear': 31, 'ms': 50, 'nincr': 53, 'roaq': 74,
#     'roavol': 75, 'roeq': 76, 'rsup': 78, 'stdacc': 89, 'stdcf': 90
# }

# month_chara = {
#     'baspread': 6, 'beta': 7, 'betasq': 8, 'chmom': 20,
#     'dolvol': 29, 'idiovol': 39, 'ill': 40, 'indmom': 41,
#     'maxret': 45, 'mom12m': 46, 'mom1m': 47, 'mom36m': 48,
#     'mom6m': 49, 'mvel1': 51, 'pricedelay': 66, 'retvol': 73,
#     'std_dolvol': 87, 'std_turn': 88, 'turn': 93, 'zerotrade': 94
# }

# The 94 characteristics, ordered annual / quarterly / monthly (see tables above).
CHARAS_LIST = [
    # annual
    'absacc', 'acc', 'age', 'agr', 'bm', 'bm_ia', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia',
    'chatoia', 'chcsho', 'chempia', 'chinv', 'chpmia', 'convind', 'currat', 'depr', 'divi', 'divo',
    'dy', 'egr', 'ep', 'gma', 'grcapx', 'grltnoa', 'herf', 'hire', 'invest', 'lev',
    'lgr', 'mve_ia', 'operprof', 'orgcap', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 'pchgm_pchsale', 'pchquick', 'pchsale_pchinvt',
    'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'ps', 'quick', 'rd', 'rd_mve', 'rd_sale', 'realestate',
    'roic', 'salecash', 'saleinv', 'salerec', 'secured', 'securedind', 'sgr', 'sin', 'sp', 'tang', 'tb',
    # quarterly
    'aeavol', 'cash', 'chtx', 'cinvest', 'ear', 'ms', 'nincr', 'roaq', 'roavol', 'roeq',
    'rsup', 'stdacc', 'stdcf',
    # monthly
    'baspread', 'beta', 'betasq', 'chmom', 'dolvol', 'idiovol', 'ill', 'indmom', 'maxret', 'mom12m',
    'mom1m', 'mom36m', 'mom6m', 'mvel1', 'pricedelay', 'retvol', 'std_dolvol', 'std_turn', 'turn', 'zerotrade',
]


# default hyper-parameters of the CA models
CA_DR = 0.5  # drop out rate
CA_LR = 0.001  # learning rate

# out of sample period
OOS_start = 19870101
OOS_end = 20161231


class HiddenPrints:
    """Context manager that silences stdout while active (when `activated`)."""

    def __init__(self, activated=True):
        # `activated` lets callers toggle suppression without changing call sites
        self.activated = activated
        self.original_stdout = None

    def close(self):
        # stash the real stdout and swap in a devnull handle
        self.original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def open(self):
        # close the devnull handle and restore the stashed stdout
        sys.stdout.close()
        sys.stdout = self.original_stdout

    def __enter__(self):
        if self.activated:
            self.close()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.activated:
            self.open()


def git_push(message):
    """Commit the results directory and push, prefixing the commit message."""
    os.system('git add results')
    os.system(f'git commit -m "no_dropout: {message}"')
    os.system('git push')