├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── Autoencoder-Asset-Pricing-Models.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── R_squares
│   ├── 2023-06-09_05-24-47.json
│   ├── 2023-06-09_06-07-56.json
│   ├── 2023-06-09_09-08-26.json
│   └── 2023-06-11_18-16-38.json
├── analysis.py
├── data_prepare.py
├── imgs
│   ├── R2_pred_table.png
│   ├── R2_total_table.png
│   ├── alpha
│   │   ├── CA0_1_inference_alpha_plot.png
│   │   ├── CA0_2_inference_alpha_plot.png
│   │   ├── CA0_3_inference_alpha_plot.png
│   │   ├── CA0_4_inference_alpha_plot.png
│   │   ├── CA0_5_alpha_plot.png
│   │   ├── CA0_5_inference_alpha_plot.png
│   │   ├── CA0_6_inference_alpha_plot.png
│   │   ├── CA1_1_inference_alpha_plot.png
│   │   ├── CA1_2_inference_alpha_plot.png
│   │   ├── CA1_3_inference_alpha_plot.png
│   │   ├── CA1_4_inference_alpha_plot.png
│   │   ├── CA1_5_inference_alpha_plot.png
│   │   ├── CA1_6_inference_alpha_plot.png
│   │   ├── CA2_1_inference_alpha_plot.png
│   │   ├── CA2_2_inference_alpha_plot.png
│   │   ├── CA2_3_inference_alpha_plot.png
│   │   ├── CA2_4_inference_alpha_plot.png
│   │   ├── CA2_5_inference_alpha_plot.png
│   │   ├── CA2_6_inference_alpha_plot.png
│   │   ├── CA3_1_inference_alpha_plot.png
│   │   ├── CA3_2_inference_alpha_plot.png
│   │   ├── CA3_3_inference_alpha_plot.png
│   │   ├── CA3_4_inference_alpha_plot.png
│   │   ├── CA3_5_inference_alpha_plot.png
│   │   ├── CA3_6_inference_alpha_plot.png
│   │   ├── FF_1_inference_alpha_plot.png
│   │   ├── FF_2_inference_alpha_plot.png
│   │   ├── FF_3_inference_alpha_plot.png
│   │   ├── FF_4_inference_alpha_plot.png
│   │   ├── FF_5_inference_alpha_plot.png
│   │   ├── FF_6_inference_alpha_plot.png
│   │   ├── IPCA_1_inference_alpha_plot.png
│   │   ├── IPCA_2_inference_alpha_plot.png
│   │   ├── IPCA_3_inference_alpha_plot.png
│   │   ├── IPCA_4_inference_alpha_plot.png
│   │   ├── IPCA_5_inference_alpha_plot.png
│   │   ├── IPCA_6_inference_alpha_plot.png
│   │   ├── PCA_1_inference_alpha_plot.png
│   │   ├── PCA_2_inference_alpha_plot.png
│   │   ├── PCA_3_inference_alpha_plot.png
│   │   ├── PCA_4_inference_alpha_plot.png
│   │   ├── PCA_5_inference_alpha_plot.png
│   │   ├── PCA_6_inference_alpha_plot.png
│   │   ├── seq2seq1_1_inference_alpha_plot.png
│   │   ├── seq2seq1_2_inference_alpha_plot.png
│   │   ├── seq2seq1_3_inference_alpha_plot.png
│   │   ├── seq2seq1_4_inference_alpha_plot.png
│   │   ├── seq2seq1_5_inference_alpha_plot.png
│   │   ├── seq2seq2_1_inference_alpha_plot.png
│   │   ├── seq2seq2_2_inference_alpha_plot.png
│   │   ├── seq2seq2_3_inference_alpha_plot.png
│   │   ├── seq2seq2_4_inference_alpha_plot.png
│   │   ├── seq2seq2_5_inference_alpha_plot.png
│   │   ├── seq2seq3_1_inference_alpha_plot.png
│   │   ├── seq2seq3_2_inference_alpha_plot.png
│   │   ├── seq2seq3_3_inference_alpha_plot.png
│   │   ├── seq2seq3_4_inference_alpha_plot.png
│   │   └── seq2seq3_5_inference_alpha_plot.png
│   ├── omit_char_R2_bias.png
│   ├── pred_R2.png
│   └── total_R2.png
├── main.py
├── models
│   ├── CA.py
│   ├── IPCA.py
│   ├── Seq2Seq
│   ├── modelBase.py
│   └── seq.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
# raw data
data/
data.zip
__MACOSX/
new_data.zip
__pycache__
models/__pycache__
saved_models
*_loss_*.png
*.ipynb
results/
logs/
R_squares/
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
--------------------------------------------------------------------------------
/.idea/Autoencoder-Asset-Pricing-Models.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Autoencoder-Asset-Pricing-Models

🧐 [Report](https://www.richardsong.space/autoencoder-asset-pricing-models)

## Set Up

```bash
# generate preprocessed data and download portfolio returns
python data_prepare.py

# train all models together
python main.py --Model 'FF PCA IPCA CA0 CA1 CA2 CA3' --K '1 2 3 4 5 6'

# train selected models and values of K, for example
python main.py --Model 'IPCA CA3' --K '5 6'

# analyze characteristics' importance (if needed)
python main.py --Model 'IPCA CA0 CA1 CA2 CA3' --K '5' --omit_char 'absacc acc age agr bm bm_ia cashdebt cashpr cfp cfp_ia chatoia chcsho chempia chinv chpmia convind currat depr divi divo dy egr ep gma grcapx grltnoa herf hire invest lev lgr mve_ia operprof orgcap pchcapx_ia pchcurrat pchdepr pchgm_pchsale pchquick pchsale_pchinvt pchsale_pchrect pchsale_pchxsga pchsaleinv pctacc ps quick rd rd_mve rd_sale realestate roic salecash saleinv salerec secured securedind sgr sin sp tang tb aeavol cash chtx cinvest ear ms nincr roaq roavol roeq rsup stdacc stdcf baspread beta betasq chmom dolvol idiovol ill indmom maxret mom12m mom1m mom36m mom6m mvel1 pricedelay retvol std_dolvol std_turn turn zerotrade'

# analyze models (calculate R^2, plot R^2 tables, bar charts, and the bias heatmap)
python analysis.py
```
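Each run writes its R^2 summary to a timestamped JSON file under `R_squares/`, with the layout `{"models": [...], "omit_char": [...], "R2_total": [...]}` shown by the committed samples. A minimal sketch for turning one of those files into a model-by-K table (the file name below is just one of the samples in this repo):

```python
import json

import pandas as pd

# any run summary under R_squares/ works here
with open('R_squares/2023-06-09_05-24-47.json') as f:
    run = json.load(f)

# entries in "models" look like 'FF_1': the model name, then the number of factors K
df = pd.DataFrame({'model': run['models'], 'R2': run['R2_total']})
df[['name', 'K']] = df['model'].str.rsplit('_', n=1, expand=True)

# pivot into the model-by-K layout used by the result tables below
print(df.pivot(index='name', columns='K', values='R2').round(4))
```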
## Results

### Total R^2 (%)

![Total R^2 table](imgs/R2_total_table.png)

![Total R^2 bars](imgs/total_R2.png)

### Predictive R^2 (%)

![Predictive R^2 table](imgs/R2_pred_table.png)

![Predictive R^2 bars](imgs/pred_R2.png)

### Risk Premia vs. Mispricing

*(Per-model alpha plots of raw portfolio return against pricing error; see `imgs/alpha/`, e.g. `imgs/alpha/IPCA_5_inference_alpha_plot.png`.)*

### Characteristics Importance (reduction in total R^2 (%), K=5)

![Omitted-characteristic R^2 reduction heatmap](imgs/omit_char_R2_bias.png)
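The heatmap is built from an omit-characteristic run such as `R_squares/2023-06-11_18-16-38.json`: for each model, `R2_total` stores the change in total R^2 (full-model R^2 minus omitted-characteristic R^2) for all 94 characteristics. A hedged sketch of reshaping that file into the model-by-characteristic matrix consumed by the commented-out heatmap block in `analysis.py` (model-major ordering is an assumption taken from that block's `reshape(-1, 94)`):

```python
import json

import numpy as np
import pandas as pd

with open('R_squares/2023-06-11_18-16-38.json') as f:
    run = json.load(f)

# one row per model ('IPCA_5', 'CA0_5', ...), one column per omitted characteristic
R_minus = pd.DataFrame(
    np.array(run['R2_total']).reshape(len(run['models']), -1) * 100,  # in %
    index=run['models'],
    columns=run['omit_char'],
)
print(R_minus.iloc[:, :5])  # peek at the first few characteristics
```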
--------------------------------------------------------------------------------
/R_squares/2023-06-09_05-24-47.json:
--------------------------------------------------------------------------------
1 | {"models": ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"], "omit_char": [""], "R2_total": [0.08537139414421824, 0.1576019101919831, 0.1986486217133806, 0.20315476596988524, 0.31397093775365037, 0.3616431120471959]}
--------------------------------------------------------------------------------
/R_squares/2023-06-09_06-07-56.json:
--------------------------------------------------------------------------------
1 | {"models": ["CA0_1", "CA0_2", "CA0_3", "CA0_4", "CA0_5", "CA0_6", "CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA1_6", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA2_6", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5", "CA3_6"], "omit_char": [""], "R2_total": [
2 | 0.5677639760646643, 0.6761207104643877, 0.7076066105263652, 0.6913661386379286, 0.662602500272096, 0.7110612627936461,
3 | 0.5517562872860107, 0.7025783407556893, 0.685776051607686, 0.6664443573030849, 0.7006957708196195, 0.7052861947690043,
4 | 0.5967130036325399, 0.6626964974803786, 0.6608531336078073, 0.7070314610106503, 0.6462021917956272, 0.6767568343936613,
5 | 0.5531676704426002, 0.5249032928672436, 0.5642100044551001, 0.5458004779254889, 0.5558832641978944, 0.5235321637890534]}
--------------------------------------------------------------------------------
/R_squares/2023-06-09_09-08-26.json:
--------------------------------------------------------------------------------
1 | {"models": ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"], "omit_char": [""], "R2_total": [0.4061160826307103, 0.5300364609271587, 0.5913033228863098, 0.6246396597772854, 0.6467919712825208, 0.6720178863573743]}
--------------------------------------------------------------------------------
/R_squares/2023-06-11_18-16-38.json:
--------------------------------------------------------------------------------
1 | {"models": ["IPCA_5", "CA0_5", "CA1_5", "CA2_5", "CA3_5"], "omit_char": ["absacc", "acc", "age", "agr", "bm", "bm_ia", "cashdebt", "cashpr", "cfp", "cfp_ia", "chatoia", "chcsho", "chempia", "chinv", "chpmia", "convind", "currat", "depr", "divi", "divo", "dy", "egr", "ep", "gma", "grcapx", "grltnoa", "herf", "hire", "invest", "lev", "lgr", "mve_ia", "operprof", "orgcap", "pchcapx_ia", "pchcurrat", "pchdepr", "pchgm_pchsale", "pchquick", "pchsale_pchinvt", "pchsale_pchrect", "pchsale_pchxsga", "pchsaleinv", "pctacc", "ps", "quick", "rd", "rd_mve", "rd_sale", "realestate", "roic", "salecash", "saleinv", "salerec", "secured", "securedind", "sgr", "sin", "sp", "tang", "tb", "aeavol", "cash", "chtx", "cinvest", "ear", "ms", "nincr", "roaq", "roavol", "roeq", "rsup", "stdacc", "stdcf", "baspread", "beta", "betasq", "chmom", "dolvol", "idiovol", "ill", "indmom", "maxret", "mom12m", "mom1m", "mom36m", "mom6m", "mvel1", "pricedelay", "retvol", "std_dolvol", "std_turn", "turn", "zerotrade"], "R2_total": [1.5442819689237552e-05, -0.0008641544212459884, -0.0001747076111721091, -6.028723743389808e-05, 0.0004881307548909586, -9.453040450568828e-05, 0.0011809606037788134, -0.00047160860320083486, 0.0004497328261676703, 0.00035797708987406196, 0.00046514268643882417, 8.010321267071241e-05, 0.0005184883538837948, -0.00037818109386877907, -5.991642326275137e-05, 0.0002601835988962353, 0.0011439216429843801, 0.0017112133538985663, -4.350496459193387e-06, -0.00019380257770462705, 0.00043948388552850215, 0.00031580941297537635, 0.003338161066369527, 0.0021027676465422696, -0.0003171094148499698, 0.0014555971916409005, -0.0007270330656120594, 0.0016980056916262587, 0.0009553837759342931, 0.0027868308676146647, 0.00026402683391868464, 0.0007850527331111357, 0.0020449984309897085, 0.001054644787377823, 5.184771678434785e-06, 0.002178892159566903, -0.00017501728488655832, -0.0002577126732409285, 0.0009543687413273716, 0.0005083110135046809, 7.62809849720325e-05, 8.658720973830913e-05, 0.0004486453095020604, 0.0008906763104503668, 0.0010605476004982295, 0.0018694816639623912, 0.0002959972532902144, 0.0017008605711600344, 0.0007976861126247625, -0.002652802485675565, 0.002662754828368419, 9.917422749694538e-05, 5.964285215753762e-05, 0.0022189518272991426, -7.435817815093504e-05, -0.001698616694105315, 0.001233404502690938, 0.00047054690788606024, 0.0026004651808273493, 0.0011172780845385422, -0.0001482646856509895, -0.00041623695665415905, -0.0015909825974204095, -0.00031927406061749153, -0.00043655338823822554, 0.0001361268145957384, -0.00032109572478289383, 8.887812899915914e-05, 0.002067090875226163, 0.0010982209839661694, -0.0005637330110257466, -0.0005329012820900481, 0.0009084092837141622, 0.0021178568491788674, 0.0033586699296590528, 0.03875439641206746, 0.007293900327323799, 0.004763913685328358, 0.004817433432478202, 0.007084909806944539, 0.0025907095069127584, 0.0080607854611624, 0.0057093251947115675, 0.009105559086416637, 0.011542299391846145, 0.0028401829485293906, 0.02709723337680492, 0.004118952479980065, -0.00025137933247576516, 0.018393986829754794, 0.003372828092497837, 0.0013541844149635995, 0.00793699314016949, 0.0036518023385704312, -0.01840451865415549, 0.0035504742986338655, -0.02124055707570216, -0.003382787260082454, 0.001661945882751592, -8.216386260140318e-05, -0.011643569891665484, -0.0015886009587249283, -0.0060291572912980484, -0.003934251792188981, -0.0003035868273979503, 0.0025248914169603287, -0.0004996588466505969, 0.0009582432785717465, -0.0017444408411588785, 
-0.004805523268163303, -0.002931825562557, -0.014321167551977099, -0.005731926091473105, -0.001628883333692932, -0.008380269170191412, -0.002495037120210153, -0.013508701388505795, 0.001503549704314544, -0.0007751891041964942, -0.0007030969454133729, -0.0028340425016759596, 0.002562114878683408, 0.0007168131966963642, 0.0005966917290103346, -0.0009823429746401713, -0.008049996513831648, -0.006935426188849236, -0.013752522526764732, -0.0005069144917650981, 0.0008889578779507357, -0.004511542543265024, 0.000512296750466934, -0.0002943171885458895, -0.00022271179096300386, -0.0021640651213321593, 0.0002623110944273144, -0.00026747487114753277, -0.0027015552337811277, -0.012375056721493305, -0.00010214235568639651, -0.012322425151997773, -0.004802562616962769, -0.0040789335496304036, 0.0004565130977485232, -0.0004572283160489965, 0.011102582602458222, -0.0020968867403851066, -0.0011853937188981423, -0.005591149809828, -0.010832419289181106, -0.001980246263801777, -0.0017540247099745443, 0.0018022912780594202, -0.0005056805241561158, -0.0017686227072655214, -0.0003527295679881526, 0.016529002299418005, -0.0014105862431510463, -0.0005732444581987295, -0.002869230597707273, -0.004965959446656232, 0.0004066440403122096, -0.009874817195565821, -0.030274345848102402, -0.012926682404285739, -0.0008783158084264553, -0.006107566065733372, -0.010184352477880187, -0.025327970485242712, -0.035590091815137614, -0.0303458179931807, -0.004102272911347127, -0.0004503926531160829, -0.029369754046835173, -0.007045255786105931, 0.0016875597888881266, -0.03481944581257812, -0.006678644726203498, -0.014538583275030104, -0.0007153352724629247, -0.011420023006262658, -0.004652763809388283, -0.005798820819408745, -0.024656178382399974, 0.0011578591100704916, -0.005280277401536693, 0.0010356576649497296, 0.003940580861575116, -0.005276868987642791, 0.00770392753313931, -0.007944469488529449, 0.001991397641811221, 0.003759694548383874, 0.0011195084904335184, -0.007110697689415635, 0.0032203225902148747, -0.007578862375151041, -0.0060194039077946515, 0.00024130283639867134, 0.0013235404395363082, 0.00043243599875075756, 0.001123780074328562, 0.00050973691967926, -0.00034757211361624574, -0.002281972957101308, 0.000845475795447137, -0.0020588684564848414, 0.0021372282150149413, -0.00019856545094609768, -0.0017913068956328937, 0.0006373160107784326, 0.0012739985965753986, -0.001696571054130902, -0.0002589327068805991, -0.00022288072963450034, -0.0031104722681001284, -0.0008235753576365523, -0.002447807642767752, 0.002004633800402944, -0.003224266216026561, 0.00864564856685679, -0.00554499989692514, -0.00026444194813035615, -0.0010032832870013886, -0.0032213627206253426, -0.000911160024681501, 0.0004076477827085201, 0.0007995060213979999, -0.0005885363844878588, 0.0032955989913380224, 0.0004894614286667931, 0.002261644799368967, -0.0032746021967355876, -0.0005524850007211368, -0.0031971616217488785, 0.0060628232866996035, -0.0018951843317909223, -0.004804670310596504, -0.016865397416521932, 0.0022279632468085175, 0.0004397955748330906, -0.0012895574791834674, -0.0052936056246406515, 0.001047196960554997, 0.0014035810745537391, -0.0021789054250395123, 0.004835618627879845, -0.004423137045597492, 0.0033839427269372058, 0.0004317648131846319, 0.010148684107688322, -0.0005852067203551137, -0.00023673829544612612, -0.0004969772951760598, -0.0007139106069110612, -0.0006781488781816281, -0.004463513032158084, -0.008314803798795345, 0.005664192158806869, -0.0017831527387602852, -0.005528449668342539, -0.00018355344047649158, 
-0.013367195182721558, 0.010476427486050932, -0.0030995623273649686, -0.0033128122970741414, 0.004656006266778645, -0.010849289595229128, -0.002397579088451285, -0.009958364378056861, -0.0011444352933996926, 0.0021166335185883733, -0.018000165477872865, -0.0032624694006755384, -0.004127065179238776, 0.012949255788881175, -0.0018095543583533935, -0.005592349116810835, 0.0037267653611244844, -0.0038516820526487416, 0.024489734744573943, 0.004683327531111781, 0.001269388150738071, -0.0017407506416220464, 0.011902507395885942, 0.004639839618832409, 0.002506386658879589, 0.0003685885982728232, 0.009029032176506746, 0.003135888070068371, -0.0016520809387076119, 0.003088001303703236, 0.00014727469647846103, 0.0035561159496605432, -2.876814735663924e-05, -1.1060450691990908e-06, -0.00046745210145082705, 0.0012870350251698026, 0.0042191397766525585, 0.00655774915823315, 0.0005207348452095362, 0.0023795699587515484, 0.0043810533565454834, 0.004609257547817935, 0.01290056247180027, 0.0014757109716835304, 0.0010271818164879765, 0.0015607194203527408, 0.0004559197274759397, 0.00033707593565324157, 0.002642703381714573, 0.006501328950226926, 0.0008704244332061739, 0.002463581165456641, 0.004364567880446146, -0.0018809110325389566, 0.00017150256113329654, 0.00017711236826822851, -7.174454357028459e-05, 3.2134260813054816e-05, -0.0018074430724653867, 0.0007488238809050252, -0.00021827012815400781, -0.0002562985350954561, 0.00023716735492340657, 0.001413468319080624, 0.006882923410709174, 0.0007931366577244026, -0.0004717657827425503, 0.009100363328674477, 0.005404627376927373, -0.004026623783044081, 0.005351064544848239, -0.0035389768278023537, 0.002068255236817085, -0.002090048742856676, -0.0019271886285645579, 0.0011498135139685894, 0.0018314054258520285, -0.00026283971236906734, 0.0015413975782866407, -0.004150725533632493, -0.001320978589455457, 0.00031339691627352284, -0.0016566925476326766, -0.0006220275171834322, 7.44998579660372e-05, -0.0002999797537633908, 0.0007030142990428478, 0.0007913912947924429, 0.0011556895007599488, 0.011568536396979079, 0.0028225455085252316, 0.00035905603670860486, -0.000841494670420162, 0.009615860981466828, 0.03377755052540243, 0.0789273078045386, 0.05525867670164508, -0.0010166697260519664, 0.00957764643602288, 0.06977357007791107, 0.006012541368369928, 0.002010875608860041, 0.03032162865092458, 0.002846750112344365, -0.00016217114898731122, 0.0060712783132312875, 0.006622682402069313, 0.006219871147256417, 0.0018001652923477218, 0.06992553842572524, 0.006724233973021909, 0.007539747957793552, 0.020538531613926936, 0.007943897075231354, 0.008360812789827254, 0.003290025441256006, 0.020634865090878085, 0.0008080876743085108, -0.0030892486063024416, 0.00041600088024251747, 0.0070829632445147395, 0.0036536124147609206, -0.0016869382360118479, 0.0034263287960714095, -0.0006554657701027811, 0.0029753478023664126, 0.0006174977901268752, 7.101865790082318e-05, -0.00016257411215181428, -0.0009369820715646737, 0.0005434387564733356, 0.0012271814940856274, 0.0017175994942115747, 0.0005473661944555008, 0.005273938186585614, 0.0027605830798231867, 0.0053267195601733874, -0.0008641356489738072, 0.0003752916592929534, -8.763321084148679e-05, 0.0015245669500787429, -0.00012165228700977693, 0.0007204145703794129, 7.33961001830874e-06, 0.0007303311512011357, 0.002484907097797917, 0.012401690162314405, 0.002021342443194185, -0.00036144987650921223, -0.0012055569846747272, 0.00085115010144865, 2.829758256761572e-05, -3.762854703992513e-05, 0.0006878905429239524, -0.0007663772085586551, 
0.0001906678540952722, 0.0001922411569781346, 0.001457809117053288, 0.0061655533145379415, -0.0002077338553436725, 0.004428694252206933, 0.007572691730434067, 0.015522139371950017, -0.0007973496961124482, 0.010913869326709902, -0.0009089608282075723, 0.001704153441209666, -0.001975343179565603, 0.0007767569381108563, 0.0001946668039775057, -0.0003049050619283733, -0.0006077903992949274, -0.0014501145568337481, 0.0024408470743457755, 0.0024867530414642847, -0.00028112355434628533, 0.00029719386409921, -0.0005815474877897131, -9.683347434896739e-05, 0.0015214781874257621, 0.00038445751824878194, -0.0006103274076518783, 0.0016081931094918955, 0.008255889709628095, 0.013087103367242059, 0.0004902027422780675, 0.007683661011998355, 0.0006227685572650632, 0.040959894211064496, 0.04659799783753149, 0.05623489106446333, -0.0011074112724636098, 0.0026292183555661763, 0.06648468153042297, 0.008507328563525984, 0.0012577368022517188, 0.05146040805311081, 0.006280620030596484, 0.0025791825898906495, 0.006777579940425715, 0.009547184931625763, 0.008159048875296504, 0.0008150683146730398, 0.06841724518294723, -0.0003192050956154491, 0.005694154235833304, 0.0024969631810545234, -0.001106586510389418]}
--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
import os

import pandas as pd
import numpy as np
from utils import *
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')


def calculate_R2(model, type, input=None, complete_r=None):
    portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')
    oos_ret = portfolio_ret.loc[(portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)]

    if not isinstance(input, np.ndarray):
        # no raw output array passed in: read the saved model output from disk
        if isinstance(model, str):
            output_path = f'results/{type}/{model}_{type}.csv'
        else:
            output_path = f'results/{type}/{model.name}_{type}.csv'
        model_output = pd.read_csv(output_path)
    else:
        model_output = input
        model_output = pd.DataFrame(model_output, columns=CHARAS_LIST)
        model_output['DATE'] = oos_ret['DATE'].to_list()

    for col in model_output.columns:  # work around a format error: some values are saved as strings like '[0.1]'
        model_output[col] = model_output[col].apply(lambda x: float(str(x).replace('[', '').replace(']', '')))

    residual_square = ((oos_ret.set_index('DATE') - model_output.set_index('DATE'))**2).dropna()
    residual_square = (1 - (residual_square == np.inf) * 1.0) * residual_square  # zero out Inf outliers

    total_square = oos_ret.set_index('DATE')**2
    total_square = (1 - (total_square == np.inf) * 1.0) * total_square  # zero out Inf outliers

    model_output_R2 = 1 - np.sum(residual_square.values)/np.sum(total_square.values)

    if not isinstance(input, np.ndarray):
        return model_output_R2

    else:
        no_omit_output = complete_r
        no_omit_output = pd.DataFrame(no_omit_output, columns=CHARAS_LIST)
        no_omit_output['DATE'] = oos_ret['DATE'].to_list()

        no_omit_residual_square = ((oos_ret.set_index('DATE') - no_omit_output.set_index('DATE'))**2).dropna()
        no_omit_residual_square = (1 - (no_omit_residual_square == np.inf) * 1.0) * no_omit_residual_square  # zero out Inf outliers

        no_omit_model_output_R2 = 1 - np.sum(no_omit_residual_square.values)/np.sum(total_square.values)

        return no_omit_model_output_R2 - model_output_R2  # difference in R^2, i.e. the importance of the omitted characteristic

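# Example usage (a sketch, assuming main.py has already populated results/{type}/):
#   calculate_R2('CA2_5', 'inference')  # -> total R^2 (fitted returns)
#   calculate_R2('CA2_5', 'predict')    # -> predictive R^2 (predicted returns)
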
def alpha_plot(model, type, save_dir='imgs'):
    if 'alpha' not in os.listdir(save_dir):
        os.mkdir(f'{save_dir}/alpha')

    portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')
    oos_result = portfolio_ret.loc[(portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)].set_index('DATE')

    output_path = f'results/{type}/{model.name}_{type}.csv'
    inference_result = pd.read_csv(output_path)
    inference_result = inference_result.set_index('DATE')

    pricing_error_analysis = []
    for col in CHARAS_LIST:
        raw_return = oos_result[col].mean()
        error = oos_result[col] - inference_result[col]
        alpha = error.mean()
        t_stat = abs(error.mean()/error.std()) * np.sqrt(oos_result.shape[0])
        pricing_error_analysis.append([raw_return, alpha, t_stat])

    pricing_error_analysis = pd.DataFrame(pricing_error_analysis, columns=['raw ret', 'alpha', 't_stat'], index=CHARAS_LIST)

    lower_point = min(np.min(pricing_error_analysis['raw ret']), np.min(pricing_error_analysis['alpha'])) * 1.15
    upper_point = max(np.max(pricing_error_analysis['raw ret']), np.max(pricing_error_analysis['alpha'])) * 1.15

    significant_mask = pricing_error_analysis['t_stat'] > 3

    plt.scatter(pricing_error_analysis.loc[significant_mask]['raw ret'], pricing_error_analysis.loc[significant_mask]['alpha'], marker='^', color='r', alpha=0.6, label=f'#Alphas(|t|>3.0)={np.sum(significant_mask*1.0)}')
    plt.scatter(pricing_error_analysis.loc[~significant_mask]['raw ret'], pricing_error_analysis.loc[~significant_mask]['alpha'], marker='o', color='b', alpha=0.6, label=f'#Alphas(|t|<3.0)={len(CHARAS_LIST)-np.sum(significant_mask*1.0)}')
    plt.plot(np.linspace(lower_point, upper_point, 10), np.linspace(lower_point, upper_point, 10), color='black')  # 45-degree reference line

    plt.ylabel('Alpha (%)')
    plt.xlabel('Raw Return (%)')
    plt.legend()

    plt.title(model.name)
    plt.savefig(f'{save_dir}/alpha/{model.name}_inference_alpha_plot.png')
    plt.close()

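# Note: t_stat is the one-sample t-statistic of the mean pricing error,
# |mean(e)| / (std(e) / sqrt(T)), computed above as |mean(e)/std(e)| * sqrt(T),
# where T is the number of out-of-sample months.
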
def plot_R2_bar(R_df, type):

    R_df['Model'] = R_df[0].apply(lambda x: x.split('_')[0])

    labels = ['K=1', 'K=2', 'K=3', 'K=4', 'K=5']
    # FF = (R_df.loc[R_df['Model']=='FF'][1]*100).to_list()
    # PCA = (R_df.loc[R_df['Model']=='PCA'][1]*100).to_list()
    # IPCA = (R_df.loc[R_df['Model']=='IPCA'][1]*100).to_list()
    # CA0 = (R_df.loc[R_df['Model']=='CA0'][1]*100).to_list()
    # CA1 = (R_df.loc[R_df['Model']=='CA1'][1]*100).to_list()
    CA2 = (R_df.loc[R_df['Model']=='CA2'][1]*100).to_list()
    # CA3 = (R_df.loc[R_df['Model']=='CA3'][1]*100).to_list()
    seq2seq1 = (R_df.loc[R_df['Model'] == 'seq2seq1'][1] * 100).to_list()
    seq2seq2 = (R_df.loc[R_df['Model'] == 'seq2seq2'][1] * 100).to_list()
    seq2seq3 = (R_df.loc[R_df['Model'] == 'seq2seq3'][1] * 100).to_list()

    x = np.arange(len(labels))  # label locations
    width = 0.11  # bar width

    fig, ax = plt.subplots(figsize=(15, 5))
    # ax.bar(x - width*3, FF, width, label='FF', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[1]))
    # ax.bar(x - width*2, PCA, width, label='PCA', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[2]))
    # ax.bar(x - width, IPCA, width, label='IPCA', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[3]))
    # ax.bar(x + 0.00, CA0, width, label='CA0', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[4]))
    # ax.bar(x - width*3, CA1, width, label='CA1', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[1]))
    ax.bar(x - width, CA2, width, label='CA2', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[1]))
    # ax.bar(x - width, CA3, width, label='CA3', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[3]))
    ax.bar(x + 0.00, seq2seq1, width, label='seq2seq1', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[2]))
    ax.bar(x + width, seq2seq2, width, label='seq2seq2', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[3]))
    ax.bar(x + width * 2, seq2seq3, width, label='seq2seq3', color=plt.get_cmap('OrRd')(np.linspace(0, 1, 8)[4]))

    ax.set_ylabel(f'Portfolio {type} R^2 (%)')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    fig.tight_layout()

    plt.savefig(f'imgs/{type}_R2.png')
    plt.close()


def plot_R2_table(R_df, type):
    plt.figure(dpi=200)

    for col in R_df.columns:
        R_df[col] = R_df[col].apply(lambda x: round_number(x))

    R_df = R_df.reset_index()
    R_df.columns = ['Model', 'K=1', 'K=2', 'K=3', 'K=4', 'K=5']

    fig_total = ff.create_table(R_df,
                                colorscale=[[0, 'white'],
                                            [0.01, 'lightgrey'],
                                            [1.0, 'white']],
                                font_colors=['#000000', '#000000', '#000000'])
    fig_total.update_layout(
        autosize=False,
        width=500,
        height=200,
    )
    fig_total.write_image(f"imgs/R2_{type}_table.png", scale=4)


def round_number(num):
    # format as a percentage string with exactly two decimal places
    num = str(round(num*100, 2))
    while len(num.split('.')[1]) < 2:
        num = num + '0'
    return num


if __name__ == "__main__":
    # CAs = ["CA1_1", "CA1_2", "CA1_3", "CA1_4", "CA1_5", "CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5", "CA3_1", "CA3_2", "CA3_3", "CA3_4", "CA3_5"]
    CAs = ["CA2_1", "CA2_2", "CA2_3", "CA2_4", "CA2_5"]
    Seqs = ["seq2seq1_1", "seq2seq1_2", "seq2seq1_3", "seq2seq1_4", "seq2seq1_5", "seq2seq2_1", "seq2seq2_2", "seq2seq2_3", "seq2seq2_4", "seq2seq2_5", "seq2seq3_1", "seq2seq3_2", "seq2seq3_3", "seq2seq3_4", "seq2seq3_5"]
    # FFs = ["FF_1", "FF_2", "FF_3", "FF_4", "FF_5", "FF_6"]
    # PCAs = ["PCA_1", "PCA_2", "PCA_3", "PCA_4", "PCA_5", "PCA_6"]
    # IPCAs = ["IPCA_1", "IPCA_2", "IPCA_3", "IPCA_4", "IPCA_5", "IPCA_6"]
    models = CAs + Seqs

    ## Plot R^2 bars
    total_R2 = []
    for m in models:
        total_R2.append(calculate_R2(m, 'inference'))
    R_total = pd.DataFrame([models, total_R2]).T

    predict_R2 = []
    for m in models:
        predict_R2.append(calculate_R2(m, 'predict'))
    R_pred = pd.DataFrame([models, predict_R2]).T

    plot_R2_bar(R_total, 'total')
    plot_R2_bar(R_pred, 'pred')

    ## Save R^2 tables
    R_total_df = pd.DataFrame(np.array(total_R2).reshape(-1, 5), columns=['K=1', 'K=2', 'K=3', 'K=4', 'K=5'], index=['CA2', 'seq2seq1', 'seq2seq2', 'seq2seq3'])
    R_pred_df = pd.DataFrame(np.array(predict_R2).reshape(-1, 5), columns=['K=1', 'K=2', 'K=3', 'K=4', 'K=5'], index=['CA2', 'seq2seq1', 'seq2seq2', 'seq2seq3'])

    plot_R2_table(R_total_df, 'total')
    plot_R2_table(R_pred_df, 'pred')

    ## Plot characteristics-importance heatmap
    # models = ["CA1_5", "CA2_5", "CA3_5"]
    # TODO: paste results from R_squares/
    # R2_omit = []
    # R_minus = pd.DataFrame(np.array(R2_omit).reshape(-1, 94)*100, index=models, columns=CHARAS_LIST).T
    # char_ranks = R_minus.T.sum().argsort().argsort().index.to_list()
    # char_ranks.reverse()

    # plt.figure(figsize=(8, 15), dpi=200)
    # sns.heatmap(R_minus.T[char_ranks].T, cmap='Blues', linewidths=0.6)
    # plt.savefig('imgs/omit_char_R2_bias.png', bbox_inches='tight')
    # plt.close()
--------------------------------------------------------------------------------
/data_prepare.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None
from tqdm import tqdm

import os
import pickle
import zipfile
from joblib import delayed, Parallel
from itertools import product
import utils

import warnings

warnings.filterwarnings('ignore')

# if 'new_data.zip' not in os.listdir():
#     os.system('wget https://cloud.tsinghua.edu.cn/f/07d6a0223d054247af26/?dl=1 -O new_data.zip')

# if 'data' not in os.listdir():
#     os.mkdir('data')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/179082ecf0f147a4840c/?dl=1 -O portfolio_ret.pkl')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/b93c6ae7e2014d3a951e/?dl=1 -O ff5.csv')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/5f077be9eda0428ab7e5/?dl=1 -O UMD.csv')
#     os.system('wget https://cloud.tsinghua.edu.cn/f/a916da12d5a9450eb0df/?dl=1 -O p_charas.pkl')
#
#     os.system('mv portfolio_ret.pkl data')
#     os.system('mv ff5.csv data')
#     os.system('mv UMD.csv data')
#     os.system('mv p_charas.pkl data')


# NOTE: hardcoded local path to the unpacked new_data.zip; adjust for your machine
with open('D:/Autoencoder/data/new_data/data/month_ret.pkl', 'rb') as f:
    print('Reading month_ret.pkl', end=' ')
    mon_ret = pd.read_pickle(f)
    mon_ret.to_pickle('data/month_ret.pkl')
    print('Done!')

with open('D:/Autoencoder/data/new_data/data/datashare.pkl', 'rb') as f:
    print('Reading datashare.pkl', end=' ')
    datashare = pd.read_pickle(f)
    datashare['DATE'].drop_duplicates().reset_index(drop=True).to_pickle('data/mon_list.pkl')
    # datashare.to_pickle('data/datashare.pkl')
    print('Done!')


def pre_process(date):
    cross_slice = datashare.loc[datashare.DATE == date].copy(deep=False)
    omitted_mask = 1.0 * np.isnan(cross_slice.loc[cross_slice['DATE'] == date])
    # fill NaN values with each factor's cross-sectional median
    cross_slice.loc[cross_slice.DATE == date] = cross_slice.fillna(0) + omitted_mask * cross_slice.median()
    # if a factor is NaN for every stock, fill with zero
    cross_slice.loc[cross_slice.DATE == date] = cross_slice.fillna(0)

    re_df = []
    # rank normalization
    for col in utils.CHARAS_LIST:
        series = cross_slice[col]
        de_duplicate_slice = pd.DataFrame(series.drop_duplicates().to_list(), columns=['chara'])
        series = pd.DataFrame(series.to_list(), columns=['chara'])
        # sort and assign ranks; equal values share the same rank
        de_duplicate_slice['sort_rank'] = de_duplicate_slice['chara'].argsort().argsort()
        rank = pd.merge(series, de_duplicate_slice, left_on='chara', right_on='chara', how='right')['sort_rank']
        # if all values are zero, the result will contain NaN
        rank_normal = ((rank - rank.min()) / (rank.max() - rank.min()) * 2 - 1)
        re_df.append(rank_normal)
    re_df = pd.DataFrame(re_df, index=utils.CHARAS_LIST).T.fillna(0)
    re_df['permno'] = list(cross_slice['permno'].astype(int))
    re_df['DATE'] = list(cross_slice['DATE'].astype(int))

    return re_df[['permno', 'DATE'] + utils.CHARAS_LIST]

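# Note: the rank normalization above maps each characteristic's cross-sectional
# ranks linearly onto [-1, 1]; tied raw values share a rank (via the merge on
# de-duplicated values), and all-NaN or constant columns fall back to 0.
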
def cal_portfolio_ret(it, df):
    d, f = it[0], it[1]
    # long portfolio: top decile by characteristic (quantile 0.9~1.0); short portfolio: bottom decile (quantile 0.0~0.1)
    long_portfolio = \
        df.loc[df.DATE == d][['permno', f]].sort_values(by=f, ascending=False)[:df.loc[df.DATE == d].shape[0] // 10][
            'permno'].to_list()
    short_portfolio = \
        df.loc[df.DATE == d][['permno', f]].sort_values(by=f, ascending=False)[-df.loc[df.DATE == d].shape[0] // 10:][
            'permno'].to_list()
    # long-short portfolio return
    long_ret = mon_ret.loc[mon_ret.date == d].drop_duplicates('permno').set_index('permno').reindex(long_portfolio)[
        'ret-rf'].dropna().mean()
    short_ret = mon_ret.loc[mon_ret.date == d].drop_duplicates('permno').set_index('permno').reindex(short_portfolio)[
        'ret-rf'].dropna().mean()
    chara_ret = 0.5 * (long_ret - short_ret)

    return chara_ret

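# Note: the managed-portfolio return is 0.5 * (mean top-decile excess return
# - mean bottom-decile excess return), computed from the 'ret-rf' column.
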
def cal_portfolio_charas(month, df):
    mon_portfolio_chara = []
    p_name = ['p_' + chr for chr in utils.CHARAS_LIST]
    for chr in utils.CHARAS_LIST:
        long_portfolio = df.loc[df.DATE == month].sort_values(by=chr, ascending=False).reset_index(drop=True)[
            :df.loc[df.DATE == month].shape[0] // 10]['permno'].to_list()
        short_portfolio = df.loc[df.DATE == month].sort_values(by=chr, ascending=False).reset_index(drop=True)[
            -df.loc[df.DATE == month].shape[0] // 10:]['permno'].to_list()

        long_charas = df.loc[df.DATE == month].set_index('permno').loc[long_portfolio][utils.CHARAS_LIST]
        short_charas = df.loc[df.DATE == month].set_index('permno').loc[short_portfolio][utils.CHARAS_LIST]

        mon_portfolio_chara.append([month] + (0.5 * (long_charas.mean() - short_charas.mean())).to_list())

    return pd.DataFrame(mon_portfolio_chara, index=p_name, columns=['DATE'] + utils.CHARAS_LIST)


if __name__ == '__main__':
    # pre-process share data
    processed_df = Parallel(n_jobs=-1)(delayed(pre_process)(d) for d in
                                       tqdm(datashare.DATE.drop_duplicates().to_list(), colour='green',
                                            desc='Processing'))
    processed_df = pd.concat(processed_df)

    ## TODO: calculate portfolio returns (or download preprocessed data)
    # iter_list = list(product(datashare.DATE.drop_duplicates(), utils.CHARAS_LIST))
    # portfolio_rets = Parallel(n_jobs=-1)(delayed(cal_portfolio_ret)(it, df=processed_df) for it in tqdm(iter_list, colour='green', desc='Calculating'))
    # portfolio_rets = pd.DataFrame(np.array(portfolio_rets).reshape(-1, 94), index=datashare.DATE.drop_duplicates(), columns=utils.CHARAS_LIST).reset_index()
    # portfolio_rets[utils.CHARAS_LIST] = portfolio_rets[utils.CHARAS_LIST].astype(np.float16)

    ## TODO: calculate portfolio characteristics (or download preprocessed data)
    # mon_list = pd.read_pickle('data/mon_list.pkl')
    # _portfolio_chara_set = Parallel(n_jobs=-1)(delayed(cal_portfolio_charas)(mon, df=processed_df) for mon in tqdm(mon_list, colour='yellow', desc='Calculating P characteristics'))
    # p_charas = _portfolio_chara_set[0].copy(deep=False)
    # for tdf in _portfolio_chara_set[1:]:
    #     p_charas = pd.concat([p_charas, tdf])

    processed_df.to_pickle('data/datashare_re.pkl')
    # portfolio_rets.to_pickle('data/portfolio_rets.pkl')
    # p_charas.to_pickle('data/p_charas.pkl')

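# NOTE: analysis.py reads data/portfolio_ret.pkl (the downloaded file), while the
# commented block above saves to data/portfolio_rets.pkl; rename the output if
# you regenerate portfolio returns locally instead of downloading them.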
--------------------------------------------------------------------------------
/imgs/R2_pred_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/R2_pred_table.png
--------------------------------------------------------------------------------
/imgs/R2_total_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/R2_total_table.png
--------------------------------------------------------------------------------
/imgs/alpha/CA0_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA0_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA0_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA0_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA0_5_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_5_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA0_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA0_6_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA0_6_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA1_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA1_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA1_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA1_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA1_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA1_6_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA1_6_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA2_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA2_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA2_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA2_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA2_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA2_6_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA2_6_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA3_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA3_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA3_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA3_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA3_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/CA3_6_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/CA3_6_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/FF_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/FF_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/FF_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/FF_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/FF_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/FF_6_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/FF_6_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/IPCA_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/IPCA_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/IPCA_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/IPCA_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/IPCA_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/IPCA_6_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/IPCA_6_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/PCA_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/PCA_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/PCA_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/PCA_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/PCA_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/PCA_6_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/PCA_6_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq1_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq1_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq1_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq1_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq1_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq1_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq2_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq2_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq2_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq2_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq2_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq2_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq3_1_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_1_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq3_2_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_2_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq3_3_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_3_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq3_4_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_4_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/alpha/seq2seq3_5_inference_alpha_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/alpha/seq2seq3_5_inference_alpha_plot.png
--------------------------------------------------------------------------------
/imgs/omit_char_R2_bias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/omit_char_R2_bias.png
--------------------------------------------------------------------------------
/imgs/pred_R2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/pred_R2.png
--------------------------------------------------------------------------------
/imgs/total_R2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanWangyl/Replication-Autoencoder-Asset-Pricing-Models/f816572140558c84388c60316c3abd851e60ccd0/imgs/total_R2.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | #from models.PCA import PCA
3 | #from models.FF import FF
4 | #from models.IPCA import IPCA
5 | from models.CA import CA1, CA2, CA3
6 | from models.seq import seq2seq1, seq2seq2, seq2seq3
7 |
8 | import gc
9 | import argparse
10 | import pandas as pd
11 | import numpy as np
12 | import time
13 | import json
14 | from tqdm import tqdm
15 | import utils
16 | from analysis import *
17 | import matplotlib.pyplot as plt
18 | from itertools import product
19 | import os
20 |
21 | import warnings
22 | warnings.filterwarnings('ignore')
23 |
24 |
25 | def model_inference_and_predict_CA(model):
26 | """
27 | Inference and prediction of NN models.
28 | Returns: {model.name}_inference.csv & {model.name}_predict.csv, saved under 'results/'
29 | """
30 | model = model.to('cuda')
31 | mon_list = pd.read_pickle('data/mon_list.pkl')
32 | test_mons = mon_list.loc[(mon_list >= model.test_period[0])]
33 |
34 | if not len(model.omit_char): # no omit characteristics
35 | inference_result = pd.DataFrame()
36 | predict_result = pd.DataFrame()
37 | else:
38 | inference_result = []
39 |
40 | T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')
41 |
42 | stock_index = pd.Series(dtype=np.int64)
43 | for g in T_bar: # rolling train, refit once a year
44 | T_bar.set_postfix({'Year': g[0]})
45 |
46 | model.reset_weight()
47 | model.release_gpu()
48 | # release GPU memory
49 | for _ in range(6): # call function multiple times to clear the cuda cache
50 | torch.cuda.empty_cache()
51 |
52 | train_loss, val_loss = model.train_model()
53 | # plot loss
54 | plt.plot(train_loss, label='train_loss')
55 | plt.plot(val_loss, label='val_loss')
56 | plt.legend()
57 | plt.savefig(f'results/train_loss/{model.name}_loss_{g[0]}.png')
58 | plt.close()
59 |
60 | for m in g[1].to_list():
61 | m_stock_index, _, _, _ = model._get_item(m)
62 | stock_index = pd.concat([stock_index, pd.Series(m_stock_index)]).drop_duplicates().astype(int)
63 |
64 | if not len(model.omit_char): # no omit characteristics
65 | # move inference_R and predict_R to cpu
66 | inference_R = model.inference(m) # return (N, 1)
67 | inference_R = inference_R.cpu().detach().numpy()
68 | inference_R = pd.DataFrame(inference_R, index=m_stock_index, columns=[m])
69 | inference_result = pd.concat([inference_result.reset_index(drop=True), inference_R.reset_index(drop=True)], axis=1) # (N, T)
70 |
71 | predict_R = model.predict(m) # return (N, 1)
72 | predict_R = predict_R.cpu().detach().numpy()
73 | predict_R = pd.DataFrame(predict_R, index=m_stock_index, columns=[m])
74 | predict_result = pd.concat([predict_result.reset_index(drop=True), predict_R.reset_index(drop=True)], axis=1) # (N, T)
75 |
76 | else:
77 | inference_R = model.inference(m) # return (N, m), m is the length of omit_char
78 | inference_result.append(inference_R) # (T, N, m)
79 |
80 | # refit: change train period and valid period
81 | model.refit()
82 |
83 | if not len(model.omit_char):
84 | inference_result = pd.DataFrame(inference_result.values.T, index=test_mons, columns=CHARAS_LIST)
85 | inference_result.to_csv(f'results/inference/{model.name}_inference.csv')
86 |
87 | predict_result = pd.DataFrame(predict_result.values.T, index=test_mons, columns=CHARAS_LIST)
88 | predict_result.to_csv(f'results/predict/{model.name}_predict.csv')
89 |
90 | # GC: release RAM memory(model)
91 | del model
92 | gc.collect()
93 | return inference_result
94 |
95 | def model_inference_and_predict_seq2seq(model):
96 | """
97 | Inference and prediction of seq2seq models.
98 | Returns: {model.name}_inference.csv & {model.name}_predict.csv, saved under 'results/'
99 | """
100 | model = model.to('cuda')
101 | mon_list = pd.read_pickle('data/mon_list.pkl')
102 | test_mons = mon_list.loc[(mon_list >= model.test_period[0])]
103 |
104 | if not len(model.omit_char): # no omit characteristics
105 | inference_result = pd.DataFrame()
106 | predict_result = pd.DataFrame()
107 | else:
108 | inference_result = []
109 |
110 | T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')
111 |
112 | stock_index = pd.Series(dtype=np.int64)
113 | for g in T_bar: # rolling train, refit once a year
114 | T_bar.set_postfix({'Year': g[0]})
115 |
116 |
117 | model.release_gpu()
118 | # release GPU memory
119 | for _ in range(6): # call function multiple times to clear the cuda cache
120 | torch.cuda.empty_cache()
121 |
122 | train_loss, val_loss = model.train_model()
123 | # plot loss
124 | plt.plot(train_loss, label='train_loss')
125 | plt.plot(val_loss, label='val_loss')
126 | plt.legend()
127 | plt.savefig(f'results/train_loss/{model.name}_loss_{g[0]}.png')
128 | plt.close()
129 |
130 | for m in g[1].to_list():
131 | m_stock_index, _, _, _ = model._get_item(m)
132 | stock_index = pd.concat([stock_index, pd.Series(m_stock_index)]).drop_duplicates().astype(int)
133 |
134 | if not len(model.omit_char): # no omit characteristics
135 | # move inference_R and predict_R to cpu
136 | inference_R = model.inference(m) # return (N, 1)
137 | inference_R = inference_R.cpu().detach().numpy()
138 | inference_R = pd.DataFrame(inference_R, index=m_stock_index, columns=[m])
139 | inference_result = pd.concat([inference_result.reset_index(drop=True), inference_R.reset_index(drop=True)], axis=1) # (N, T)
140 |
141 | predict_R = model.predict(m) # return (N, 1)
142 | predict_R = predict_R.cpu().detach().numpy()
143 | predict_R = pd.DataFrame(predict_R, index=m_stock_index, columns=[m])
144 | predict_result = pd.concat([predict_result.reset_index(drop=True), predict_R.reset_index(drop=True)], axis=1) # (N, T)
145 |
146 | else:
147 | inference_R = model.inference(m) # return (N, m), m is the length of omit_char
148 | inference_result.append(inference_R) # (T, N, m)
149 |
150 | # refit: change train period and valid period
151 | model.refit()
152 |
153 | if not len(model.omit_char):
154 | inference_result = pd.DataFrame(inference_result.values.T, index=test_mons, columns=CHARAS_LIST)
155 | inference_result.to_csv(f'results/inference/{model.name}_inference.csv')
156 |
157 | predict_result = pd.DataFrame(predict_result.values.T, index=test_mons, columns=CHARAS_LIST)
158 | predict_result.to_csv(f'results/predict/{model.name}_predict.csv')
159 |
160 | # GC: release RAM memory(model)
161 | del model
162 | gc.collect()
163 | return inference_result
164 |
165 |
166 |
167 | def git_push(msg):
168 | os.system('git add R_squares')
169 | os.system(f'git commit -m "{msg}"')
170 | os.system('git push')
171 |
172 |
173 |
174 | def model_selection(model_type, model_K, omit_char=[]):
175 | assert model_type in ['seq2seq1', 'seq2seq3', 'CA2'], f'No such model: {model_type}'
176 |
177 |
178 | #if model_type == 'CA1':
179 | #return {
180 | #'name': f'CA1_{model_K}',
181 | #'omit_char': omit_char,
182 | #'model': CA1(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
183 | #}
184 |
185 | if model_type == 'CA2':
186 | return {
187 | 'name': f'CA2_{model_K}',
188 | 'omit_char': omit_char,
189 | 'model': CA2(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
190 | }
191 | #elif model_type == 'CA3':
192 | #return {
193 | #'name': f'CA3_{model_K}',
194 | #'omit_char': omit_char,
195 | #'model': CA3(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
196 | #}
197 |
198 | #elif model_type == 'seq2seq1':
199 | # return {
200 | #'name': f'seq2seq1_{model_K}',
201 | #'omit_char': omit_char,
202 | #'model': seq2seq1(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
203 | #}
204 | elif model_type == 'seq2seq1':
205 | return {
206 | 'name': f'seq2seq1_{model_K}',
207 | 'omit_char': omit_char,
208 | 'model': seq2seq1(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)
209 | }
210 | else:
211 | return {
212 | 'name': f'seq2seq3_{model_K}',
213 | 'omit_char': omit_char,
214 | 'model': seq2seq3(hidden_size=model_K, dropout=CA_DR, lr=0.01, omit_char=omit_char)
215 | }
216 |
217 |
218 |
219 | if __name__ == "__main__":
220 | parser = argparse.ArgumentParser()
221 | parser.add_argument('--Model', type=str, default='seq2seq1 seq2seq3 CA2')
222 | parser.add_argument('--K', type=str, default='1 2 3 4 5')
223 | parser.add_argument('--omit_char', type=str, default='')
224 |
225 | args = parser.parse_args()
226 |
227 | if 'results' not in os.listdir('./'):
228 | os.mkdir('results')
229 | if 'train_loss' not in os.listdir('./results'):
230 | os.mkdir('results/train_loss')
231 | if 'inference' not in os.listdir('./results'):
232 | os.mkdir('results/inference')
233 | if 'predict' not in os.listdir('./results'):
234 | os.mkdir('results/predict')
235 | if 'imgs' not in os.listdir('./'):
236 | os.mkdir('imgs')
237 |
238 |
239 | models_name = []
240 | R_square = []
241 | for g in product(args.Model.split(' '), args.K.split(' ')):
242 | if isinstance(args.omit_char, str) and len(args.omit_char) > 0:
243 | omit_chars = args.omit_char.split(' ')
244 | else:
245 | omit_chars = []
246 |
247 | model = model_selection(g[0], int(g[1]), omit_chars)
248 |
249 | print(f"{time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())} | Model: {model['name']} | {omit_chars}")
250 | print('name : ', model['name'])
251 | models_name.append(model['name'])
252 |
253 | if model['name'].split('_')[0][:-1] == 'CA':
254 | print('model_inference_and_predict_CA')
255 | # if have omit char, inf_ret (T, N, m)
256 | inf_ret = model_inference_and_predict_CA(model['model'])
257 | else:
258 | print('model_inference_and_predict_seq')
259 | inf_ret = model_inference_and_predict_seq2seq(model['model'])
260 |
261 | gc.collect()
262 |
263 | # Save total R^2
264 | if not len(model['omit_char']):
265 | R_square.append(calculate_R2(model['model'], 'inference'))
266 | alpha_plot(model['model'], 'inference', save_dir='imgs')
267 | # alpha_plot(model['model'], 'predict', save_dir='alpha_imgs')
268 | else:
269 | inf_ret = np.array(inf_ret)
270 | for i in range(len(model['omit_char'])):
271 | inference_r = inf_ret[:, :, i] # T * N
272 | complete_r = inf_ret[:, :, -1]
273 | R_square.append(calculate_R2(None, None, inference_r, complete_r))
274 |
275 | del model
276 |
277 | # save R_square to json
278 | p = time.localtime()
279 | time_str = "{:0>4d}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}".format(p.tm_year, p.tm_mon, p.tm_mday, p.tm_hour, p.tm_min, p.tm_sec)
280 | filename = f"R_squares/{time_str}.json"
281 | obj = {
282 | "models": models_name,
283 | 'omit_char': args.omit_char.split(' ') if len(args.omit_char) else [],
284 | "R2_total": R_square,
285 | }
286 |
287 | with open(filename, "w") as out_file:
288 | json.dump(obj, out_file)
289 |
290 | # git push
291 | # git_push(f"Run main.py")
292 |
293 |
294 |
295 |
--------------------------------------------------------------------------------
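Usage sketch for main.py (a hypothetical invocation; each flag takes a space-separated list and every (Model, K) pair in their product is run, assuming the pickled inputs under data/ exist):

    python main.py --Model "CA2 seq2seq1" --K "1 2 3 4 5"

Each run writes {name}_inference.csv and {name}_predict.csv under results/, per-year loss curves under results/train_loss/, and a timestamped R^2 summary JSON under R_squares/.
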
/models/CA.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import collections
5 | from .modelBase import modelBase
6 | from utils import CHARAS_LIST
7 |
8 | import torch
9 | from torch import nn
10 | from torch.utils.data import Dataset, DataLoader, TensorDataset
11 |
12 |
13 | MAX_EPOCH = 200
14 |
15 | class CA_base(nn.Module, modelBase):
16 | def __init__(self, name, omit_char=[], device='cuda'):
17 | nn.Module.__init__(self)
18 | modelBase.__init__(self, name)
19 | self.beta_nn = None
20 | self.factor_nn = None
21 | self.optimizer = None
22 | self.criterion = None
23 | self.omit_char = omit_char
24 |
25 | self.factor_nn_pred = []
26 |
27 | self.device = device
28 |
29 | self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64)
30 | self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index()
31 | self.portfolio_ret = pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64)
32 | self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64)
33 |
34 | self.train_dataloader = None
35 | self.valid_dataloader = None
36 | self.test_dataloader = None
37 |
38 |
39 | def debug(self, month):
40 | beta_nn_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST]
41 | # beta_nn_input = self.datashare_chara.loc[self.datashare_chara['DATE'] == month].set_index('permno')[charas]
42 | print(beta_nn_input)
43 |
44 |
45 | def _get_item(self, month):
46 | if month not in self.p_charas['DATE'].values:
47 | # find the closest month in p_charas to month
48 | month = self.p_charas['DATE'].values[np.argmin(np.abs(self.p_charas['DATE'].values - month))]
49 |
50 | beta_nn_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST] # (94, 94)
51 | labels = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST].T.values # (94, 1)
52 | beta_nn_input['ret-rf'] = labels
53 | align_df = beta_nn_input.copy(deep=False).dropna()
54 |
55 | factor_nn_input = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST]
56 |
57 | # exit(0) if there is any nan in align_df
58 | if align_df.isnull().values.any():
59 | assert False, f'There is nan in align_df of : {month}'
60 | # return stock index (L), beta_nn_input (94*94=P*N), factor_nn_input (94*1=P*1), labels (94, = N,)
61 | return align_df.index, align_df.values[:, :-1].T, factor_nn_input.T.values, align_df.values[:, -1].T
62 |
63 |
64 | def dataloader(self, period):
65 | mon_list = pd.read_pickle('data/mon_list.pkl')
66 | mon_list = mon_list.loc[(mon_list >= period[0]) & (mon_list <= period[1])]
67 | beta_nn_input_set = []
68 | factor_nn_input_set = []
69 | label_set = []
70 | for mon in mon_list:
71 | _, _beta_input, _factor_input, label = self._get_item(mon)
72 | beta_nn_input_set.append(_beta_input)
73 | factor_nn_input_set.append(_factor_input)
74 | label_set.append(label)
75 |
76 | beta_nn_input_set = torch.tensor(beta_nn_input_set, dtype=torch.float32).to(self.device)
77 | factor_nn_input_set = torch.tensor(factor_nn_input_set, dtype=torch.float32).to(self.device)
78 | label_set = torch.tensor(label_set, dtype=torch.float32).to(self.device)
79 |
80 | dataset = TensorDataset(beta_nn_input_set, factor_nn_input_set, label_set)
81 | return DataLoader(dataset, batch_size=1, shuffle=True)
82 |
83 |
84 | def forward(self, char, pfret):
85 | processed_char = self.beta_nn(char)
86 | processed_pfret = self.factor_nn(pfret)
87 | return torch.sum(processed_char * processed_pfret, dim=1)
88 |
89 |
90 | # train_one_epoch
91 | def __train_one_epoch(self):
92 | epoch_loss = 0.0
93 | for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.train_dataloader):
94 | self.optimizer.zero_grad()
95 | # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
96 | # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
97 | # labels reshape: (1, 94) -> (94, ) (1*N => N,)
98 | beta_nn_input = beta_nn_input.squeeze(0).T
99 | factor_nn_input = factor_nn_input.squeeze(0).T
100 | labels = labels.squeeze(0)
101 | output = self.forward(beta_nn_input, factor_nn_input)
102 | loss = self.criterion(output, labels)
103 |
104 | loss.backward()
105 | self.optimizer.step()
106 | epoch_loss += loss.item()
107 |
108 | if i % 100 == 0:
109 | # print(f'Batches: {i}, loss: {loss.item()}')
110 | pass
111 |
112 | return epoch_loss / len(self.train_dataloader)
113 |
114 |
115 | def __valid_one_epoch(self):
116 | epoch_loss = 0.0
117 | for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.valid_dataloader):
118 | # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
119 | # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
120 | # labels reshape: (1, 94) -> (94, ) (1*N => N,)
121 | beta_nn_input = beta_nn_input.squeeze(0).T
122 | factor_nn_input = factor_nn_input.squeeze(0).T
123 | labels = labels.squeeze(0)
124 |
125 | output = self.forward(beta_nn_input, factor_nn_input)
126 | loss = self.criterion(output, labels)
127 | epoch_loss += loss.item()
128 |
129 | return epoch_loss / len(self.valid_dataloader)
130 |
131 |
132 | def train_model(self):
133 | if 'saved_models' not in os.listdir('./'):
134 | os.mkdir('saved_models')
135 |
136 | self.train_dataloader = self.dataloader(self.train_period)
137 | self.valid_dataloader = self.dataloader(self.valid_period)
138 | self.test_dataloader = self.dataloader(self.test_period)
139 |
140 | min_error = np.inf
141 | no_update_steps = 0
142 | valid_loss = []
143 | train_loss = []
144 | for i in range(MAX_EPOCH):
145 | # print(f'Epoch {i}')
146 | self.train()
147 | train_error = self.__train_one_epoch()
148 | train_loss.append(train_error)
149 |
150 | self.eval()
151 | # valid and early stop
152 | with torch.no_grad():
153 | valid_error = self.__valid_one_epoch()
154 |
155 | valid_loss.append(valid_error)
156 | if valid_error < min_error:
157 | min_error = valid_error
158 | no_update_steps = 0
159 | # save model
160 | torch.save(self.state_dict(), f'./saved_models/{self.name}.pt')
161 | else:
162 | no_update_steps += 1
163 |
164 | if no_update_steps > 2: # early stop if no improvement on the validation set for 3 consecutive epochs
165 | print(f'Early stop at epoch {i}')
166 | break
167 | # load from (best) saved model
168 | self.load_state_dict(torch.load(f'./saved_models/{self.name}.pt'))
169 | return train_loss, valid_loss
170 |
171 |
172 | def test_model(self):
173 | # beta, factor, label = self.test_dataset
174 | # i = np.random.randint(len(beta))
175 | # beta_nn_input = beta[i]
176 | # factor_nn_input = factor[i]
177 | # labels = label[i]
178 | output = None
179 | label = None
180 | for i, (beta_nn_input, factor_nn_input, labels) in enumerate(self.test_dataloader):
181 | # convert to tensor
182 | # beta_nn_input = torch.tensor(beta_nn_input, dtype=torch.float32).T.to(self.device)
183 | # factor_nn_input = torch.tensor(factor_nn_input, dtype=torch.float32).T.to(self.device)
184 | # labels = torch.tensor(labels, dtype=torch.float32).T.to(self.device)
185 | output = self.forward(beta_nn_input, factor_nn_input)
186 | break
187 |
188 | loss = self.criterion(output, labels)
189 | print(f'Test loss: {loss.item()}')
190 | print(f'Predicted: {output}')
191 | print(f'Ground truth: {labels}')
192 | return output, labels
193 |
194 |
195 | def calBeta(self, month, skip_char=[]):
196 | _, beta_nn_input, _, _ = self._get_item(month) # beta input: 94*94 = P*N
197 |
198 | # if some variables need be omitted
199 | if len(skip_char):
200 | beta_nn_input = pd.DataFrame(beta_nn_input.T, columns=CHARAS_LIST) # N*P
201 | beta_nn_input[skip_char] = beta_nn_input[skip_char] * 0.0
202 | beta_nn_input = beta_nn_input.values.T # P*N
203 |
204 | beta_nn_input = torch.tensor(beta_nn_input, dtype=torch.float32).T.to(self.device) # N*P
205 | return self.beta_nn(beta_nn_input) # N*K
206 |
207 |
208 | def calFactor(self, month, skip_char=[]):
209 | _, _, factor_nn_input, _ = self._get_item(month) # factor input: P*1
210 |
211 | # if some variables need be omitted
212 | if len(skip_char):
213 | factor_nn_input = pd.DataFrame(factor_nn_input.T, columns=CHARAS_LIST) # 1*P
214 | factor_nn_input[skip_char] = factor_nn_input[skip_char] * 0.0
215 | factor_nn_input = factor_nn_input.values.T # P*1
216 |
217 | factor_nn_input = torch.tensor(factor_nn_input, dtype=torch.float32).T.to(self.device) # 1*P
218 | factor_pred = self.factor_nn(factor_nn_input).T # K*1
219 |
220 | self.factor_nn_pred.append(factor_pred)
221 |
222 | return factor_pred # K*1
223 |
224 |
225 | def inference(self, month):
226 | if len(self.omit_char) == 0:
227 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"
228 |
229 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
230 |
231 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"
232 |
233 | # R_{N*1} = Beta_{N*K} @ F_{K*1}
234 | return mon_beta @ mon_factor
235 | else:
236 | ret_R = []
237 | for char in self.omit_char:
238 | mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char])
239 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # N*1
240 |
241 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
242 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # also add complete result
243 |
244 | return np.array(ret_R).squeeze(2).T # N*m
245 |
246 |
247 | def cal_delayed_Factor(self, month):
248 | # use the average of factor estimates accumulated from previous refits (up to t-1)
249 | if self.refit_cnt == 0:
250 | avg_f_pred = self.factor_nn_pred[0] # the first prediction uses the current estimate hat{f}_t
251 | # print(avg_f_pred.shape)
252 | else:
253 | avg_f_pred = torch.mean(torch.stack(self.factor_nn_pred[:self.refit_cnt]), dim=0)
254 |
255 | return avg_f_pred
256 |
257 |
258 | def reset_weight(self):
259 | for layer in self.beta_nn: # reset beta_nn parameters
260 | if hasattr(layer, 'reset_parameters'):
261 | layer.reset_parameters()
262 |
263 | for layer in self.factor_nn: # reset factor_nn parameters
264 | if hasattr(layer, 'reset_parameters'):
265 | layer.reset_parameters()
266 |
267 | self.optimizer.state = collections.defaultdict(dict) # reset optimizer state
268 |
269 |
270 | def release_gpu(self):
271 | if self.train_dataloader is not None:
272 | del self.train_dataloader
273 | if self.valid_dataloader is not None:
274 | del self.valid_dataloader
275 | if self.test_dataloader is not None:
276 | del self.test_dataloader
277 | torch.cuda.empty_cache()
278 |
279 |
280 |
281 | class CA0(CA_base):
282 | def __init__(self, hidden_size, lr=0.001, omit_char=[], device='cuda'):
283 | CA_base.__init__(self, name=f'CA0_{hidden_size}', omit_char=omit_char, device=device)
284 | # P -> K
285 | self.beta_nn = nn.Sequential(
286 | # output layer
287 | nn.Linear(94, hidden_size)
288 | )
289 | self.factor_nn = nn.Sequential(
290 | nn.Linear(94, hidden_size)
291 | )
292 |
293 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
294 | self.criterion = nn.MSELoss().to(device)
295 |
296 |
297 |
298 | class CA1(CA_base):
299 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
300 | CA_base.__init__(self, name=f'CA1_{hidden_size}', omit_char=omit_char, device=device)
301 | self.dropout = dropout
302 | # P -> 32 -> K
303 | self.beta_nn = nn.Sequential(
304 | # hidden layer 1
305 | nn.Linear(94, 32),
306 | nn.BatchNorm1d(32),
307 | nn.ReLU(),
308 | nn.Dropout(self.dropout),
309 | # output layer
310 | nn.Linear(32, hidden_size)
311 | )
312 | self.factor_nn = nn.Sequential(
313 | nn.Linear(94, hidden_size)
314 | )
315 |
316 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
317 | self.criterion = nn.MSELoss().to(device)
318 |
319 |
320 |
321 | class CA2(CA_base):
322 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
323 | CA_base.__init__(self, name=f'CA2_{hidden_size}', omit_char=omit_char, device=device)
324 | self.dropout = dropout
325 | # P -> 32 -> 16 -> K
326 | self.beta_nn = nn.Sequential(
327 | # hidden layer 1
328 | nn.Linear(94, 32),
329 | nn.BatchNorm1d(32),
330 | nn.ReLU(),
331 | nn.Dropout(self.dropout),
332 | # hidden layer 2
333 | nn.Linear(32, 16),
334 | nn.BatchNorm1d(16),
335 | nn.ReLU(),
336 | nn.Dropout(self.dropout),
337 | # output layer
338 | nn.Linear(16, hidden_size)
339 | )
340 | self.factor_nn = nn.Sequential(
341 | nn.Linear(94, hidden_size)
342 | )
343 |
344 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
345 | self.criterion = nn.MSELoss().to(device)
346 |
347 |
348 |
349 | class CA3(CA_base):
350 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
351 | CA_base.__init__(self, name=f'CA3_{hidden_size}', omit_char=omit_char, device=device)
352 | self.dropout = dropout
353 | # P -> 32 -> 16 -> 8 -> K
354 | self.beta_nn = nn.Sequential(
355 | # hidden layer 1
356 | nn.Linear(94, 32),
357 | nn.BatchNorm1d(32),
358 | nn.ReLU(),
359 | nn.Dropout(self.dropout),
360 | # hidden layer 2
361 | nn.Linear(32, 16),
362 | nn.BatchNorm1d(16),
363 | nn.ReLU(),
364 | nn.Dropout(self.dropout),
365 | # hidden layer 3
366 | nn.Linear(16, 8),
367 | nn.BatchNorm1d(8),
368 | nn.ReLU(),
369 | nn.Dropout(self.dropout),
370 | # output layer
371 | nn.Linear(8, hidden_size)
372 | )
373 | self.factor_nn = nn.Sequential(
374 | nn.Linear(94, hidden_size)
375 | )
376 |
377 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=0.01)
378 | self.criterion = nn.MSELoss().to(device)
--------------------------------------------------------------------------------
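Shape sketch for CA_base.forward (a minimal standalone example with toy tensors; it avoids the classes above, whose constructors load pickled data, and shows that the elementwise sum is the per-asset form of R = beta @ f):

    import torch
    from torch import nn

    N, P, K = 94, 94, 5                      # assets (managed portfolios), characteristics, factors
    beta_nn = nn.Sequential(nn.Linear(P, 32), nn.ReLU(), nn.Linear(32, K))  # characteristics -> betas
    factor_nn = nn.Linear(P, K)              # portfolio returns -> factors

    char = torch.randn(N, P)                 # beta input, N*P
    pfret = torch.randn(1, P)                # factor input, 1*P
    beta = beta_nn(char)                     # N*K
    f = factor_nn(pfret)                     # 1*K
    r_hat = torch.sum(beta * f, dim=1)       # (N,) fitted returns
    assert torch.allclose(r_hat, (beta @ f.T).squeeze(1), atol=1e-6)
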
/models/IPCA.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | import sys
5 | sys.path.append('../')
6 |
7 | from utils import *
8 | from .modelBase import modelBase
9 |
10 |
11 | class IPCA(modelBase):
12 | def __init__(self, K, omit_char=[]):
13 | super(IPCA, self).__init__(f'IPCA_{K}')
14 | self.K = K
15 | self.omit_char = omit_char
16 | np.random.seed(10)
17 | self.gamma = np.random.random([94, self.K]) # P = 94: 94 characteristics in total
18 | self.valid_error = []
19 | self.__prepare_data()
20 |
21 |
22 | def __prepare_data(self):
23 | self.portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')
24 | self.p_charas = pd.read_pickle('data/p_charas.pkl')
25 | self.mon_list = pd.read_pickle('data/mon_list.pkl')
26 |
27 |
28 | def __valid(self):
29 | MSE_set = []
30 | for mon in self.mon_list[(self.mon_list >= self.valid_period[0]) & (self.mon_list <= self.valid_period[1])]:
31 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P
32 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1
33 | beta = Z @ self.gamma # N * K
34 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1
35 | residual = y - beta @ f_hat
36 | MSE = np.sum(residual**2)
37 | MSE_set.append(MSE)
38 |
39 | valid_error = sum(MSE_set)
40 | self.valid_error.append(valid_error)
41 |
42 | return valid_error
43 |
44 |
45 | def __gamma_iter(self, gamma_old):
46 | numer = np.zeros((94*self.K, 1))
47 | denom = np.zeros((94*self.K, 94*self.K))
48 | for mon in self.mon_list[(self.mon_list >= self.train_period[0]) & (self.mon_list <= self.train_period[1])]:
49 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P
50 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1
51 | beta = Z @ gamma_old # N * K
52 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1
53 | numer += (np.kron(f_hat, Z.T) @ y)
54 | denom += (np.kron(f_hat, Z.T) @ np.kron(f_hat.T, Z))
55 |
56 | gamma_new = (np.linalg.pinv(denom) @ numer).reshape(self.K, 94)
57 | gamma_new = gamma_new.T
58 |
59 | return gamma_new
60 |
61 |
62 | def train_model(self):
63 | update_cnt = 0
64 | min_valid_err = np.inf
65 | best_gamma = np.zeros((94, self.K))
66 | while update_cnt < 5:
67 | self.gamma = self.__gamma_iter(self.gamma)
68 | valid_error = self.__valid()
69 | if valid_error < min_valid_err:
70 | min_valid_err = valid_error
71 | best_gamma = self.gamma
72 | update_cnt = 0
73 | else:
74 | update_cnt += 1
75 |
76 | self.gamma = best_gamma
77 |
78 |
79 | def inference(self, month):
80 | if not len(self.omit_char):
81 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].values # N * P
82 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].values.T # N * 1
83 | beta = Z @ self.gamma # N * K
84 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1
85 | return (beta @ f_hat).flatten() # (N,)
86 | else:
87 | inference_R = []
88 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].copy(deep=False)
89 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].copy(deep=False)
90 |
91 | for char in self.omit_char:
92 | Z_input = Z.copy(deep=False)
93 | y_input = y.copy(deep=False)
94 | Z_input[[char]] = Z_input[[char]] * 0.0
95 | y_input[[char]] = y_input[[char]] * 0.0
96 | Z_input = Z_input.values
97 | y_input = y_input.values.T
98 | beta = Z_input @ self.gamma
99 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y_input) # K * 1
100 | inference_R.append((beta @ f_hat).flatten()) # m * N
101 |
102 | Z_input = Z.values
103 | y_input = y.values.T
104 | beta = Z_input @ self.gamma
105 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y_input) # K * 1
106 | inference_R.append((beta @ f_hat).flatten()) # m * N
107 |
108 | return np.array(inference_R).T # N * m
109 |
110 |
111 | def predict(self, month):
112 | if self.refit_cnt == 0:
113 | return self.inference(month)
114 |
115 | lag_f_hat = []
116 | for mon in self.mon_list[(self.mon_list >= 19870101) & (self.mon_list < month)]:
117 | Z = self.p_charas.loc[self.p_charas.DATE == mon][CHARAS_LIST].values # N * P
118 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == mon][CHARAS_LIST].values.T # N * 1
119 | beta = Z @ self.gamma # N * K
120 | f_hat = np.array(np.matrix(beta.T @ beta).I @ beta.T @ y) # K * 1
121 | lag_f_hat.append(f_hat)
122 |
123 | Z = self.p_charas.loc[self.p_charas.DATE == month][CHARAS_LIST].values # N * P
124 | y = self.portfolio_ret.loc[self.portfolio_ret.DATE == month][CHARAS_LIST].values.T # N * 1
125 | beta = Z @ self.gamma # N * K
126 |
127 | # return average of prevailing sample hat{f} (from 198701) up to t-1
128 | avg_lag_f = np.mean(lag_f_hat, axis=0)
129 | return beta @ avg_lag_f
--------------------------------------------------------------------------------
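The alternating least-squares in IPCA.train_model can be restated compactly (a worked restatement in the code's notation, assuming beta_t' beta_t is invertible; Z_t is the N x P characteristics matrix, y_t the N x 1 returns, Gamma the P x K loadings):

    \beta_t = Z_t \Gamma, \qquad \hat f_t = (\beta_t^\top \beta_t)^{-1} \beta_t^\top y_t

and the Gamma-step solved by __gamma_iter (the np.kron accumulation followed by the pinv) is

    \Big[ \sum_t \hat f_t \hat f_t^\top \otimes Z_t^\top Z_t \Big] \operatorname{vec}(\Gamma) = \sum_t (\hat f_t \otimes Z_t^\top)\, y_t

which follows from the mixed-product identity kron(f, Z') kron(f', Z) = (f f') ⊗ (Z' Z).
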
/models/Seq2Seq:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import collections
5 | from .modelBase import modelBase
6 | from utils import CHARAS_LIST
7 |
8 | from io import open
9 | import unicodedata
10 | import string
11 | import re
12 | import random
13 | from torch.utils.data import Dataset, DataLoader, TensorDataset
14 | import torch
15 | import torch.nn as nn
16 | from torch.autograd import Variable
17 | from torch import optim
18 | import torch.nn.functional as F
19 |
20 | use_cuda = torch.cuda.is_available()
21 |
22 | MAX_EPOCH = 200
23 |
24 | class seq2seq_base(nn.Module, modelBase):
25 | def __init__(self, name, omit_char=[], device='cuda'):
26 | nn.Module.__init__(self)
27 | modelBase.__init__(self, name)
28 | self.beta_seq = None
29 | self.factor_seq = None
30 | self.optimizer = None
31 | self.criterion = None
32 | self.omit_char = omit_char
33 |
34 | self.factor_seq_pred = []
35 |
36 | self.device = device
37 |
38 | self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64)
39 | self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index()
40 | self.portfolio_ret = pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64)
41 | self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64)
42 |
43 | self.train_dataloader = None
44 | self.valid_dataloader = None
45 | self.test_dataloader = None
46 |
47 |
48 | def debug(self, month):
49 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST]
50 | # beta_seq_input = self.datashare_chara.loc[self.datashare_chara['DATE'] == month].set_index('permno')[charas]
51 | print(beta_seq_input)
52 |
53 |
54 | def _get_item(self, month):
55 | if month not in self.p_charas['DATE'].values:
56 | # find the closest month in p_charas to month
57 | month = self.p_charas['DATE'].values[np.argmin(np.abs(self.p_charas['DATE'].values - month))]
58 |
59 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST] # (94, 94)
60 | labels = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST].T.values # (94, 1)
61 | beta_seq_input['ret-rf'] = labels
62 | align_df = beta_seq_input.copy(deep=False).dropna()
63 |
64 | factor_seq_input = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST]
65 |
66 | # exit(0) if there is any nan in align_df
67 | if align_df.isnull().values.any():
68 | assert False, f'There is nan in align_df of : {month}'
69 | # return stock index (L), beta_seq_input (94*94=P*N), factor_seq_input (94*1=P*1), labels (94, = N,)
70 | return align_df.index, align_df.values[:, :-1].T, factor_seq_input.T.values, align_df.values[:, -1].T
71 |
72 |
73 | def dataloader(self, period):
74 | mon_list = pd.read_pickle('data/mon_list.pkl')
75 | mon_list = mon_list.loc[(mon_list >= period[0]) & (mon_list <= period[1])]
76 | beta_seq_input_set = []
77 | factor_seq_input_set = []
78 | label_set = []
79 | for mon in mon_list:
80 | _, _beta_input, _factor_input, label = self._get_item(mon)
81 | beta_seq_input_set.append(_beta_input)
82 | factor_seq_input_set.append(_factor_input)
83 | label_set.append(label)
84 |
85 | beta_seq_input_set = torch.tensor(beta_seq_input_set, dtype=torch.float32).to(self.device)
86 | factor_seq_input_set = torch.tensor(factor_seq_input_set, dtype=torch.float32).to(self.device)
87 | label_set = torch.tensor(label_set, dtype=torch.float32).to(self.device)
88 |
89 | dataset = TensorDataset(beta_seq_input_set, factor_seq_input_set, label_set)
90 | return DataLoader(dataset, batch_size=1, shuffle=True)
91 |
92 |
93 | def forward(self, char, pfret):
94 | processed_char = self.beta_seq(char)
95 |
96 | encoder_hidden = self.encoder_factor_seq.initHidden()
97 | input_length = char.size(0)
98 | encoder_outputs = torch.zeros(input_length, self.encoder_factor_seq.hidden_size, device=self.device)
99 |
100 | for ei in range(input_length):
101 | encoder_output, encoder_hidden = self.encoder_factor_seq(char[ei], encoder_hidden)
102 | encoder_outputs[ei] = encoder_output[0, 0]
103 |
104 | # Factor Seq: Decoding
105 | decoder_input = torch.tensor([[0]], device=self.device) # replace 0 with the start token in use
106 | decoder_hidden = encoder_hidden
107 | decoded_sequence = torch.zeros(input_length, 94, device=self.device) # output size is 94
108 |
109 | for di in range(input_length):
110 | decoder_output, decoder_hidden = self.decoder_factor_seq(decoder_input, decoder_hidden)
111 | decoded_sequence[di] = decoder_output
112 | decoder_input = decoder_output.argmax(1)
113 |
114 | # Now, 'decoded_sequence' can be used as 'factor_seq'
115 | processed_pfret = decoded_sequence
116 | return torch.sum(processed_char * processed_pfret, dim=1)
117 |
118 |
119 | # train_one_epoch
120 | def __train_one_epoch(self):
121 | epoch_loss = 0.0
122 | for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.train_dataloader):
123 | self.optimizer.zero_grad()
124 | # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
125 | # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
126 | # labels reshape: (1, 94) -> (94, ) (1*N => N,)
127 | beta_seq_input = beta_seq_input.squeeze(0).T
128 | factor_seq_input = factor_seq_input.squeeze(0).T
129 | labels = labels.squeeze(0)
130 | output = self.forward(beta_seq_input, factor_seq_input)
131 | loss = self.criterion(output, labels)
132 |
133 | loss.backward()
134 | self.optimizer.step()
135 | epoch_loss += loss.item()
136 |
137 | if i % 100 == 0:
138 | # print(f'Batches: {i}, loss: {loss.item()}')
139 | pass
140 |
141 | return epoch_loss / len(self.train_dataloader)
142 |
143 |
144 | def __valid_one_epoch(self):
145 | epoch_loss = 0.0
146 | for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.valid_dataloader):
147 | # beta_nn_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
148 | # factor_nn_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
149 | # labels reshape: (1, 94) -> (94, ) (1*N => N,)
150 | beta_seq_input = beta_seq_input.squeeze(0).T
151 | factor_seq_input = factor_seq_input.squeeze(0).T
152 | labels = labels.squeeze(0)
153 |
154 | output = self.forward(beta_seq_input, factor_seq_input)
155 | loss = self.criterion(output, labels)
156 | epoch_loss += loss.item()
157 |
158 | return epoch_loss / len(self.valid_dataloader)
159 |
160 |
161 | def train_model(self):
162 | if 'saved_models' not in os.listdir('./'):
163 | os.mkdir('saved_models')
164 |
165 | self.train_dataloader = self.dataloader(self.train_period)
166 | self.valid_dataloader = self.dataloader(self.valid_period)
167 | self.test_dataloader = self.dataloader(self.test_period)
168 |
169 | min_error = np.inf
170 | no_update_steps = 0
171 | valid_loss = []
172 | train_loss = []
173 | for i in range(MAX_EPOCH):
174 | # print(f'Epoch {i}')
175 | self.train()
176 | train_error = self.__train_one_epoch()
177 | train_loss.append(train_error)
178 |
179 | self.eval()
180 | # valid and early stop
181 | with torch.no_grad():
182 | valid_error = self.__valid_one_epoch()
183 |
184 | valid_loss.append(valid_error)
185 | if valid_error < min_error:
186 | min_error = valid_error
187 | no_update_steps = 0
188 | # save model
189 | torch.save(self.state_dict(), f'./saved_models/{self.name}.pt')
190 | else:
191 | no_update_steps += 1
192 |
193 | if no_update_steps > 2: # early stop if no improvement on the validation set for 3 consecutive epochs
194 | print(f'Early stop at epoch {i}')
195 | break
196 | # load from (best) saved model
197 | self.load_state_dict(torch.load(f'./saved_models/{self.name}.pt'))
198 | return train_loss, valid_loss
199 |
200 |
201 | def test_model(self):
202 | # beta, factor, label = self.test_dataset
203 | # i = np.random.randint(len(beta))
204 | # beta_nn_input = beta[i]
205 | # factor_nn_input = factor[i]
206 | # labels = label[i]
207 | output = None
208 | label = None
209 | for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.test_dataloader):
210 | # convert to tensor
211 | # beta_nn_input = torch.tensor(beta_nn_input, dtype=torch.float32).T.to(self.device)
212 | # factor_nn_input = torch.tensor(factor_nn_input, dtype=torch.float32).T.to(self.device)
213 | # labels = torch.tensor(labels, dtype=torch.float32).T.to(self.device)
214 | output = self.forward(beta_seq_input, factor_seq_input)
215 | break
216 |
217 | loss = self.criterion(output, labels)
218 | print(f'Test loss: {loss.item()}')
219 | print(f'Predicted: {output}')
220 | print(f'Ground truth: {labels}')
221 | return output, labels
222 |
223 |
224 | def calBeta(self, month, skip_char=[]):
225 | _, beta_seq_input, _, _ = self._get_item(month) # beta input: 94*94 = P*N
226 |
227 | # if some variables need be omitted
228 | if len(skip_char):
229 | beta_seq_input = pd.DataFrame(beta_seq_input.T, columns=CHARAS_LIST) # N*P
230 | beta_seq_input[skip_char] = beta_seq_input[skip_char] * 0.0
231 | beta_seq_input = beta_seq_input.values.T # P*N
232 |
233 | beta_seq_input = torch.tensor(beta_seq_input, dtype=torch.float32).T.to(self.device) # N*P
234 | return self.beta_seq(beta_seq_input) # N*K
235 |
236 |
237 | def calFactor(self, month, skip_char=[]):
238 | _, _, factor_seq_input, _ = self._get_item(month) # factor input: P*1
239 |
240 | # if some variables need be omitted
241 | if len(skip_char):
242 | factor_seq_input = pd.DataFrame(factor_seq_input.T, columns=CHARAS_LIST) # 1*P
243 | factor_seq_input[skip_char] = factor_seq_input[skip_char] * 0.0
244 | factor_seq_input = factor_seq_input.values.T # P*1
245 |
246 | factor_seq_input = torch.tensor(factor_seq_input, dtype=torch.float32).T.to(self.device) # 1*P
247 | factor_pred = self.factor_seq(factor_seq_input).T # K*1
248 |
249 | self.factor_seq_pred.append(factor_pred)
250 |
251 | return factor_pred # K*1
252 |
253 |
254 | def inference(self, month):
255 | if len(self.omit_char) == 0:
256 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"
257 |
258 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
259 |
260 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"
261 |
262 | # R_{N*1} = Beta_{N*K} @ F_{K*1}
263 | return mon_beta @ mon_factor
264 | else:
265 | ret_R = []
266 | for char in self.omit_char:
267 | mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char])
268 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # N*1
269 |
270 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
271 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # also add complete result
272 |
273 | return np.array(ret_R).squeeze(2).T # N*m
274 |
275 |
276 | def cal_delayed_Factor(self, month):
277 | # use the average of factor estimates accumulated from previous refits (up to t-1)
278 | if self.refit_cnt == 0:
279 | avg_f_pred = self.factor_seq_pred[0] # the first prediction uses the current estimate hat{f}_t
280 | # print(avg_f_pred.shape)
281 | else:
282 | avg_f_pred = torch.mean(torch.stack(self.factor_seq_pred[:self.refit_cnt]), dim=0)
283 |
284 | return avg_f_pred
285 |
286 |
287 | def reset_weight(self):
288 | for layer in self.beta_seq: # reset beta_nn parameters
289 | if hasattr(layer, 'reset_parameters'):
290 | layer.reset_parameters()
291 |
292 | for layer in self.factor_seq: # reset factor_nn parameters
293 | if hasattr(layer, 'reset_parameters'):
294 | layer.reset_parameters()
295 |
296 | self.optimizer.state = collections.defaultdict(dict) # reset optimizer state
297 |
298 |
299 | def release_gpu(self):
300 | if self.train_dataloader is not None:
301 | del self.train_dataloader
302 | if self.valid_dataloader is not None:
303 | del self.valid_dataloader
304 | if self.test_dataloader is not None:
305 | del self.test_dataloader
306 | torch.cuda.empty_cache()
307 |
308 |
309 | class EncoderRNN(nn.Module):
310 | def __init__(self, input_size, hidden_size):
311 | super(EncoderRNN, self).__init__()
312 | self.hidden_size = hidden_size
313 |
314 | self.embedding = nn.Embedding(input_size, hidden_size)
315 | self.gru = nn.GRU(hidden_size, hidden_size)
316 |
317 | def forward(self, input, hidden):
318 | embedded = self.embedding(input).view(1, 1, -1)
319 | output = embedded
320 | output, hidden = self.gru(output, hidden)
321 | return output, hidden
322 |
323 | def initHidden(self):
324 | result = Variable(torch.zeros(1, 1, self.hidden_size))
325 | if use_cuda:
326 | return result.cuda()
327 | else:
328 | return result
329 |
330 | class DecoderRNN(nn.Module):
331 | def __init__(self, hidden_size, output_size):
332 | super(DecoderRNN, self).__init__()
333 | self.hidden_size = hidden_size
334 |
335 | self.embedding = nn.Embedding(output_size, hidden_size)
336 | self.gru = nn.GRU(hidden_size, hidden_size)
337 | self.out = nn.Linear(hidden_size, output_size)
338 | self.softmax = nn.LogSoftmax(dim=1)
339 |
340 | def forward(self, input, hidden):
341 | output = self.embedding(input).view(1, 1, -1)
342 | output = F.relu(output)
343 | output, hidden = self.gru(output, hidden)
344 | output = self.softmax(self.out(output[0]))
345 | return output, hidden
346 |
347 | def initHidden(self):
348 | result = Variable(torch.zeros(1, 1, self.hidden_size))
349 | if use_cuda:
350 | return result.cuda()
351 | else:
352 | return result
353 |
354 |
355 |
356 |
357 | class seq2seq0(seq2seq_base):
358 | def __init__(self, hidden_size, lr=0.001, omit_char=[], device='cuda'):
359 | seq2seq_base.__init__(self, name=f'seq2seq0_{hidden_size}', omit_char=omit_char, device=device)
360 | # P -> K
361 | self.beta_seq = nn.Sequential(
362 | # output layer
363 | nn.Linear(94, hidden_size)
364 | )
365 | # Initialize the encoder and decoder for factor_seq
366 | self.encoder_factor_seq = EncoderRNN(94, hidden_size).to(device)
367 | self.decoder_factor_seq = DecoderRNN(hidden_size, 94).to(device) # output size is 94
368 |
369 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
370 | self.criterion = nn.MSELoss().to(device)
371 |
372 |
373 |
374 | class seq2seq1(seq2seq_base):
375 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
376 | seq2seq_base.__init__(self, name=f'seq2seq1_{hidden_size}', omit_char=omit_char, device=device)
377 | self.dropout = dropout
378 | # P -> 32 -> K
379 | self.beta_seq = nn.Sequential(
380 | # hidden layer 1
381 | nn.Linear(94, 32),
382 | nn.BatchNorm1d(32),
383 | nn.ReLU(),
384 | nn.Dropout(self.dropout),
385 | # output layer
386 | nn.Linear(32, hidden_size)
387 | )
388 | self.encoder_factor_seq = EncoderRNN(94, hidden_size).to(device)
389 | self.decoder_factor_seq = DecoderRNN(hidden_size, 94).to(device)
390 |
391 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
392 | self.criterion = nn.MSELoss().to(device)
393 |
394 |
395 | ### NOTE: self.factor_seq should be replaced by self.decoder_factor_seq here
396 |
397 | class seq2seq2(seq2seq_base):
398 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
399 | seq2seq_base.__init__(self, name=f'seq2seq2_{hidden_size}', omit_char=omit_char, device=device)
400 | self.dropout = dropout
401 | # P -> 32 -> 16 -> K
402 | self.beta_seq = nn.Sequential(  # was self.beta_nn; the base class forward/reset_weight expect beta_seq
403 | # hidden layer 1
404 | nn.Linear(94, 32),
405 | nn.BatchNorm1d(32),
406 | nn.ReLU(),
407 | nn.Dropout(self.dropout),
408 | # hidden layer 2
409 | nn.Linear(32, 16),
410 | nn.BatchNorm1d(16),
411 | nn.ReLU(),
412 | nn.Dropout(self.dropout),
413 | # output layer
414 | nn.Linear(16, hidden_size)
415 | )
416 | self.encoder_factor_seq = EncoderRNN(94, hidden_size).to(device)
417 | self.decoder_factor_seq = DecoderRNN(hidden_size, 94).to(device)
418 |
419 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
420 | self.criterion = nn.MSELoss().to(device)
421 |
422 |
423 |
424 | class seq2seq3(seq2seq_base):
425 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
426 | seq2seq_base.__init__(self, name=f'seq2seq3_{hidden_size}', omit_char=omit_char, device=device)
427 | self.dropout = dropout
428 | # P -> 32 -> 16 -> 8 -> K
429 | self.beta_seq = nn.Sequential(
430 | # hidden layer 1
431 | nn.Linear(94, 32),
432 | nn.BatchNorm1d(32),
433 | nn.ReLU(),
434 | nn.Dropout(self.dropout),
435 | # hidden layer 2
436 | nn.Linear(32, 16),
437 | nn.BatchNorm1d(16),
438 | nn.ReLU(),
439 | nn.Dropout(self.dropout),
440 | # hidden layer 3
441 | nn.Linear(16, 8),
442 | nn.BatchNorm1d(8),
443 | nn.ReLU(),
444 | nn.Dropout(self.dropout),
445 | # output layer
446 | nn.Linear(8, hidden_size)
447 | )
448 | self.encoder_factor_seq = EncoderRNN(94, hidden_size).to(device)
449 | self.decoder_factor_seq = DecoderRNN(hidden_size, 94).to(device)
450 |
451 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=0.01)
452 | self.criterion = nn.MSELoss().to(device)
453 |
--------------------------------------------------------------------------------
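Note on this file: EncoderRNN and DecoderRNN both start with nn.Embedding, which requires integer token indices, while seq2seq_base.forward feeds them continuous characteristic/return rows; the note at line 395 and the plain factor_seq module in models/seq.py below suggest this path was superseded. A minimal embedding-free alternative for continuous monthly inputs (an assumed design sketch, not the repo's code):

    import torch
    from torch import nn

    P, K, T = 94, 8, 12                           # characteristics, hidden size, months
    gru = nn.GRU(input_size=P, hidden_size=K, batch_first=True)  # consumes float features directly

    x = torch.randn(1, T, P)                      # one batch of T months of continuous inputs
    out, h = gru(x)                               # out: (1, T, K); h: (1, 1, K)
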
/models/modelBase.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import datetime
4 | from dateutil.relativedelta import relativedelta
5 |
6 | class modelBase:
7 | def __init__(self, name):
8 | self.name = name
9 | self.train_idx = 0
10 | self.refit_cnt = 0
11 |
12 | # initial train, valid and test periods default to those in the original paper
13 | self.train_period = [19570101, 19741231]
14 | self.valid_period = [19750101, 19861231]
15 | self.test_period = [19870101, 19871231]
16 |
17 |
18 | def train_model(self):
19 | # print('trained')
20 | pass
21 |
22 |
23 | def calBeta(self, month):
24 | """
25 | Calculate specific month's beta. Should be specified by different models
26 | -> return np.array, dim = (N, K)
27 | """
28 | # return np.zeros([13000, 3])
29 | pass
30 |
31 |
32 | def calFactor(self, month):
33 | """
34 | Calculate specific month's factor. Should be specified by different models
35 | -> return np.array, dim = (K, 1)
36 | """
37 | # return np.zeros([3, 1])
38 | pass
39 |
40 |
41 | def cal_delayed_Factor(self, month):
42 | """
43 | Calculate the delayed factor, i.e. the average of factors up to t-1. Should be specified by different models
44 | -> return np.array, dim = (K, 1)
45 | """
46 | pass
47 |
48 |
49 | def inference(self, month):
50 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"
51 |
52 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
53 |
54 | assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"
55 |
56 | # R_{N*1} = Beta_{N*K} @ F_{K*1}
57 | return mon_beta @ mon_factor
58 |
59 |
60 | def predict(self, month):
61 | assert month >= self.test_period[0] and month <= self.test_period[1], f"Month error, {month} is not in test period {self.test_period}"
62 |
63 | lag_factor, mon_beta = self.cal_delayed_Factor(month), self.calBeta(month)
64 |
65 | assert mon_beta.shape[1] == lag_factor.shape[0], f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}"
66 |
67 | # R_{N*1} = Beta_{N*K} @ lag_F_avg{K*1}
68 | return mon_beta @ lag_factor
69 |
70 |
71 | def refit(self):
72 | # self.train_period[1] += 10000 # method in the original paper: expand the training window by one year at each refit
73 | self.train_period = (pd.Series(self.train_period) + 10000).to_list() # rolling training
74 | self.valid_period = (pd.Series(self.valid_period) + 10000).to_list()
75 | self.test_period = (pd.Series(self.test_period) + 10000).to_list()
76 | self.refit_cnt += 1
77 |
78 |
--------------------------------------------------------------------------------
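Because the periods are yyyymmdd integers, the +10000 in refit() rolls every boundary forward exactly one calendar year; for example:

    import pandas as pd

    train_period = [19570101, 19741231]
    train_period = (pd.Series(train_period) + 10000).to_list()
    print(train_period)   # [19580101, 19751231] -- the whole window shifts by one year
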
/models/seq.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import collections
5 | from .modelBase import modelBase
6 | from utils import CHARAS_LIST
7 |
8 | from io import open
9 | import torch
10 | import torch.nn as nn
11 | from torch.autograd import Variable
12 | from torch import optim
13 | import torch.nn.functional as F
14 | from torch.utils.data import Dataset, DataLoader, TensorDataset
15 |
16 | use_cuda = torch.cuda.is_available()
17 |
18 | MAX_EPOCH = 200
19 |
20 |
21 | class seq2seq_base(nn.Module, modelBase):
22 | def __init__(self, name, omit_char=[], device='cuda'):
23 | nn.Module.__init__(self)
24 | modelBase.__init__(self, name)
25 | self.beta_seq = None
26 | self.factor_seq = None
27 | self.optimizer = None
28 | self.criterion = None
29 | self.omit_char = omit_char
30 |
31 | self.factor_seq_pred = []
32 |
33 | self.device = device
34 |
35 | self.datashare_chara = pd.read_pickle('./data/datashare_re.pkl').astype(np.float64)
36 | self.p_charas = pd.read_pickle('./data/p_charas.pkl').astype(np.float64).reset_index()
37 | self.portfolio_ret = pd.read_pickle('./data/portfolio_ret.pkl').astype(np.float64)
38 | self.mon_ret = pd.read_pickle('./data/month_ret.pkl').astype(np.float64)
39 |
40 | self.train_dataloader = None
41 | self.valid_dataloader = None
42 | self.test_dataloader = None
43 |
44 | def debug(self, month):
45 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST]
46 | # beta_seq_input = self.datashare_chara.loc[self.datashare_chara['DATE'] == month].set_index('permno')[charas]
47 | print(beta_seq_input)
48 |
49 | def _get_item(self, month):
50 | if month not in self.p_charas['DATE'].values:
51 | # find the closest month in p_charas to month
52 | month = self.p_charas['DATE'].values[np.argmin(np.abs(self.p_charas['DATE'].values - month))]
53 |
54 | beta_seq_input = self.p_charas.loc[self.p_charas['DATE'] == month][CHARAS_LIST] # (94, 94)
55 | labels = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST].T.values # (94, 1)
56 | beta_seq_input['ret-rf'] = labels
57 | align_df = beta_seq_input.copy(deep=False).dropna()
58 |
59 | factor_seq_input = self.portfolio_ret.loc[self.portfolio_ret['DATE'] == month][CHARAS_LIST]
60 |
61 | # exit(0) if there is any nan in align_df
62 | if align_df.isnull().values.any():
63 | assert False, f'There is nan in align_df of : {month}'
64 | # return stock index (L), beta_seq_input (94*94=P*N), factor_seq_input (94*1=P*1), labels (94, = N,)
65 | return align_df.index, align_df.values[:, :-1].T, factor_seq_input.T.values, align_df.values[:, -1].T
66 |
67 | def dataloader(self, period):
68 | mon_list = pd.read_pickle('data/mon_list.pkl')
69 | mon_list = mon_list.loc[(mon_list >= period[0]) & (mon_list <= period[1])]
70 | beta_seq_input_set = []
71 | factor_seq_input_set = []
72 | label_set = []
73 | for mon in mon_list:
74 | _, _beta_input, _factor_input, label = self._get_item(mon)
75 | beta_seq_input_set.append(_beta_input)
76 | factor_seq_input_set.append(_factor_input)
77 | label_set.append(label)
78 |
79 | beta_seq_input_set = torch.tensor(beta_seq_input_set, dtype=torch.float32).to(self.device)
80 | factor_seq_input_set = torch.tensor(factor_seq_input_set, dtype=torch.float32).to(self.device)
81 | label_set = torch.tensor(label_set, dtype=torch.float32).to(self.device)
82 |
83 | dataset = TensorDataset(beta_seq_input_set, factor_seq_input_set, label_set)
84 | return DataLoader(dataset, batch_size=1, shuffle=True)
85 |
86 | def forward(self, char, pfret):
87 | processed_char = self.beta_seq(char)
88 | # print(processed_char.shape)
89 | decoded_sequence = self.factor_seq(pfret)
90 |
91 | # Now, 'decoded_sequence' can be used as 'factor_seq'
92 | processed_pfret = decoded_sequence
93 |
94 | # return torch.sum(processed_char * processed_pfret, dim=1)
95 | return torch.mm(processed_char, processed_pfret)
96 |
97 | # train_one_epoch
98 | def __train_one_epoch(self):
99 | epoch_loss = 0.0
100 | self.train()
101 | for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.train_dataloader):
102 |
103 |             # beta_seq_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
104 |             # factor_seq_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
105 |             # labels reshape: (1, 94) -> (94,) (1*N => N,)
106 | beta_seq_input = beta_seq_input.squeeze(0).T
107 | factor_seq_input = factor_seq_input.squeeze(0).T
108 | labels = labels.squeeze(0)
109 | output = self.forward(beta_seq_input, factor_seq_input)
110 | loss = self.criterion(output, labels)
111 |
112 | self.optimizer.zero_grad()
113 | loss.backward()
114 | self.optimizer.step()
115 | epoch_loss += loss.item()
116 |
117 |             # optional batch-level logging (disabled)
118 |             # if i % 100 == 0:
119 |             #     print(f'Batches: {i}, loss: {loss.item()}')
120 |
121 | return epoch_loss / len(self.train_dataloader)
122 |
123 | def __valid_one_epoch(self):
124 | epoch_loss = 0.0
125 | for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.valid_dataloader):
126 |             # beta_seq_input reshape: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
127 |             # factor_seq_input reshape: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
128 |             # labels reshape: (1, 94) -> (94,) (1*N => N,)
129 | beta_seq_input = beta_seq_input.squeeze(0).T
130 | factor_seq_input = factor_seq_input.squeeze(0).T
131 | labels = labels.squeeze(0)
132 |
133 | output = self.forward(beta_seq_input, factor_seq_input)
134 | loss = self.criterion(output, labels)
135 | epoch_loss += loss.item()
136 |
137 | return epoch_loss / len(self.valid_dataloader)
138 |
139 | def train_model(self):
140 | if 'saved_models' not in os.listdir('./'):
141 | os.mkdir('saved_models')
142 |
143 | self.train_dataloader = self.dataloader(self.train_period)
144 | self.valid_dataloader = self.dataloader(self.valid_period)
145 | self.test_dataloader = self.dataloader(self.test_period)
146 |
147 |         min_error = np.inf  # np.Inf was removed in NumPy 2.0
148 | no_update_steps = 0
149 | valid_loss = []
150 | train_loss = []
151 | for i in range(MAX_EPOCH):
152 | # print(f'Epoch {i}')
153 | train_error = self.__train_one_epoch()
154 | train_loss.append(train_error)
155 |
156 | self.eval()
157 | # valid and early stop
158 | with torch.no_grad():
159 | valid_error = self.__valid_one_epoch()
160 |
161 | valid_loss.append(valid_error)
162 | if valid_error < min_error:
163 | min_error = valid_error
164 | no_update_steps = 0
165 | # save model
166 | torch.save(self.state_dict(), f'./saved_models/{self.name}.pt')
167 | else:
168 | no_update_steps += 1
169 |
170 |             if no_update_steps > 2:  # early stop after 3 consecutive epochs with no improvement on the validation set
171 | print(f'Early stop at epoch {i}')
172 | break
173 | # load from (best) saved model
174 | self.load_state_dict(torch.load(f'./saved_models/{self.name}.pt'))
175 | return train_loss, valid_loss
176 |
177 |     def test_model(self):
178 |         # quick smoke test: run the model on a single batch from the
179 |         # test set and report the loss against the ground truth
180 |         output = None
181 |         labels = None
182 |         for i, (beta_seq_input, factor_seq_input, labels) in enumerate(self.test_dataloader):
183 |             # same reshaping as in training:
184 |             # beta_seq_input: (1, 94, 94) -> (94, 94) (1*P*N => N*P)
185 |             # factor_seq_input: (1, 94, 1) -> (1, 94) (1*P*1 => 1*P)
186 |             # labels: (1, 94) -> (94,) (1*N => N,)
187 |             beta_seq_input = beta_seq_input.squeeze(0).T
188 |             factor_seq_input = factor_seq_input.squeeze(0).T
189 |             labels = labels.squeeze(0)
190 |             output = self.forward(beta_seq_input, factor_seq_input)
191 |             break
192 |
193 |         loss = self.criterion(output, labels)
194 |         print(f'Test loss: {loss.item()}')
195 |         print(f'Predicted: {output}')
196 |         print(f'Ground truth: {labels}')
197 |         return output, labels
198 |
199 | def calBeta(self, month, skip_char=[]):
200 | _, beta_seq_input, _, _ = self._get_item(month) # beta input: 94*94 = P*N
201 |
202 |         # if some characteristics need to be omitted, zero them out
203 | if len(skip_char):
204 | beta_seq_input = pd.DataFrame(beta_seq_input.T, columns=CHARAS_LIST) # N*P
205 | beta_seq_input[skip_char] = beta_seq_input[skip_char] * 0.0
206 | beta_seq_input = beta_seq_input.values.T # P*N
207 |
208 | beta_seq_input = torch.tensor(beta_seq_input, dtype=torch.float32).T.to(self.device) # N*P
209 | return self.beta_seq(beta_seq_input) # N*K
210 |
211 | def calFactor(self, month, skip_char=[]):
212 | _, _, factor_seq_input, _ = self._get_item(month) # factor input: P*1
213 |
214 |         # if some characteristics need to be omitted, zero them out
215 | if len(skip_char):
216 | factor_seq_input = pd.DataFrame(factor_seq_input.T, columns=CHARAS_LIST) # 1*P
217 | factor_seq_input[skip_char] = factor_seq_input[skip_char] * 0.0
218 | factor_seq_input = factor_seq_input.values.T # P*1
219 |
220 | factor_seq_input = torch.tensor(factor_seq_input, dtype=torch.float32).T.to(self.device) # 1*P
221 | factor_pred = self.factor_seq(factor_seq_input) # K*1
222 |
223 | self.factor_seq_pred.append(factor_pred)
224 |
225 | return factor_pred # K*1
226 |
227 | def inference(self, month):
228 | if len(self.omit_char) == 0:
229 | assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"
230 |
231 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
232 |
233 |             assert mon_beta.shape[1] == mon_factor.shape[0], \
234 |                 f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"
235 |
236 | # R_{N*1} = Beta_{N*K} @ F_{K*1}
237 | return mon_beta @ mon_factor
238 | else:
239 | ret_R = []
240 | for char in self.omit_char:
241 | mon_factor, mon_beta = self.calFactor(month, [char]), self.calBeta(month, [char])
242 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # N*1
243 |
244 | mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
245 | ret_R.append((mon_beta @ mon_factor).cpu().detach().numpy()) # also add complete result
246 |
247 | return np.array(ret_R).squeeze(2).T # N*m
248 |
249 |     def cal_delayed_Factor(self, month):
250 |         # delayed factor: the average of the factors predicted so far
251 |         if self.refit_cnt == 0:
252 |             avg_f_pred = self.factor_seq_pred[0]  # before the first refit, reuse \hat{f}_t itself
253 |             # print(avg_f_pred.shape)
254 | else:
255 | avg_f_pred = torch.mean(torch.stack(self.factor_seq_pred[:self.refit_cnt]), dim=0)
256 |
257 | return avg_f_pred
258 |
259 |     def reset_weight(self):
260 |         for layer in self.beta_seq:  # reset beta_seq parameters
261 |             if hasattr(layer, 'reset_parameters'):
262 |                 layer.reset_parameters()
263 |
264 |         for layer in self.factor_seq.modules():  # FactorSeq is not a Sequential, so walk its submodules
265 |             if hasattr(layer, 'reset_parameters'):
266 |                 layer.reset_parameters()
267 |
268 |         self.optimizer.state = collections.defaultdict(dict)  # reset optimizer state
269 |
270 | def release_gpu(self):
271 | if self.train_dataloader is not None:
272 | del self.train_dataloader
273 | if self.valid_dataloader is not None:
274 | del self.valid_dataloader
275 | if self.test_dataloader is not None:
276 | del self.test_dataloader
277 | torch.cuda.empty_cache()
278 |
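# A minimal sketch (hypothetical helper, not called by the pipeline) of the
# delayed-factor rule in cal_delayed_Factor above: before the first refit the
# first predicted factor is reused, afterwards the factors predicted so far are averaged.
def _demo_delayed_factor():
    preds = [torch.full((8, 1), float(t)) for t in range(3)]  # fake K*1 factor predictions
    refit_cnt = 2
    if refit_cnt == 0:
        avg = preds[0]
    else:
        avg = torch.mean(torch.stack(preds[:refit_cnt]), dim=0)
    assert torch.allclose(avg, torch.full((8, 1), 0.5))  # mean of 0.0 and 1.0
    return avg
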
279 |
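# A shape-algebra sketch for seq2seq_base.forward (illustrative only; N, P, K
# follow the comments in the class above, where the 94 managed portfolios double
# as test assets, so N = P = 94 and beta_seq outputs K = 94 loadings):
def _demo_forward_shapes():
    N, K = 94, 94
    beta = torch.randn(N, K)    # stand-in for self.beta_seq(char), N*K
    factor = torch.randn(K, 1)  # stand-in for self.factor_seq(pfret), K*1
    ret = torch.mm(beta, factor).squeeze(-1)  # R = Beta @ F, squeezed to (N,)
    assert ret.shape == (N,)
    return ret

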
280 | class EncoderRNN(nn.Module):
281 | def __init__(self, input_size, hidden_size):
282 | super(EncoderRNN, self).__init__()
283 | self.input_size = input_size
284 | self.hidden_size = hidden_size
285 |
286 | self.embedding = nn.Linear(input_size, hidden_size)
287 | self.relu = nn.ReLU()
288 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
289 |
290 |     def forward(self, input, hidden):
291 |         # treat each scalar of the length-l sequence as a 1-dim feature
292 |         b, l = input.shape
293 |         input = input.reshape(b, l, 1)
294 | embedded = self.embedding(input)
295 | embedded = self.relu(embedded)
296 | output, hidden = self.gru(embedded, hidden)
297 |
298 | return output, hidden
299 |
300 |     def initHidden(self):
301 |         # a plain zeros tensor suffices; the Variable wrapper is deprecated
302 |         result = torch.zeros(1, 1, self.hidden_size)
303 |         if use_cuda:
304 |             return result.cuda()
305 |         else:
306 |             return result
307 |
308 |
309 | class DecoderRNN(nn.Module):
310 | def __init__(self, hidden_size, output_size):
311 | super(DecoderRNN, self).__init__()
312 | self.hidden_size = hidden_size
313 |
314 | # self.embedding = nn.Linear(hidden_size, hidden_size)
315 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
316 | self.relu = nn.ReLU()
317 | self.out = nn.Linear(hidden_size, output_size)
318 | self.act = nn.Sigmoid()
319 |
320 | def forward(self, input, hidden):
321 | output, hidden = self.gru(input, hidden)
322 | output = self.relu(output)
323 | output = self.out(output[0])
324 | output = self.act(output)
325 | return output, hidden
326 |
327 |     def initHidden(self):
328 |         result = torch.zeros(1, 1, self.hidden_size)  # Variable wrapper is deprecated
329 |         if use_cuda:
330 |             return result.cuda()
331 |         else:
332 |             return result
333 |
334 |
335 | class FactorSeq(nn.Module):
336 | def __init__(self, input_size, hidden_size, device='cuda'):
337 | super(FactorSeq, self).__init__()
338 | self.encoder = EncoderRNN(input_size, hidden_size)
339 | self.decoder = DecoderRNN(hidden_size, input_size)
340 | self.hidden_size = hidden_size
341 | self.device = device
342 |
343 | def forward(self, input):
344 | encoder_hidden = self.encoder.initHidden()
345 |
346 | encoder_output, encoder_hidden = self.encoder(input, encoder_hidden)
347 | decoder_output, _ = self.decoder(encoder_output, encoder_hidden)
348 |         decoded_sequence = decoder_output  # (l, output_size), i.e. (94, 1) for a 1*94 input
349 | # print(decoded_sequence.shape)
350 | # decoder_input = decoder_output.argmax(1)
351 |
352 | return decoded_sequence
353 |
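# A shape walk-through of the encoder-decoder pair (hypothetical helper, not
# part of the pipeline): a 1*P vector of portfolio returns is reshaped to
# (batch=1, seq_len=P, feature=1), embedded and encoded by the GRU, then decoded
# back; DecoderRNN applies self.out to output[0], so the result is (P, 1).
def _demo_factor_seq_shapes(hidden_size=8):
    device = 'cuda' if use_cuda else 'cpu'
    fs = FactorSeq(input_size=1, hidden_size=hidden_size, device=device).to(device)
    pfret = torch.randn(1, 94, device=device)  # 1*P portfolio returns
    decoded = fs(pfret)
    assert decoded.shape == (94, 1)
    return decoded
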
354 |
355 | class seq2seq0(seq2seq_base):
356 |     def __init__(self, hidden_size, lr=0.001, omit_char=[], device='cuda'):
357 |         seq2seq_base.__init__(self, name=f'seq2seq0_{hidden_size}', omit_char=omit_char, device=device)
358 |         # P -> K (single linear layer); K is fixed at 94, as in the deeper variants
359 |         K = 94
360 |         self.beta_seq = nn.Sequential(
361 |             # output layer
362 |             nn.Linear(94, K)
363 |         )
364 |         # self.factor_seq must be assigned (forward() calls it); reuse FactorSeq
365 |         self.factor_seq = FactorSeq(input_size=1, hidden_size=hidden_size)
366 |
367 |         self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
368 |         self.criterion = nn.MSELoss().to(device)
369 |
370 |
371 | class seq2seq1(seq2seq_base):
372 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
373 | seq2seq_base.__init__(self, name=f'seq2seq1_{hidden_size}', omit_char=omit_char, device=device)
374 | self.hidden_size = hidden_size
375 | self.dropout = dropout
376 | # P -> 32 -> K
377 | K = 94
378 | self.beta_seq = nn.Sequential(
379 | # hidden layer 1
380 | nn.Linear(94, 32),
381 | nn.BatchNorm1d(32),
382 | nn.ReLU(),
383 | nn.Dropout(self.dropout),
384 | # output layer
385 | nn.Linear(32, K)
386 | )
387 | self.factor_seq = FactorSeq(input_size=1, hidden_size=hidden_size)
388 |
389 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
390 | self.criterion = nn.MSELoss().to(device)
391 |
392 |
393 | ### NOTE (translated from Chinese): self.factor_seq needed to be replaced/assigned in seq2seq0; handled above by reusing FactorSeq
394 |
395 | class seq2seq2(seq2seq_base):
396 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
397 | seq2seq_base.__init__(self, name=f'seq2seq2_{hidden_size}', omit_char=omit_char, device=device)
398 | self.hidden_size = hidden_size
399 | self.dropout = dropout
400 | # P -> 32 -> 16 -> K
401 | K = 94
402 | self.beta_seq = nn.Sequential(
403 | # hidden layer 1
404 | nn.Linear(94, 32),
405 | nn.BatchNorm1d(32),
406 | nn.ReLU(),
407 | nn.Dropout(self.dropout),
408 | # hidden layer 2
409 | nn.Linear(32, 16),
410 | nn.BatchNorm1d(16),
411 | nn.ReLU(),
412 | nn.Dropout(self.dropout),
413 | # output layer
414 | nn.Linear(16, K)
415 | )
416 | self.factor_seq = FactorSeq(input_size=1, hidden_size=hidden_size)
417 |
418 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
419 | self.criterion = nn.MSELoss().to(device)
420 |
421 |
422 | class seq2seq3(seq2seq_base):
423 | def __init__(self, hidden_size, dropout=0.5, lr=0.001, omit_char=[], device='cuda'):
424 | seq2seq_base.__init__(self, name=f'seq2seq3_{hidden_size}', omit_char=omit_char, device=device)
425 | self.hidden_size = hidden_size
426 | self.dropout = dropout
427 | # P -> 32 -> 16 -> 8 -> K
428 | K = 94
429 | self.beta_seq = nn.Sequential(
430 | # hidden layer 1
431 | nn.Linear(94, 32),
432 | nn.BatchNorm1d(32),
433 | nn.ReLU(),
434 | nn.Dropout(self.dropout),
435 | # hidden layer 2
436 | nn.Linear(32, 16),
437 | nn.BatchNorm1d(16),
438 | nn.ReLU(),
439 | nn.Dropout(self.dropout),
440 | # hidden layer 3
441 | nn.Linear(16, 8),
442 | nn.BatchNorm1d(8),
443 | nn.ReLU(),
444 | nn.Dropout(self.dropout),
445 | # output layer
446 | nn.Linear(8, K)
447 | )
448 | self.factor_seq = FactorSeq(input_size=1, hidden_size=hidden_size)
449 |
450 | self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=0.01)
451 | self.criterion = nn.MSELoss().to(device)
452 |
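
# A hedged end-to-end sketch, assuming modelBase (models/Seq2Seq/modelBase.py)
# provides train_period / valid_period / test_period and refit_cnt, and that the
# pickled inputs under ./data exist; illustrative only, not part of the pipeline.
if __name__ == '__main__':
    device = 'cuda' if use_cuda else 'cpu'
    model = seq2seq1(hidden_size=8, device=device).to(device)
    train_loss, valid_loss = model.train_model()
    R_hat = model.inference(model.test_period[0])  # N*1 fitted portfolio returns
    model.release_gpu()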
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 |
4 | # # stock-level characteristics with index corresponding to original paper
5 | # annual_chara = {
6 | # 'absacc': 1, 'acc': 2, 'age': 4, 'agr': 5, 'bm': 9,
7 | # 'bm_ia': 10, 'cashdebt': 12, 'cashpr': 13, 'cfp': 14, 'cfp_ia': 15,
8 | # 'chatoia': 16, 'chcsho': 17, 'chempia': 18, 'chinv': 19, 'chpmia': 21,
9 | # 'convind': 24, 'currat': 25, 'depr': 26, 'divi': 27, 'divo': 28,
10 | # 'dy': 30, 'egr': 32, 'ep': 33, 'gma': 34, 'grcapx': 35,
11 | # 'grltnoa': 36, 'herf': 37, 'hire': 38, 'invest': 42, 'lev': 43,
12 | # 'lgr': 44, 'mve_ia': 52, 'operprof': 54, 'orgcap': 55, 'pchcapx_ia': 56,
13 | # 'pchcurrat': 57, 'pchdepr': 58, 'pchgm_pchsale': 59, 'pchquick': 60, 'pchsale_pchinvt': 61,
14 | # 'pchsale_pchrect': 62, 'pchsale_pchxsga': 63, 'pchsaleinv': 64, 'pctacc': 65, 'ps': 67,
15 | # 'quick': 68, 'rd': 69, 'rd_mve': 70, 'rd_sale': 71, 'realestate': 72,
16 | # 'roic': 77, 'salecash': 79, 'saleinv': 80, 'salerec': 81, 'secured': 82,
17 | # 'securedind': 83, 'sgr': 84, 'sin': 85, 'sp': 86, 'tang': 91, 'tb': 92
18 | # }
19 |
20 | # quarter_chara = {
21 | # 'aeavol': 3, 'cash': 11, 'chtx': 22, 'cinvest': 23,
22 | # 'ear': 31, 'ms': 50, 'nincr': 53, 'roaq': 74,
23 | # 'roavol': 75, 'roeq': 76, 'rsup': 78, 'stdacc': 89, 'stdcf': 90
24 | # }
25 |
26 | # month_chara = {
27 | # 'baspread': 6, 'beta': 7, 'betasq': 8, 'chmom': 20,
28 | # 'dolvol': 29, 'idiovol': 39, 'ill': 40, 'indmom': 41,
29 | # 'maxret': 45, 'mom12m': 46, 'mom1m': 47, 'mom36m': 48,
30 | # 'mom6m': 49, 'mvel1': 51, 'pricedelay': 66, 'retvol': 73,
31 | # 'std_dolvol': 87, 'std_turn': 88, 'turn': 93, 'zerotrade': 94
32 | # }
33 |
34 | CHARAS_LIST = ['absacc','acc','age','agr','bm','bm_ia','cashdebt','cashpr','cfp','cfp_ia','chatoia','chcsho','chempia','chinv','chpmia','convind','currat','depr','divi','divo','dy','egr','ep','gma','grcapx','grltnoa','herf','hire','invest','lev','lgr','mve_ia','operprof','orgcap','pchcapx_ia','pchcurrat','pchdepr','pchgm_pchsale','pchquick','pchsale_pchinvt','pchsale_pchrect','pchsale_pchxsga','pchsaleinv','pctacc','ps','quick','rd','rd_mve','rd_sale','realestate','roic','salecash','saleinv','salerec','secured','securedind','sgr','sin','sp','tang','tb','aeavol','cash','chtx','cinvest','ear','ms','nincr','roaq','roavol','roeq','rsup','stdacc','stdcf','baspread','beta','betasq','chmom','dolvol','idiovol','ill','indmom','maxret','mom12m','mom1m','mom36m','mom6m','mvel1','pricedelay','retvol','std_dolvol','std_turn','turn','zerotrade']
35 |
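# sanity check: 61 annual + 13 quarterly + 20 monthly characteristics from the
# index tables above give the 94 characteristics used throughout the code
assert len(CHARAS_LIST) == 94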
36 |
37 | # default hyper-parameters of the CA models
38 | CA_DR = 0.5  # dropout rate
39 | CA_LR = 0.001  # learning rate
40 |
41 | # out-of-sample period (dates stored as YYYYMMDD integers)
42 | OOS_start = 19870101
43 | OOS_end = 20161231
44 |
45 |
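# Example of applying these bounds (hypothetical helper; dates are YYYYMMDD
# integers, so plain integer comparison works, mirroring the month filtering
# in models/Seq2Seq/seq.py):
def _demo_oos_filter():
    import pandas as pd  # local import; utils.py does not otherwise need pandas
    mon_list = pd.read_pickle('data/mon_list.pkl')
    return mon_list[(mon_list >= OOS_start) & (mon_list <= OOS_end)]
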
46 |
47 | class HiddenPrints:
48 | def __init__(self, activated=True):
49 | self.activated = activated
50 | self.original_stdout = None
51 |
52 |     def open(self):  # restore the original stdout
53 |         sys.stdout.close()
54 |         sys.stdout = self.original_stdout
55 |
56 |     def close(self):  # silence prints by redirecting stdout to os.devnull
57 |         self.original_stdout = sys.stdout
58 |         sys.stdout = open(os.devnull, 'w')
59 |
60 | def __enter__(self):
61 | if self.activated:
62 | self.close()
63 |
64 | def __exit__(self, exc_type, exc_val, exc_tb):
65 | if self.activated:
66 | self.open()
67 |
68 |
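# Usage sketch (hypothetical helper, not called anywhere): prints inside the
# context manager are swallowed, prints outside are shown as usual.
def _demo_hidden_prints():
    with HiddenPrints(activated=True):
        print('this line is swallowed')
    print('this line is shown')
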
69 |
70 | def git_push(message):
71 | os.system('git add results')
72 | os.system(f'git commit -m "no_dropout: {message}"')
73 | os.system('git push')
--------------------------------------------------------------------------------