├── 聚类分析.R
├── LM_Optim.R
├── PCA_Optim.R
├── SVM_Optim.R
├── wufenwei.R
├── Cluster_Optim.R
├── EWeight_OPtim.R
├── PCA_Selection.R
├── Pics
    ├── MWeighted.png
    ├── EqualWeighted.png
    └── EqualWeighted2.png
├── factor_performance.R
├── factor_return_plot.R
├── .gitattributes
├── README.md
├── impact.R
├── .gitignore
├── redo_factor1.R
├── diy_factor.R
├── factor_return.R
├── 等权重打分.R
└── PartII_ChoosingTheStocks.R


/聚类分析.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/聚类分析.R


--------------------------------------------------------------------------------
/LM_Optim.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/LM_Optim.R


--------------------------------------------------------------------------------
/PCA_Optim.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/PCA_Optim.R


--------------------------------------------------------------------------------
/SVM_Optim.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/SVM_Optim.R


--------------------------------------------------------------------------------
/wufenwei.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/wufenwei.R


--------------------------------------------------------------------------------
/Cluster_Optim.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/Cluster_Optim.R


--------------------------------------------------------------------------------
/EWeight_OPtim.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/EWeight_OPtim.R


--------------------------------------------------------------------------------
/PCA_Selection.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/PCA_Selection.R


--------------------------------------------------------------------------------
/Pics/MWeighted.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/Pics/MWeighted.png


--------------------------------------------------------------------------------
/Pics/EqualWeighted.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/Pics/EqualWeighted.png


--------------------------------------------------------------------------------
/factor_performance.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/factor_performance.R


--------------------------------------------------------------------------------
/factor_return_plot.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/factor_return_plot.R


--------------------------------------------------------------------------------
/Pics/EqualWeighted2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/HEAD/Pics/EqualWeighted2.png


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Machine-learning-on-Stocks-Selection
 2 | 机器学习选股模型
 3 | 利用多种数据挖掘方法，建立Alpha多因子选股中性策略。效果其实还好嘛……
 4 | 
 5 | ![Alt text](https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/master/Pics/EqualWeighted.png)
 6 | 
 7 | ![Alt text](https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/master/Pics/EqualWeighted2.png)
 8 | 
 9 | ![Alt text](https://raw.githubusercontent.com/pyhong/Machine-learning-on-Stocks-Selection/master/Pics/MWeighted.png)
10 | 
11 | 线性回归结果这么好也是吓了一跳，重新最小化风险组合下，支持向量回归结果也还可以嘛。
12 | 但是选股结果无法用传统经济金融意义去理解，确实很难去说服别人。
13 | PS：BP神经网络效果永远都是差的不得了，干脆把代码删掉好了（手动再见）。
14 | 个人感觉除非去到深度学习里面的各种神经网络，不然BPNN这种坑爹货在哪里都比不上SVM、random forest，甚至连线性regularization都比不上。
15 | 


--------------------------------------------------------------------------------
/impact.R:
--------------------------------------------------------------------------------
 1 | library(WindR)
 2 | w.start()
 3 | code<-w.wset('IndexConstituent','date=20151105;windcode=000300.SH')$Data$wind_code
 4 | 
 5 | #估计冲击成本函数
 6 | lm.sol<-lapply(code,function(x)
 7 | {
 8 |   tp<-w_wsd_data<-w.wsd(x,"pct_chg,volume","2014-01-01","2014-12-31")$Data
 9 |   chg<-w_wsd_data$PCT_CHG[-1]
10 |   vch<-w_wsd_data$VOLUME[-1]-w_wsd_data$VOLUME[-nrow(w_wsd_data)]
11 |   r2<-0
12 |   i<-0
13 |   fit<-NULL
14 |   try({for(k in seq(0.2,2,by=0.2))
15 |   {
16 |     r2.old<-r2
17 |     i.old<-i
18 |     i<-k
19 |     lmsol<-lm(chg~I(vch^k)+0)
20 |     r2<-summary(lmsol)$adj.r.squared
21 |     if(r2<r2.old) {r2<-r2.old;i<-i.old}
22 |   }
23 |   fit<-lm(chg~I(vch^i)+0)
24 |   })
25 |   return(fit)
26 | })
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 


--------------------------------------------------------------------------------
/redo_factor1.R:
--------------------------------------------------------------------------------
 1 | dtime<-c("20100331", "20100630", "20100930", "20101231", "20110331", "20110630", "20110930", "20111230", "20120330", "20120629", "20120928", "20121231", "20130329", "20130628", "20130930", "20131231", "20140331", "20140630", "20140930", "20141231", "20150331", "20150630")
 2 | factor_dir <- dir("试题/附录2：300支股票对应的财务指标",full.names=T,pattern="*.csv")
 3 | factor_dir_name <- dir("试题/附录2：300支股票对应的财务指标",full.names=F,pattern="*.csv")
 4 | factor_list <- lapply(factor_dir,function(fl){read.csv(fl, header = T, stringsAsFactors = F)})
 5 | tp <- stock_log
 6 | stock_log<-stock_log[as.numeric(stock_log$date)>20100930)&as.numeric(stock_log$date)<=20150630),]
 7 | what<<-1
 8 | lapply(factor_list,function(x)
 9 |   {
10 |     tmp_LReturn <- stock_log
11 |     x$date <- format(as.Date(x$date),"%Y%m%d")
12 |     index <- match(dtime,x$date)
13 |     x <- x[index,]
14 |     x <- x[as.numeric(x$date)<=20140930]
15 |     x[is.na(x)]<--Inf
16 |     for(i in 2:ncol(x))
17 |       {
18 |         j <- 1
19 |         while(j<=(nrow(x)-1))
20 |         {
21 |          count <- 1
22 |          while(x[j,i]==x[j+count,i]&x[j,i]!=-Inf&(j+count<=nrow(x)))
23 |          {
24 |            x[j+count,i] <- -Inf
25 |             count <- count+1
26 |          }
27 |          j <- count+j
28 |         }
29 |       }
30 |     for(i in 1:(nrow(x)-1))
31 |       {
32 |         index<-stock_log$date>=x$date[i]&stock_log$date<x$date[i+1]
33 |         tmp_LReturn[index,-1] <- tmp_LReturn[index,order(x[i,-1],decreasing=T)+1]
34 |       }
35 |     write.csv(tmp_LReturn,paste("好困/",factor_dir_name[what],sep=""),row.names=F)
36 |     what <<- what+1
37 |     }
38 |   )


--------------------------------------------------------------------------------
/diy_factor.R:
--------------------------------------------------------------------------------
 1 | library(WindR)
 2 | w.start()
 3 | code300<-w.wset('IndexConstituent','date=20151108;windcode=000300.SH')$Data$wind_code
 4 | w_wsd_data<-w.wsd(code300,"close","2010-04-01","2015-06-30","Period=W")$Data
 5 | View(w_wsd_data)
 6 | write.csv(w_wsd_data,"hs300周收盘价.csv",row.names=F)
 7 | w_wsd_data_after <- w_wsd_data[-1,-1]
 8 | w_wsd_data_forward <- w_wsd_data[-nrow(w_wsd_data),-1]
 9 | LReturnWeek <- w_wsd_data[-1,]
10 | LReturnWeek[,-1]<- log(w_wsd_data_after/w_wsd_data_forward)
11 | write.csv(LReturnWeek,"hs300周对数收益率.csv",row.names=F)
12 | w_wsd_data<-w.wsd(code300,"close","2010-04-01","2015-06-30","Period=M")$Data
13 | write.csv(w_wsd_data,"hs300月收盘价.csv",row.names=F)
14 | w_wsd_data_after <- w_wsd_data[-1,-1]
15 | w_wsd_data_forward <- w_wsd_data[-nrow(w_wsd_data),-1]
16 | LReturnMonth <- w_wsd_data[-1,]
17 | LReturnMonth[,-1]<- log(w_wsd_data_after/w_wsd_data_forward)
18 | write.csv(LReturnMonth,"hs300月对数收益率.csv",row.names=F)
19 | 
20 | #Begin to sort monthly log return by factors
21 | factor_dir <- dir("试题/自找数据/factor",full.names=T,pattern="*.csv")
22 | factor_dir_name <- dir("试题/自找数据/factor",full.names=F,pattern="*.csv")
23 | factor_list <- lapply(factor_dir,function(fl)
24 |   {
25 |     fltmp <- read.csv(fl, header = F, stringsAsFactors = F,skip = 3)
26 |     names <- read.csv(fl, header = F, stringsAsFactors = F, nrows = 1)
27 |     names[,1] <- "date"
28 |     names[,-1] <- substr(names[1,-1],1,6)
29 |     colnames(fltmp) <- names 
30 |     fltmp$date <- format(as.Date(fltmp$date),"%Y%m%d")
31 |     fltmp
32 |   })
33 | stock_name <- substr(code300,1,6)
34 | stock_log <- LReturnMonth
35 | colnames(stock_log) <- c("date",stock_name)
36 | stock_log$date <- format(as.Date(stock_log$date),"%Y%m%d")
37 | 
38 | for(j in 1:length(factor_list))
39 |   {
40 |       x <- factor_list[[j]]
41 |       x <- x[as.numeric(x$date)<=20150531,]
42 |       rank_x <- apply(x[,-1],1,function(y){order(y,decreasing=T)})
43 |       tmp_LReturn <- stock_log
44 |       tmp_LReturn <- tmp_LReturn[as.numeric(tmp_LReturn$date)<=20150631,]
45 |       for(i in 1:ncol(rank_x))
46 |       {
47 |         tmp_LReturn[i,-1]<-tmp_LReturn[i,rank_x[,i]+1]
48 |       }
49 |       write.csv(tmp_LReturn,paste(factor_dir_name[j],sep=""),row.names=F)
50 |       print(j)
51 |   }
52 | 


--------------------------------------------------------------------------------
/factor_return.R:
--------------------------------------------------------------------------------
 1 | factor_dir <- dir("试题/附录2：300支股票对应的财务指标",full.names=T,pattern="*.csv")
 2 | factor_dir_name <- dir("试题/附录2：300支股票对应的财务指标",full.names=F,pattern="*.csv")
 3 | factor_list <- lapply(factor_dir,function(fl){read.csv(fl, header = T, stringsAsFactors = F)})
 4 | stock_dir <- dir("试题/附录3：沪深300成分股日线数据",full.names=T,pattern="*.txt")
 5 | stock_date <<- NULL
 6 | day.begin.factor <- 20100331
 7 | day.end.factor <- 20141230
 8 | day.begin <- 20100630
 9 | day.end <- 20141231
10 | stock_list <- lapply(stock_dir,function(fl)
11 |   {
12 |     tt <- read.table(fl,skip=2,header=F,sep=",",fill=T,stringsAsFactors=F)[,c(1,5)]
13 |     tt <- tt[-nrow(tt),]
14 |     colnames(tt) <- c("date","close")
15 |     tt[,1 ]<- as.numeric(tt[,1])
16 |     tt <- tt[(tt$date>=day.begin&tt$date<=day.end),]
17 |     stock_date <<- union(stock_date,tt[,1])
18 |     return(tt)
19 |   })
20 | stock_date <- sort(stock_date)
21 | stock_name <- substr(stock_dir,nchar(stock_dir)-9,nchar(stock_dir)-4)
22 | stock_return <- data.frame(matrix(NA,nrow=length(stock_date),ncol=length(stock_name)+1))
23 | colnames(stock_return) <- c("date",stock_name)
24 | stock_return[,1] <- stock_date
25 | stock_date_index <- rep(1,length(stock_name))
26 | list_index <<-2
27 | lapply(stock_list,function(x)
28 |   {
29 |     index <- match(x$date,stock_date)
30 |     stock_return[index,list_index] <<- x[,2]
31 |     list_index <<- list_index + 1
32 |   })
33 | stock_log <- stock_return[-1,]
34 | stock_log[,-1] <-log(stock_return[-1,-1]/stock_return[-nrow(stock_return),-1])
35 | date_1 <- stock_date[-length(stock_date)]
36 | date_2 <- stock_date[-1]
37 | popup_date <- c(stock_date[1],date_1[which(substr(date_1,5,6)!=substr(date_2,5,6))-1])
38 | popup_date <- c(day.begin.factor,popup_date[c(1:length(popup_date)-1)%%3==0],day.end.factor)
39 | popup_date <- data.frame(d1=popup_date[-length(popup_date)],d2=popup_date[-1],d3=c(popup_date[-(1:2)],Inf))
40 | #计算因子收益的函数
41 | factor_return <<- data.frame(matrix(NA,nrow=length(stock_date),ncol=length(stock_name)+1))
42 | factor_stockrank <<- data.frame(matrix(NA,nrow=length(stock_date),ncol=length(stock_name)+1))
43 | rank.of.stock <- match(stock_name,substr(colnames(factor_list[[1]]),2,7)[-1])
44 | factor_list<-lapply(factor_list,function(x)
45 | {
46 |   x<-x[,c(1,rank.of.stock+1)]
47 |   x[,1]<-as.numeric(format(as.Date(x$date),"%Y%m%d"))
48 |   return(x)
49 | })
50 | name_i <<- 1
51 | lapply(factor_list,function(x)
52 | {
53 |   stock_log_tmp <<- stock_log
54 |   apply(popup_date,1,function(y)
55 |   {
56 |     x <- x[x$date==y[1],]
57 |     x <- x[-1]
58 |     x_rank <- rank(x)
59 |     index_tmp <- (stock_log_tmp$date>=y[2])&(stock_log_tmp$date<=y[3])
60 |     if(length(index_tmp)>0)
61 |     {
62 |       tmp_log <- stock_log_tmp[index_tmp,]
63 |       tmp_log <- tmp_log[,c(1,x_rank+1)]
64 |       stock_log_tmp[index_tmp,] <<- tmp_log
65 |     }
66 |   })
67 |   write.csv(stock_log_tmp,paste("output/1/","未分五档",factor_dir_name[name_i],sep=""),row.names=F)
68 |   name_i <<- name_i+1
69 | })
70 | 
71 | 


--------------------------------------------------------------------------------
/等权重打分.R:
--------------------------------------------------------------------------------
 1 | load("E:/金融建模/data/data_for_trainging_and_prediction.RData")
 2 | 
 3 | # 提取数据
 4 | S1 <- FactorList2[[1]] # PB市净率
 5 | S2 <- FactorList2[[2]] # PCF市现率
 6 | S3 <- FactorList2[[4]] # PS市销率
 7 | S4 <- FactorList2[[6]] # 换手率
 8 | S5 <- FactorList2[[10]] # 市盈率TTM
 9 | S6 <- FactorList2[[14]] #总市值
10 | 
11 | S6list <- list() #将S6的数据修改成数值
12 | for(i in 1:301){
13 |     S6list[[i]] <- do.call(rbind,lapply(strsplit(S6[,i],','), paste, collapse = ''))
14 | }
15 | temp <- do.call(cbind, S6list)
16 | temp <- as.data.frame(temp)
17 | colnames(temp) <- colnames(S6)
18 | S6 <- temp 
19 | 
20 | F1 <- FactorList1[[6]] # 每股经营活动产生的现金流量净额
21 | F2 <- FactorList1[[9]] # 每股收益EPS
22 | F3 <- FactorList1[[12]] # 权益乘数
23 | F4 <- FactorList1[[16]] # 资产负债率
24 | F5 <- FactorList1[[17]] # 总负债（同比增长率）
25 | 
26 | colnames(S1) <- paste("X", colnames(S1), sep = '')
27 | colnames(S2) <- paste("X", colnames(S2), sep = '')
28 | colnames(S3) <- paste("X", colnames(S3), sep = '')
29 | colnames(S4) <- paste("X", colnames(S4), sep = '')
30 | colnames(S5) <- paste("X", colnames(S5), sep = '')
31 | colnames(S6) <- paste("X", colnames(S6), sep = '')
32 | 
33 | # 获取2015年上半年所需因子数据
34 | FacName <- c(paste('S', c(1:6), sep = ''), paste('F', c(1:5), sep = ''))
35 | for(i in 1:11){
36 |     assign(FacName[i], get(FacName[i])[53:58,]) 
37 | }
38 | 
# Standardise the data.
#
# dt: data frame whose first column identifies the row (a date) and whose
#     remaining columns hold one stock each.
# Returns a data frame with one row per stock: the first column holds the
# stock names (the column names of `dt`), followed by one column per row of
# `dt` containing that row's values z-scored across stocks.
DataScale <- function(dt){
    # drop = FALSE keeps a data frame even when `dt` has a single stock column.
    dtt <- t(dt[, -1, drop = FALSE])
    temp <- apply(dtt, 2, function(x){scale(as.numeric(x))})
    # With a single stock apply() returns a plain vector; restore the
    # stocks-by-dates layout.
    if (!is.matrix(temp)) {
        temp <- matrix(temp, nrow = nrow(dtt),
                       dimnames = list(NULL, colnames(dtt)))
    }
    cbind(row.names(dtt), as.data.frame(temp))
}
47 | 
# Standardise every factor table in place.
for (i in seq_along(FacName)) {
    scaled <- DataScale(get(FacName[i]))
    assign(FacName[i], scaled)
}

# Merge the data: the S* tables are bound as they are, the F* tables are first
# put into the row order of S1 by matching their first columns.
MaIndex <- match(S1[,1], F1[,1])
market_part <- cbind(S1, S2[,-1], S3[,-1], S4[,-1], S5[,-1], S6[,-1])
temp <- cbind(market_part,
              F1[MaIndex,-1], F2[MaIndex,-1], F3[MaIndex,-1],
              F4[MaIndex,-1], F5[MaIndex,-1])

# Each factor contributes six consecutive columns; `Index` points at the
# first of each group.
Index <- seq(from = 2, to = 67, 6)
fac1 <- temp[, c(1, Index + 0)] # factor exposures for the January 2015 portfolio
fac2 <- temp[, c(1, Index + 1)] # factor exposures for the February 2015 portfolio
fac3 <- temp[, c(1, Index + 2)]
fac4 <- temp[, c(1, Index + 3)]
fac5 <- temp[, c(1, Index + 4)]
fac6 <- temp[, c(1, Index + 5)]
65 | 
# Equal-weight scoring and ranking.
#
# Fdata:    data frame whose first column identifies the stock and whose
#           remaining columns are factor values.
# n:        number of top-scoring stocks to return (default 50).
# positive: positions in `Fdata` of the factors that add to the score; every
#           other factor column is subtracted (default columns 8 and 9).
# Returns a character vector of length `n` with the identifiers of the
# highest-scoring stocks, best first. Stocks with a missing score sort last;
# if `Fdata` has fewer than `n` rows the result is padded with NA.
Rank <- function(Fdata, n = 50, positive = c(8, 9)){
    gains <- rowSums(Fdata[, positive, drop = FALSE])
    penalties <- rowSums(Fdata[, -c(1, positive), drop = FALSE])
    Score <- gains - penalties
    best <- order(Score, decreasing = TRUE)[seq_len(n)]
    as.character(Fdata[best, 1])
}
72 | 
# Evaluate the equal-weight portfolios: pick the stocks for each of the six
# months once, look up their values in columns 53 to 58 of the transposed
# return table, and plot the cumulative mean.
TELReturn <- t(ELReturn)
Picks <- lapply(list(fac1, fac2, fac3, fac4, fac5, fac6), Rank)

ELReturn1 <- TELReturn[match(Picks[[1]], row.names(TELReturn)), 53]
ELReturn2 <- TELReturn[match(Picks[[2]], row.names(TELReturn)), 54]
ELReturn3 <- TELReturn[match(Picks[[3]], row.names(TELReturn)), 55]
ELReturn4 <- TELReturn[match(Picks[[4]], row.names(TELReturn)), 56]
ELReturn5 <- TELReturn[match(Picks[[5]], row.names(TELReturn)), 57]
ELReturn6 <- TELReturn[match(Picks[[6]], row.names(TELReturn)), 58]

# Missing returns are ignored in every month, so a single NA cannot wipe out
# the cumulative curve.
ELR <- vapply(
  list(ELReturn1, ELReturn2, ELReturn3, ELReturn4, ELReturn5, ELReturn6),
  function(r) mean(as.numeric(r), na.rm = TRUE),
  numeric(1)
)
plot(cumsum(ELR) * 100, type = "l")

# One column of selected stocks per month.
set <- as.data.frame(do.call(cbind, Picks))
write.csv(set, "等权重方法选股结果.csv")
92 | 


--------------------------------------------------------------------------------
/PartII_ChoosingTheStocks.R:
--------------------------------------------------------------------------------
  1 | #回归法选股
  2 | #SVR和LM
  3 | 
  4 | #Get the list of components of HS300
  5 | # library(WindR)
  6 | # w.start()
  7 | # w_wset_data <- w.wset('SectorConstituent','date=20151108;windcode=000300.SH')$Data
  8 | # write.csv(w_wset_data,"HS300成份股.csv",row.names=F)
  9 | 
 10 | #Get the relative log return to HS300
 11 | MonthLogReturn <- read.csv("hs300月对数收益率.csv",head=T)
 12 | MonthLogReturn[,-1] <- apply(MonthLogReturn[,-1],c(1,2),function(x){ifelse(x==0,x<-NA,x)})
 13 | StockName<-substr(colnames(MonthLogReturn)[-1],2,7)
 14 | # HS300Return <- w.wsd("000300.SH","pct_chg","2010-05-31","2015-06-30","Period=M")$Data
 15 | # HS300LogReturn <- HS300Return
 16 | # HS300LogReturn[,-1] <- log(1+HS300Return[,-1]/100)
 17 | # write.csv(HS300LogReturn,"HS300指数对数收益率.csv",row.names=F)
 18 | HS300LogReturn <- read.csv("HS300指数对数收益率.csv",head=T,stringsAsFactors=F)
 19 | ELReturn <- MonthLogReturn
 20 | ELReturn[,-1] <- apply(MonthLogReturn[,-1],2,function(x){x-HS300LogReturn[,-1]})
 21 | ELReturn <- ELReturn[as.numeric(format(as.Date(ELReturn$DATETIME),"%Y%m%d"))>=20100930,]
 22 | #Get the factor list
 23 | stockdate<-c("20100331","20100430",format(as.Date(HS300LogReturn$DATETIME),"%Y%m%d"))
 24 | stockdate1<-stockdate[as.numeric(stockdate)<=20141231]
 25 | stockdate2<-stockdate[as.numeric(stockdate)>=20100831&as.numeric(stockdate)<=20150531]
 26 | FactorDir1 <- dir("试题/附录2：300支股票对应的财务指标/",full.names=T,pattern=".csv")
 27 | FactorName1 <- dir("试题/附录2：300支股票对应的财务指标/",full.names=F,pattern=".csv")
 28 | FactorList1 <- lapply(FactorDir1,function(x)
 29 |   {
 30 |     tplist <- read.csv(x,head=T)
 31 |     tplist$date<-format(as.Date(tplist$date),"%Y%m%d")
 32 |     tplist <- tplist[match(stockdate1,tplist$date),]
 33 |     return(tplist)
 34 |   })
 35 | FactorDir2 <- dir("试题/自找数据/factor/",full.names=T)
 36 | FactorName2 <- dir("试题/自找数据/factor/",full.names=F)
 37 | FactorList2 <- lapply(FactorDir2,function(x)
 38 |   {
 39 |     tplist <- read.csv(x, header = F, stringsAsFactors = F,skip = 3)
 40 |     names <- read.csv(x, header = F, stringsAsFactors = F, nrows = 1)
 41 |     names[1]<-"date"
 42 |     colnames(tplist)<-names
 43 |     tplist$date<-format(as.Date(tplist$date),"%Y%m%d")
 44 |     tplist <- tplist[match(stockdate2,tplist$date),]
 45 |     return(tplist)
 46 |   })
 47 | 
 48 | FinalTable <- NULL
 49 | for(i in 2:ncol(ELReturn))
 50 | {
 51 |   index <- as.numeric(format(as.Date(ELReturn$DATETIME),"%Y%m%d"))<=20141231
 52 |   TmpTable <- matrix(ELReturn[index,i],ncol=1)
 53 |   for(tp in FactorList1)  {tp<-tp[index,c(1,match(colnames(ELReturn[,-1]),colnames(tp[,-1]))+1)];TmpTable <- cbind(TmpTable,tp[,i])}
 54 |   for(tp in FactorList2)  {tp<-tp[index,c(1,match(colnames(ELReturn[,-1]),paste("X",colnames(tp[,-1]),sep=""))+1)];TmpTable <- cbind(TmpTable,tp[,i])}
 55 |   FinalTable <- rbind(FinalTable,TmpTable)
 56 | }
 57 | 
 58 | EffectiveIndex <- c(1,22,23,25,27,7,10,13,31,17,18,35) #这个是对着那个excel表选出来
 59 | 
 60 | PredictDate <- stockdate[stockdate>20141231]
 61 | for(i in PredictDate)
 62 | {
 63 |   index <- as.numeric(format(as.Date(ELReturn$DATETIME),"%Y%m%d"))==i
 64 |   TmpTable <- ELReturn[index,]
 65 |   names <- colnames(TmpTable)
 66 |   for(tp in FactorList1) 
 67 |   {
 68 |     tp <- tp[index,c(1,match(colnames(ELReturn[,-1]),colnames(tp[,-1]))+1)];
 69 |     colnames(tp) <- names;
 70 |     TmpTable <- rbind(TmpTable,tp,deparse.level = 0);
 71 |   }
 72 |   for(tp in FactorList2) 
 73 |   {
 74 |     tp <- tp[index,c(1,match(colnames(ELReturn[,-1]),paste("X",colnames(tp[,-1]),sep=""))+1)];
 75 |     colnames(tp) <- names;
 76 |     TmpTable <- rbind(TmpTable,tp,deparse.level = 0);
 77 |   }
 78 |   names <- c("超额对数收益率",substr(FactorName1,1,nchar(FactorName1)-4),substr(FactorName2,1,nchar(FactorName2)-4))
 79 |   rownames(TmpTable) <- names
 80 |   TmpTable <- TmpTable[EffectiveIndex,]
 81 |   write.csv(t(TmpTable),paste("DataForRegression_Predict/",i,".csv",sep=""))
 82 | }
 83 | 
 84 | names <- c("超额对数收益率",substr(FactorName1,1,nchar(FactorName1)-4),substr(FactorName2,1,nchar(FactorName2)-4))
 85 | colnames(FinalTable) <- names
 86 | 
 87 | #Choose the effective factors
 88 | 
 89 | EffectiveTable <- FinalTable[,c(1,EffectiveIndex)]
 90 | # EffectiveTable <- FinalTable
 91 | EffectiveTable <- EffectiveTable[complete.cases(EffectiveTable),]
 92 | write.csv(EffectiveTable,"DataForRegression_Train.csv",row.names=F)
 93 | 
 94 | 
 95 | EffectiveTable <- read.csv("DataForRegression_Train.csv",stringsAsFactor=F,head=T)
 96 | colmeans <- colMeans(EffectiveTable)
 97 | colsd <- apply(EffectiveTable,2,sd)
 98 | 
 99 | EffectiveTable <- scale(EffectiveTable,center=T,scale=T)
100 | EffectiveTable <- data.frame(EffectiveTable)
101 | 
102 | PredictDir <- dir("DataForRegression_Predict/",full.names = T)
103 | PredictTable <- lapply(PredictDir,function(x)
104 |   {
105 |     tp <- read.csv(x,head=F,skip=2,stringsAsFactors=F)
106 |     names <- read.csv(x,head=F,nrows=1,stringsAsFactors=F)
107 |     names[1,1]<-"code"
108 |     colnames(tp)<-names[1,]
109 |     tp
110 |   })
111 | 
112 | 
113 | 
114 | 
115 | #Linear Regression
116 | lm.sol <- lm(EffectiveTable$超额对数收益率 ~ ., data=EffectiveTable)
117 | RegressionResult.lm <- predict(lm.sol,newdata=EffectiveTable[,-1]) 
118 | plot(RegressionResult.lm,EffectiveTable$超额对数收益率)
119 | names<-colnames(EffectiveTable)
120 | lmStock <<- data.frame(matrix(0,ncol=6,nrow=50))
121 | count <<- 1
122 | PReturn<-sapply(PredictTable,function(x)
123 |   {
124 |     names<-colnames(EffectiveTable)
125 |     x<-x[complete.cases(x),]
126 |     y<-(x[,-(1:2)]-colmeans[-1])/colsd[-1]
127 |     y<-x[,-(1:2)]
128 |     colnames(y)<-names[-1]
129 |     pre<-predict(lm.sol,newdata=y)
130 |     index<-order(pre,decreasing=T)[1:50]
131 |     lmStock[,count]<<-substr(x[index,1],2,10)
132 |     count <<- count+1
133 |     mean(x[index,2],na.rm=T)
134 |   })
135 | write.csv(lmStock,"回归结果/lm_stock.csv",row.names=F)
136 | result.lm <- cbind(PReturn,cumsum(PReturn))
137 | colnames(result.lm)<-c("每月超额收益率","累积超额收益率")
138 | write.csv(result.lm,"回归结果/lm_return.csv")
139 | plot(cumsum(PReturn),type="l",col="red",xlab="2015年前六月",ylab="累计收益率",main="线性回归")
140 | #SVM
141 | if(!require(e1071)) {install.packages("e1071");library(e1071)}
142 | svm.sol<-svm(EffectiveTable$超额对数收益率~.,data=EffectiveTable,cost=1.4)
143 | RegressionResult.svm <- predict(svm.sol,EffectiveTable[,2:ncol(EffectiveTable)])
144 | plot(RegressionResult.svm,EffectiveTable$超额对数收益率)
145 | svmStock <<- data.frame(matrix(0,ncol=6,nrow=50))
146 | count <<- 1
147 | PReturn<-sapply(PredictTable,function(x)
148 | {
149 |   names<-colnames(EffectiveTable)
150 |   x<-x[complete.cases(x),]
151 |   y<-x[,-(1:2)]
152 |   y<-(x[,-(1:2)]-colmeans[-1])/colsd[-1]
153 |   colnames(y)<-names[-1]
154 |   pre<-predict(svm.sol,newdata=y)
155 |   index<-order(pre,decreasing=T)[1:50]
156 |   svmStock[,count]<<-substr(x[index,1],2,10)
157 |   count <<- count+1
158 |   mean(x[index,2],na.rm=T)
159 | })
160 | write.csv(svmStock,"回归结果/svm_stock.csv",row.names=F)
161 | result.svm <- cbind(PReturn,cumsum(PReturn))
162 | colnames(result.svm)<-c("每月超额收益率","累积超额收益率")
163 | write.csv(result.svm,"回归结果/svm_return.csv")
164 | plot(cumsum(PReturn),type="l",col="red",xlab="2015年前六月",ylab="累计收益率",main="SVR回归")
165 | #randomForest
166 | if(!(require(randomForest))) {install.packages("randomForest");library(randomForest)}
167 | rdForest.sol <- randomForest(EffectiveTable$超额对数收益率~.,data=EffectiveTable)
168 | RegressionResult.rdForest <- predict(rdForest.sol,newdata=EffectiveTable[,2:ncol(EffectiveTable)])
169 | plot(EffectiveTable$超额对数收益率,RegressionResult.rdForest)
170 | rdFStock <<- data.frame(matrix(0,ncol=6,nrow=50))
171 | count <<- 1
172 | PReturn<-sapply(PredictTable,function(x)
173 | {
174 |   names<-colnames(EffectiveTable)
175 |   x<-x[complete.cases(x),]
176 |   y<-x[,-(1:2)]
177 |   y<-(x[,-(1:2)]-colmeans[-1])/colsd[-1]
178 |   colnames(y)<-names[-1]
179 |   pre<-predict(rdForest.sol,newdata=y)
180 |   index<-order(pre,decreasing=T)[1:50]
181 |   rdFStock[,count]<<-substr(x[index,1],2,10)
182 |   count <<- count+1
183 |   mean(x[index,2],na.rm=T)
184 | })
185 | write.csv(rdFStock,"回归结果/rdF_stock.csv",row.names=F)
186 | result.rdF <- cbind(PReturn,cumsum(PReturn))
187 | colnames(result.rdF)<-c("每月超额收益率","累积超额收益率")
188 | write.csv(result.rdF,"回归结果/rdF_return.csv")
189 | plot(cumsum(PReturn),type="l",col="red",xlab="2015年前六月",ylab="累计收益率",main="RandomForest回归")
190 | 


--------------------------------------------------------------------------------