├── A_hybrid_method_of_exponential_smoothing_and_recurrent_neural_networks_for_time_series_forecasting.pdf ├── ES_RNN_SlawekSmyl.pdf ├── LICENSE ├── R ├── merge.R ├── merge_PI.R └── readme.txt ├── README.md ├── c++ ├── ES_RNN.cc ├── ES_RNN_E.cc ├── ES_RNN_E_PI.cc ├── ES_RNN_PI.cc ├── linux_example_scripts │ ├── build_mkl │ ├── readme.txt │ └── run18 ├── readme.txt ├── slstm.cpp ├── slstm.h └── windows_VisualStudio │ ├── M4.sln │ ├── M41 │ ├── ES_RNN.cc │ ├── M41.vcxproj │ ├── slstm.cpp │ └── slstm.h │ ├── M42 │ ├── ES_RNN_PI.cc │ ├── M42.vcxproj │ ├── M42.vcxproj.filters │ └── slstm.h │ ├── M43 │ ├── ES_RNN_E.cc │ ├── M43.filters │ ├── M43.vcxproj │ └── slstm.h │ ├── M44 │ ├── ES_RNN_E_PI.cc │ ├── M44.filters │ ├── M44.vcxproj │ └── slstm.h │ ├── readme.txt │ └── x64 │ └── RelWithDebug │ ├── readme.txt │ ├── run61.cmd │ └── run61_e.cmd ├── readme.txt └── sql ├── createM72nn_SQLServer.sql ├── createM72nn_mysql.txt └── readme.txt /A_hybrid_method_of_exponential_smoothing_and_recurrent_neural_networks_for_time_series_forecasting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slaweks17/ES_RNN/36414dd68c7f15632d6f9008d2651b7331bfca56/A_hybrid_method_of_exponential_smoothing_and_recurrent_neural_networks_for_time_series_forecasting.pdf -------------------------------------------------------------------------------- /ES_RNN_SlawekSmyl.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slaweks17/ES_RNN/36414dd68c7f15632d6f9008d2651b7331bfca56/ES_RNN_SlawekSmyl.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 slaweks17 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /R/merge.R: -------------------------------------------------------------------------------- 1 | # Merging outputs, per category, M4 competition, for point forecasts, so for ES_RNN and ES_RNN_E 2 | # Author: Slawek Smyl, Mar-May 2018 3 | 4 | 5 | #The c++ executables write to one (occasinally two, sorry :-), so in such case move files to one dir before continuing) directories. 6 | #(One logical run of several instances of the same program will produce a number files, e.g. 
outputs with different ibig value) 7 | #This script merges, averages values, and writes them down to the same directory - FOREC_DIR 8 | ############################################################################### 9 | 10 | #directory that should include all *-train.csv files, as well as M4-info.csv 11 | DATA_DIR="F:/progs/data/M4DataSet/" 12 | m4Info_df=read.csv(paste0(DATA_DIR,"M4-info.csv")) 13 | options(stringsAsFactors =FALSE) 14 | 15 | #directory with all the output files produced by the c++ code we want to merge 16 | FOREC_DIR='F:\\progs\\data\\M4\\Quarterly2018-05-31_09_30' #do not end with separator 17 | 18 | LBACK=1 #shoud be as in the c++ code, LBACK>0 means backtesting 19 | SP="Quarterly" 20 | #SP="Yearly" 21 | #SP="Daily" 22 | #SP="Hourly" 23 | 24 | #//----------------PARAMS ---------- comment/uncomment following 3 variables 25 | #for ES_RNN_E, so for all except Monthly and Quarterly runs: 26 | #NUM_OF_SEEDS=1 27 | #NUM_OF_CHUNKS=1 28 | #IBIGS= 29 | 30 | #for ES_RNN (do for Monthly and Quarterly): 31 | NUM_OF_CHUNKS=2 #same as NUM_OF_CHUNKS constant the the c++ cource code, changing it is not recommended. 32 | NUM_OF_SEEDS=3 #It is equal to the number of seeds in the startup script, (or number of teams of worker processes) 33 | # so number_of_concurrent_executables==number_of_lines_in_the_running script/NUM_OF_CHUNKS, and number_of_chunks 34 | #E.g if using following script for ES_RNN: 35 | # start 10 1 0 36 | # start 10 2 0 37 | # start 20 1 5 38 | # start 20 2 5 39 | # start 30 1 10 40 | # start 30 2 10 41 | # we have here three seeds: 10,20,30, and two chunks: 1,2. (The pairs of workes have IBIG offsets of 0,5,10) 42 | IBIGS=3 #number of complete runs by each executables, so if programs are not interrupted, this should be equal to the constant BIG_LOOP in the c++ code, by default 3. 
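# For example: with the startup script above (seeds 10,20,30; chunks 1,2) and IBIGS=3,
# this script expects NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS = 3*2*3 = 18 forecast files in FOREC_DIR,
# named as in the c++ pattern quoted below (the comment line starting with #VARIABLE),
# e.g. a hypothetical "Quarterly_10_1_0_LB1.csv" = seed 10, chunk 1, ibig 0, LBACK 1.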
43 | 44 | 45 | m4_df=read.csv(paste0(DATA_DIR,SP,"-train.csv")) 46 | 47 | sMAPE<-function(forec,actual) { 48 | mean(abs(forec-actual)/(abs(forec)+abs(actual)))*200 49 | } 50 | errorFunc=sMAPE 51 | 52 | 53 | spInfo_df=m4Info_df[m4Info_df$SP==SP,] 54 | ids=spInfo_df$M4id 55 | horizon=spInfo_df[1,"Horizon"] 56 | 57 | #VARIABLE + "_" + to_string(seedForChunks) + "_" + to_string(chunkNo) + "_" + to_string(ibigDb)+"_LB"+ to_string(LBACK)+ ".csv"; 58 | inputFiles=list.files(path = FOREC_DIR, pattern = paste0(SP,".*LB",LBACK), full.names = T) 59 | if (length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS) { 60 | stop("length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS") 61 | } 62 | 63 | 64 | comp_df=NULL 65 | fil=inputFiles[1] 66 | for (fil in inputFiles) { 67 | print(fil) 68 | c_df=read.csv(fil, header=F) 69 | comp_df=rbind(comp_df,c_df) 70 | } 71 | names(comp_df)[1]='id' 72 | 73 | forecSeries=sort(unique(comp_df$id)) 74 | if (length(forecSeries)!=length(ids) && LBACK==0) { 75 | stop(paste0("Expected number of cases:",length(ids)," but got:",length(forecSeries))) 76 | } 77 | 78 | SIZE_OF_CHUNK=1000 79 | out_df=NULL; ou_df=NULL 80 | fSeries=forecSeries[1] 81 | for (fSeries in forecSeries) { 82 | oneSeriesForecs_df=comp_df[comp_df$id==fSeries,] 83 | o1=colMeans(oneSeriesForecs_df[,2:ncol(oneSeriesForecs_df)]) 84 | o_df=data.frame(id=fSeries, as.list(o1), stringsAsFactors =F) 85 | ou_df=rbind(ou_df, o_df) 86 | if (nrow(ou_df)>=SIZE_OF_CHUNK) { 87 | out_df=rbind(out_df,ou_df) 88 | ou_df=NULL 89 | print(nrow(out_df)) 90 | } 91 | } 92 | out_df=rbind(out_df,ou_df) 93 | print(nrow(out_df)) 94 | out_df=out_df[order(as.integer(substring(out_df$id, 2))),] 95 | 96 | #FOREC_DIR="e:\\temp" 97 | outPath=paste0(FOREC_DIR,'\\',SP,"Forec.csv") 98 | write.csv(out_df,file=outPath,row.names = F) 99 | 100 | ################ Main work done, now just diagnostics calculations and plots 101 | 102 | #display a sample of forecasts and, if LBACK>0, actuals 103 | MAX_NUM_OF_POINTS_TO_SHOW=200 104 | for (i in 1:100) { 105 | irand=sample(1:length(forecSeries),1) 106 | fSeries=forecSeries[irand] 107 | forec=as.numeric(out_df[out_df$id==fSeries,2:ncol(out_df)]) 108 | actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)]) 109 | actual=actual[!is.na(actual)] 110 | if (length(actual)>MAX_NUM_OF_POINTS_TO_SHOW) { 111 | actual=actual[(length(actual)-MAX_NUM_OF_POINTS_TO_SHOW):length(actual)] 112 | } 113 | if (LBACK==0) { 114 | plot(c(actual,forec), col=c(rep(1,length(actual)),rep(2,length(forec))), main=fSeries) 115 | } else { 116 | ymin=min(actual,forec) 117 | ymax=max(actual,forec) 118 | plot(1:length(actual),actual, main=fSeries, ylim=c(ymin,ymax)) 119 | lines((length(actual)-length(forec)+1):length(actual), forec, col=2, type='p') 120 | } 121 | 122 | Sys.sleep(5) 123 | } 124 | 125 | 126 | #calc error metrics 127 | if (LBACK>0) { 128 | summErrors=0 129 | fSeries=forecSeries[1] 130 | i=1 131 | for (fSeries in forecSeries) { 132 | if (i%%1000==0) 133 | cat(".") 134 | forec=as.numeric(out_df[out_df$id==fSeries,2:ncol(out_df)]) 135 | actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)]) 136 | actual=actual[!is.na(actual)] 137 | actual=actual[(length(actual)-LBACK*horizon+1):(length(actual)-(LBACK-1)*horizon)] 138 | summErrors=summErrors+errorFunc(forec,actual) 139 | i=i+1 140 | } 141 | print(".") 142 | print(paste0("avg error:",round(summErrors/length(forecSeries),2))) 143 | } 144 | -------------------------------------------------------------------------------- /R/merge_PI.R: 
-------------------------------------------------------------------------------- 1 | # Merging outputs, per category, M4 competition, for Prediction Intervals , so for ES_RNN_PI and ES_RNN_E_PI 2 | # Author: Slawek Smyl, Mar-May 2018 3 | 4 | 5 | #The c++ executables write to one (occasinally two, sorry :-), so in such case move files to one dir before continuing) directories. 6 | #(One logical run of several instances of the same program will produce a number files, e.g. outputs with different ibig value) 7 | #This script merges, averages values, and writes them down to the same directory - FOREC_DIR 8 | ############################################################################### 9 | 10 | #directory that should include all *-train.csv files, as well as M4-info.csv 11 | DATA_DIR="F:/progs/data/M4DataSet/" 12 | m4Info_df=read.csv(paste0(DATA_DIR,"M4-info.csv")) 13 | options(stringsAsFactors =FALSE) 14 | memory.limit(10000) 15 | 16 | #directory with all the output files produced by the c++ code we want to merge 17 | FOREC_DIR='F:\\progs\\data\\M4\\Hourlygood' #do not end with separator 18 | 19 | LBACK=1 #shoud be as in the c++ code, LBACK>0 means backtesting 20 | #SP="Quarterly" 21 | #SP="Yearly" 22 | #SP="Daily" 23 | SP="Hourly" 24 | m4_df=read.csv(paste0(DATA_DIR,SP,"-train.csv")) 25 | 26 | 27 | #//----------------PARAMS ---------- comment/uncomment following 3 variables 28 | #for ES_RNN_E_PI, so for all except Monthly and Quarterly runs: 29 | NUM_OF_SEEDS=1 30 | NUM_OF_CHUNKS=1 31 | #IBIGS=/2 32 | IBIGS=6 33 | 34 | #for ES_RNN_PI (do for Monthly and Quarterly): 35 | #NUM_OF_CHUNKS=2 #same as NUM_OF_CHUNKS constant the the c++ cource code, changing it is not recommended. 36 | #NUM_OF_SEEDS=3 #It is equal to the number of seeds in the startup script, (or number of teams of worker processes) 37 | # so number_of_concurrent_executables==number_of_lines_in_the_running script/NUM_OF_CHUNKS, and number_of_chunks 38 | #E.g if using following script for ES_RNN: 39 | # start 10 1 0 40 | # start 10 2 0 41 | # start 20 1 5 42 | # start 20 2 5 43 | # start 30 1 10 44 | # start 30 2 10 45 | # we have here three seeds: 10,20,30, and two chunks: 1,2. (The pairs of workes have IBIG offsets of 0,5,10) 46 | #IBIGS=3 #number of complete runs by each executables, so if programs are not interrupted, this should be equal to the constant BIG_LOOP in the c++ code, by default 3. 
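# The MSIS function below computes the M4 Mean Scaled Interval Score for the central
# (1-ALPHA) prediction interval [forecL, forecH]:
#   MSIS = [ sum(forecH - forecL)
#            + (2/ALPHA) * sum(max(0, forecL - actual))
#            + (2/ALPHA) * sum(max(0, actual - forecH)) ] / horizon / avgAbsDiff
# where avgAbsDiff is the in-sample mean absolute seasonal difference mean(|y_t - y_{t-seasonality}|):
# interval width plus penalties for actuals falling outside the interval, scaled MASE-style.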
47 | 48 | ALPHA = 0.05; 49 | ALPHA_MULTIP = 2 / ALPHA; 50 | 51 | MSIS<-function(forecL,forecH,actual) { 52 | sumDiffs=0 53 | for (i in 1:(length(actual)-seasonality)) { 54 | sumDiffs=sumDiffs+abs(actual[i+seasonality]-actual[i]) 55 | } 56 | avgAbsDiff=sumDiffs/(length(actual)-seasonality) 57 | 58 | actual=actual[(length(actual)-LBACK*horizon+1):(length(actual)-(LBACK-1)*horizon)] 59 | 60 | msis=sum(forecH-forecL)+sum(pmax(0,forecL-actual))*ALPHA_MULTIP+sum(pmax(0,actual-forecH))*ALPHA_MULTIP 61 | msis/horizon/avgAbsDiff 62 | } 63 | errorFunc=MSIS 64 | 65 | spInfo_df=m4Info_df[m4Info_df$SP==SP,] 66 | ids=spInfo_df$M4id 67 | horizon=spInfo_df[1,"Horizon"] 68 | seasonality=spInfo_df[1,"Frequency"] 69 | 70 | 71 | #lower 72 | #VARIABLE + "_" + to_string(seedForChunks) + "_" + to_string(chunkNo) + "_" + to_string(ibigDb)+"_LB"+ to_string(LBACK)+ ".csv"; 73 | inputFiles=list.files(path = FOREC_DIR, pattern = paste0(SP,".*LLB",LBACK), full.names = T) 74 | if (length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS) { 75 | stop("length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS") 76 | } 77 | 78 | comp_df=NULL 79 | fil=inputFiles[1] 80 | for (fil in inputFiles) { 81 | print(fil) 82 | c_df=read.csv(fil, header=F) 83 | comp_df=rbind(comp_df,c_df) 84 | } 85 | names(comp_df)[1]='id' 86 | 87 | forecSeries=sort(unique(comp_df$id)) 88 | if (length(forecSeries)!=length(ids) && LBACK==0) { 89 | stop(paste0("Expected number of cases:",length(ids)," but got:",length(forecSeries))) 90 | } 91 | 92 | SIZE_OF_CHUNK=1000 93 | out_df=NULL; ou_df=NULL 94 | fSeries=forecSeries[1] 95 | for (fSeries in forecSeries) { 96 | oneSeriesForecs_df=comp_df[comp_df$id==fSeries,] 97 | o1=colMeans(oneSeriesForecs_df[,2:ncol(oneSeriesForecs_df)]) 98 | o_df=data.frame(id=fSeries, as.list(o1), stringsAsFactors =F) 99 | ou_df=rbind(ou_df, o_df) 100 | if (nrow(ou_df)>=SIZE_OF_CHUNK) { 101 | out_df=rbind(out_df,ou_df) 102 | ou_df=NULL 103 | print(nrow(out_df)) 104 | } 105 | } 106 | out_df=rbind(out_df,ou_df) 107 | print(nrow(out_df)) 108 | out_df=out_df[order(as.integer(substring(out_df$id, 2))),] 109 | 110 | outPath=paste0(FOREC_DIR,'\\',SP,"ForecL.csv") 111 | write.csv(out_df,file=outPath,row.names = F) 112 | 113 | lower_df=out_df 114 | 115 | ##################################### 116 | #higher 117 | inputFiles=list.files(path = FOREC_DIR, pattern = paste0(SP,".*HLB",LBACK), full.names = T) 118 | if (length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS) { 119 | stop("length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS") 120 | } 121 | 122 | comp_df=NULL 123 | fil=inputFiles[1] 124 | for (fil in inputFiles) { 125 | print(fil) 126 | c_df=read.csv(fil, header=F) 127 | comp_df=rbind(comp_df,c_df) 128 | } 129 | names(comp_df)[1]='id' 130 | 131 | forecSeries=sort(unique(comp_df$id)) 132 | if (length(forecSeries)!=length(ids) && LBACK==0) { 133 | print(paste0("Warning. 
Expected number of cases:",length(ids)," but got:",length(forecSeries))) 134 | } 135 | 136 | SIZE_OF_CHUNK=1000 137 | out_df=NULL; ou_df=NULL 138 | fSeries=forecSeries[1] 139 | for (fSeries in forecSeries) { 140 | oneSeriesForecs_df=comp_df[comp_df$id==fSeries,] 141 | o1=colMeans(oneSeriesForecs_df[,2:ncol(oneSeriesForecs_df)]) 142 | o_df=data.frame(id=fSeries, as.list(o1), stringsAsFactors =F) 143 | ou_df=rbind(ou_df, o_df) 144 | if (nrow(ou_df)>=SIZE_OF_CHUNK) { 145 | out_df=rbind(out_df,ou_df) 146 | ou_df=NULL 147 | print(nrow(out_df)) 148 | } 149 | } 150 | out_df=rbind(out_df,ou_df) 151 | print(nrow(out_df)) 152 | out_df=out_df[order(as.integer(substring(out_df$id, 2))),] 153 | 154 | outPath=paste0(FOREC_DIR,'\\',SP,"ForecH.csv") 155 | write.csv(out_df,file=outPath,row.names = F) 156 | 157 | higher_df=out_df 158 | 159 | 160 | ################ Main work done, now just diagnostics calculations and plots 161 | 162 | #display a sample of forecasts and, if LBACK>0, actuals 163 | MAX_NUM_OF_POINTS_TO_SHOW=200 164 | i=1 165 | for (i in 1:100) { 166 | irand=sample(1:length(forecSeries),1) 167 | fSeries=forecSeries[irand] 168 | forecL=as.numeric(lower_df[lower_df$id==fSeries,2:ncol(lower_df)]) 169 | forecH=as.numeric(higher_df[higher_df$id==fSeries,2:ncol(higher_df)]) 170 | actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)]) 171 | actual=actual[!is.na(actual)] 172 | if (length(actual)>MAX_NUM_OF_POINTS_TO_SHOW) { 173 | actual=actual[(length(actual)-MAX_NUM_OF_POINTS_TO_SHOW):length(actual)] 174 | } 175 | if (LBACK==0) { 176 | plot(c(actual,forecH), col=c(rep(1,length(actual)),rep(2,length(forecH))), main=fSeries) 177 | lines(c(actual,forecL), col=c(rep(1,length(actual)),rep(3,length(forecL))), type='p') 178 | } else { 179 | ymin=min(actual,forecL) 180 | ymax=max(actual,forecH) 181 | plot(1:length(actual),actual, main=fSeries, ylim=c(ymin,ymax)) 182 | lines((length(actual)-length(forecH)+1):length(actual), forecH, col=2, type='p') 183 | lines((length(actual)-length(forecL)+1):length(actual), forecL, col=3, type='p') 184 | } 185 | 186 | Sys.sleep(5) 187 | } 188 | 189 | 190 | 191 | #calc error metric: MSIS 192 | if (LBACK>0) { 193 | summErrors=0 194 | fSeries=forecSeries[1] 195 | i=1 196 | for (fSeries in forecSeries) { 197 | if (i%%1000==0) 198 | cat(".") 199 | forecL=as.numeric(lower_df[lower_df$id==fSeries,2:ncol(lower_df)]) 200 | forecH=as.numeric(higher_df[higher_df$id==fSeries,2:ncol(higher_df)]) 201 | actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)]) 202 | actual=actual[!is.na(actual)] 203 | summErrors=summErrors+errorFunc(forecL, forecH, actual) 204 | i=i+1 205 | } 206 | print(".") 207 | print(paste0("avg error:",round(summErrors/length(forecSeries),2))) 208 | } 209 | 210 | 211 | -------------------------------------------------------------------------------- /R/readme.txt: -------------------------------------------------------------------------------- 1 | When the c++ workers run, they output results (forecasts) to a directory or two. 2 | (Sorry occasionally two directories are filled, in such case first "manually" put all the output files to a single dir) 3 | These scripts merge them into one file and save it, show a sample of graphs, and if this is backtesting run (LBACK>0), calculate some accuracy metrics. 4 | 5 | Both scripts needs to be updated with your input, output dirs, and other params, see inside, there are a lot of comments there. 6 | 7 | merge.R is meant to be used for point forecst runs, so for ES_RNN and ES_RNN_E programs. 
8 | merge_PI.R - for Prediction Interval runs, so for ES_RNN_PI and ES_RNN_E_PI programs.
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ES_RNN
2 | The repository contains the current, slightly updated version of ES_RNN - a hybrid Exponential Smoothing/Recurrent NN method that won the M4 Forecasting Competition
3 | 
--------------------------------------------------------------------------------
/c++/linux_example_scripts/build_mkl:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | c++ -DEIGEN_FAST_MATH -fPIC -funroll-loops -fno-finite-math-only -Wall -Wno-missing-braces -std=c++11 -Ofast -g -march=native -O2 -g -DNDEBUG -I/home/uber/progs/dynet -I/home/uber/progs/eigen -I/home/uber/progs/dynet/buildMKL $1.cc slstm.cpp -o $1 -lodbc -rdynamic /home/uber/progs/dynet/buildMKL/dynet/libdynet.so -lpthread -lrt -Wl,-rpath,/home/uber/progs/dynet/buildMKL/dynet
3 | 
4 | 
--------------------------------------------------------------------------------
/c++/linux_example_scripts/readme.txt:
--------------------------------------------------------------------------------
1 | build_mkl builds a specified program, linking it with the MKL-compiled version of Dynet.
2 | usage, e.g.:
3 | ./build_mkl ES_RNN
4 | (no extension).
5 | ____You need to modify it to point to your location of the Dynet library.____
6 | Also, remove -lodbc if you do not use it, and especially if you have not installed it :-)
7 | 
8 | run18 is a script that runs 9 pairs of workers, to be used with ES_RNN and ES_RNN_PI.
9 | It assumes it runs on a nice 18-core machine :-), and in such a case the BIG_LOOP constant in the .cc files should probably be = 1, as there is no big need for more than 9 runs for ensembling.
10 | usage, e.g.:
11 | ./run18 ES_RNN
12 | 
13 | 
14 | 
--------------------------------------------------------------------------------
/c++/linux_example_scripts/run18:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | rm ./nohup.out
3 | nohup nice -n 10 ./$1 9 1 &
4 | nohup nice -n 10 ./$1 9 2 &
5 | nohup nice -n 10 ./$1 10 1 5 &
6 | nohup nice -n 10 ./$1 10 2 5 &
7 | nohup nice -n 10 ./$1 11 1 10 &
8 | nohup nice -n 10 ./$1 11 2 10 &
9 | nohup nice -n 10 ./$1 12 1 15 &
10 | nohup nice -n 10 ./$1 12 2 15 &
11 | nohup nice -n 10 ./$1 13 1 20 &
12 | nohup nice -n 10 ./$1 13 2 20 &
13 | nohup nice -n 10 ./$1 14 1 25 &
14 | nohup nice -n 10 ./$1 14 2 25 &
15 | nohup nice -n 10 ./$1 15 1 30 &
16 | nohup nice -n 10 ./$1 15 2 30 &
17 | nohup nice -n 10 ./$1 16 1 35 &
18 | nohup nice -n 10 ./$1 16 2 35 &
19 | nohup nice -n 10 ./$1 17 1 40 &
20 | nohup nice -n 10 ./$1 17 2 40 &
21 | 
--------------------------------------------------------------------------------
/c++/readme.txt:
--------------------------------------------------------------------------------
1 | The programs require Dynet (https://github.com/clab/dynet) installed, compiled for C++.
2 | I have also been using Intel MKL, downloadable freely, and built Dynet to use MKL.
3 | In my early testing CPU performance was better than GPU, so I did not use GPU builds of Dynet.
4 | There are 4 projects, each containing one .cc file and slstm.*.
5 | The programs can be run on Windows, Linux, and Mac.
6 | See inside the *.cc files - there are more details. You need to set up some params.
7 | 
8 | I provide example scripts for Linux, and a VS 2015 solution for Windows.
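The dilated LSTM builders shared by all four programs are implemented in slstm.cpp / slstm.h (next). For orientation only, below is a minimal sketch, under illustrative assumptions (the dilations, dimensions, dummy data and placeholder loss are made up), of how a DilatedLSTMBuilder might be driven from a Dynet program; the real training loops, with per-series exponential smoothing parameters and the actual losses, live in the ES_RNN*.cc files.

// Minimal usage sketch of DilatedLSTMBuilder (illustrative assumptions only; not the ES_RNN code).
#include <vector>
#include "dynet/dynet.h"
#include "dynet/expr.h"
#include "dynet/training.h"
#include "slstm.h"
using namespace std;
using namespace dynet;

int main(int argc, char** argv) {
  dynet::initialize(argc, argv);
  ParameterCollection model;
  AdamTrainer trainer(model);

  const unsigned INPUT_DIM = 8, HIDDEN_DIM = 16;
  // One entry per layer; layer i reuses the hidden/cell state from dilations[i] steps back.
  DilatedLSTMBuilder rnn({1, 3, 9}, INPUT_DIM, HIDDEN_DIM, model);

  vector<vector<float>> series(20, vector<float>(INPUT_DIM, 0.5f)); // dummy input windows

  ComputationGraph cg;
  rnn.new_graph(cg);         // bind the builder's parameters to this graph
  rnn.start_new_sequence();  // clear the h and c histories
  Expression last_h;
  for (auto& step : series)
    last_h = rnn.add_input(input(cg, Dim({INPUT_DIM}), step));

  Expression loss = squared_norm(last_h); // placeholder loss, just to close the loop
  cg.forward(loss);
  cg.backward(loss);
  trainer.update();
  return 0;
}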
-------------------------------------------------------------------------------- /c++/slstm.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | My implementation of dilated LSTMs, based on Dynet LSTM builders 3 | - DilatedLSTMBuilder - standard Dilated LSTM (https://papers.nips.cc/paper/6613-dilated-recurrent-neural-networks.pdf) 4 | - ResidualDilatedLSTMBuilder - Dilated LSTM with special Residual shortcuts, after https://arxiv.org/abs/1701.03360 5 | - AttentiveDilatedLSTMBuilder - Dilated LSTM with Attention mechanism, as in the second stage of https://arxiv.org/abs/1704.02971 6 | * 7 | Slawek Smyl, Mar-May 2018 8 | */ 9 | 10 | #include "slstm.h" 11 | #include "dynet/lstm.h" 12 | #include "dynet/param-init.h" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #if defined DEBUG 20 | #define _DEBUG 21 | #endif 22 | 23 | using namespace std; 24 | 25 | namespace dynet { 26 | 27 | // ResidualDilatedLSTMBuilder based on Vanilla LSTM 28 | enum { _X2I, _H2I, _BI, _X2F, _H2F, _BF, _X2O, _H2O, _BO, _X2G, _H2G, _BG }; 29 | enum { LN_GH, LN_BH, LN_GX, LN_BX, LN_GC, LN_BC }; 30 | 31 | ResidualDilatedLSTMBuilder::ResidualDilatedLSTMBuilder() : has_initial_state(false), layers(0), input_dim(0), hid(0), dropout_rate_h(0), ln_lstm(false), forget_bias(1.f), dropout_masks_valid(false) { } 32 | 33 | ResidualDilatedLSTMBuilder::ResidualDilatedLSTMBuilder(vector dilations, 34 | unsigned input_dim, 35 | unsigned hidden_dim, 36 | ParameterCollection& model, 37 | bool ln_lstm, float forget_bias) : dilations(dilations), layers(unsigned(dilations.size())), 38 | input_dim(input_dim), hid(hidden_dim), ln_lstm(ln_lstm), forget_bias(forget_bias), dropout_masks_valid(false) { 39 | unsigned layer_input_dim = input_dim; 40 | local_model = model.add_subcollection("ResidualDilated-lstm-builder"); 41 | for (unsigned i = 0; i < layers; ++i) { 42 | // i 43 | Parameter p_x2i = local_model.add_parameters({ hidden_dim * 4, layer_input_dim }); 44 | Parameter p_h2i = local_model.add_parameters({ hidden_dim * 4, hidden_dim }); 45 | //Parameter p_c2i = model.add_parameters({hidden_dim, hidden_dim}); 46 | Parameter p_bi = local_model.add_parameters({ hidden_dim * 4 }, ParameterInitConst(0.f)); 47 | 48 | layer_input_dim = hidden_dim; // output (hidden) from 1st layer is input to next 49 | 50 | vector ps = { p_x2i, p_h2i, /*p_c2i,*/ p_bi }; 51 | params.push_back(ps); 52 | 53 | if (ln_lstm) { 54 | Parameter p_gh = model.add_parameters({ hidden_dim * 4 }, ParameterInitConst(1.f)); 55 | Parameter p_bh = model.add_parameters({ hidden_dim * 4 }, ParameterInitConst(0.f)); 56 | Parameter p_gx = model.add_parameters({ hidden_dim * 4 }, ParameterInitConst(1.f)); 57 | Parameter p_bx = model.add_parameters({ hidden_dim * 4 }, ParameterInitConst(0.f)); 58 | Parameter p_gc = model.add_parameters({ hidden_dim }, ParameterInitConst(1.f)); 59 | Parameter p_bc = model.add_parameters({ hidden_dim }, ParameterInitConst(0.f)); 60 | vector ln_ps = { p_gh, p_bh, p_gx, p_bx, p_gc, p_bc }; 61 | ln_params.push_back(ln_ps); 62 | } 63 | } // layers 64 | dropout_rate = 0.f; 65 | dropout_rate_h = 0.f; 66 | } 67 | 68 | void ResidualDilatedLSTMBuilder::new_graph_impl(ComputationGraph& cg, bool update) { 69 | param_vars.clear(); 70 | if (ln_lstm)ln_param_vars.clear(); 71 | for (unsigned i = 0; i < layers; ++i) { 72 | auto& p = params[i]; 73 | vector vars; 74 | for (unsigned j = 0; j < p.size(); ++j) { vars.push_back(update ? 
parameter(cg, p[j]) : const_parameter(cg, p[j])); } 75 | param_vars.push_back(vars); 76 | if (ln_lstm) { 77 | auto& ln_p = ln_params[i]; 78 | vector ln_vars; 79 | for (unsigned j = 0; j < ln_p.size(); ++j) { ln_vars.push_back(update ? parameter(cg, ln_p[j]) : const_parameter(cg, ln_p[j])); } 80 | ln_param_vars.push_back(ln_vars); 81 | } 82 | } 83 | 84 | _cg = &cg; 85 | } 86 | // layout: 0..layers = c 87 | // layers+1..2*layers = h 88 | void ResidualDilatedLSTMBuilder::start_new_sequence_impl(const vector& hinit) { 89 | h.clear(); 90 | c.clear(); 91 | 92 | if (hinit.size() > 0) { 93 | DYNET_ARG_CHECK(layers * 2 == hinit.size(), 94 | "ResidualDilatedLSTMBuilder must be initialized with 2 times as many expressions as layers " 95 | "(hidden state, and cell for each layer). However, for " << layers << " layers, " << 96 | hinit.size() << " expressions were passed in"); 97 | h0.resize(layers); 98 | c0.resize(layers); 99 | for (unsigned i = 0; i < layers; ++i) { 100 | c0[i] = hinit[i]; 101 | h0[i] = hinit[i + layers]; 102 | } 103 | has_initial_state = true; 104 | } 105 | else { 106 | has_initial_state = false; 107 | } 108 | 109 | dropout_masks_valid = false; 110 | } 111 | 112 | void ResidualDilatedLSTMBuilder::set_dropout_masks(unsigned batch_size) { 113 | masks.clear(); 114 | for (unsigned i = 0; i < layers; ++i) { 115 | std::vector masks_i; 116 | unsigned idim = (i == 0) ? input_dim : hid; 117 | if (dropout_rate > 0.f || dropout_rate_h > 0.f) { 118 | float retention_rate = 1.f - dropout_rate; 119 | float retention_rate_h = 1.f - dropout_rate_h; 120 | float scale = 1.f / retention_rate; 121 | float scale_h = 1.f / retention_rate_h; 122 | // in 123 | masks_i.push_back(random_bernoulli(*_cg, Dim({ idim }, batch_size), retention_rate, scale)); 124 | // h 125 | masks_i.push_back(random_bernoulli(*_cg, Dim({ hid }, batch_size), retention_rate_h, scale_h)); 126 | masks.push_back(masks_i); 127 | } 128 | } 129 | dropout_masks_valid = true; 130 | } 131 | 132 | ParameterCollection & ResidualDilatedLSTMBuilder::get_parameter_collection() { 133 | return local_model; 134 | } 135 | 136 | // TODO - Make this correct 137 | // Copied c from the previous step (otherwise c.size()< h.size()) 138 | // Also is creating a new step something we want? 139 | // wouldn't overwriting the current one be better? 
140 | Expression ResidualDilatedLSTMBuilder::set_h_impl(int prev, const vector& h_new) { 141 | DYNET_ARG_CHECK(h_new.empty() || h_new.size() == layers, 142 | "ResidualDilatedLSTMBuilder::set_h expects as many inputs as layers, but got " << 143 | h_new.size() << " inputs for " << layers << " layers"); 144 | const unsigned t = h.size(); 145 | h.push_back(vector(layers)); 146 | c.push_back(vector(layers)); 147 | for (unsigned i = 0; i < layers; ++i) { 148 | Expression h_i = h_new[i]; 149 | Expression c_i = c[t - 1][i]; 150 | h[t][i] = h_i; 151 | c[t][i] = c_i; 152 | } 153 | return h[t].back(); 154 | } 155 | // Current implementation : s_new is either {new_c[0],...,new_c[n]} 156 | // or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} 157 | Expression ResidualDilatedLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { 158 | DYNET_ARG_CHECK(s_new.size() == layers || s_new.size() == 2 * layers, 159 | "ResidualDilatedLSTMBuilder::set_s expects either as many inputs or twice as many inputs as layers, but got " << s_new.size() << " inputs for " << layers << " layers"); 160 | bool only_c = s_new.size() == layers; 161 | const unsigned t = c.size(); 162 | h.push_back(vector(layers)); 163 | c.push_back(vector(layers)); 164 | for (unsigned i = 0; i < layers; ++i) { 165 | Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; 166 | Expression c_i = s_new[i]; 167 | h[t][i] = h_i; 168 | c[t][i] = c_i; 169 | } 170 | return h[t].back(); 171 | } 172 | 173 | Expression ResidualDilatedLSTMBuilder::add_input_impl(int prev, const Expression& x) { 174 | h.push_back(vector(layers)); 175 | c.push_back(vector(layers)); 176 | vector& ht = h.back(); 177 | vector& ct = c.back(); 178 | Expression in = x; 179 | if ((dropout_rate > 0.f || dropout_rate_h > 0.f) && !dropout_masks_valid) set_dropout_masks(x.dim().bd); 180 | for (unsigned i = 0; i < layers; ++i) { 181 | int dilation_offset = dilations[i] - 1; 182 | const vector& vars = param_vars[i]; 183 | 184 | Expression i_h_tm1, i_c_tm1; 185 | bool has_prev_state = (prev >= 0 || has_initial_state); 186 | if (prev < dilation_offset) { 187 | if (has_initial_state) { 188 | // intial value for h and c at timestep 0 in layer i 189 | // defaults to zero matrix input if not set in add_parameter_edges 190 | i_h_tm1 = h0[i]; 191 | i_c_tm1 = c0[i]; 192 | } 193 | else { 194 | i_h_tm1 = zeros(*_cg, Dim({ vars[_BI].dim()[0] / 4 }, x.dim().bd)); 195 | i_c_tm1 = i_h_tm1; 196 | } 197 | } 198 | else { 199 | i_h_tm1 = h[prev - dilation_offset][i]; 200 | i_c_tm1 = c[prev - dilation_offset][i]; 201 | } 202 | // apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights) 203 | if (dropout_rate > 0.f) { 204 | in = cmult(in, masks[i][0]); 205 | } 206 | if (has_prev_state && dropout_rate_h > 0.f) 207 | i_h_tm1 = cmult(i_h_tm1, masks[i][1]); 208 | // input 209 | Expression tmp; 210 | Expression i_ait; 211 | Expression i_aft; 212 | Expression i_aot; 213 | Expression i_agt; 214 | if (ln_lstm) { 215 | const vector& ln_vars = ln_param_vars[i]; 216 | if (has_prev_state) 217 | tmp = vars[_BI] + layer_norm(vars[_X2I] * in, ln_vars[LN_GX], ln_vars[LN_BX]) + layer_norm(vars[_H2I] * i_h_tm1, ln_vars[LN_GH], ln_vars[LN_BH]); 218 | else 219 | tmp = vars[_BI] + layer_norm(vars[_X2I] * in, ln_vars[LN_GX], ln_vars[LN_BX]); 220 | } 221 | else { 222 | if (has_prev_state) 223 | tmp = affine_transform({ vars[_BI], vars[_X2I], in, vars[_H2I], i_h_tm1 }); 224 | else 225 | tmp = affine_transform({ vars[_BI], vars[_X2I], in }); 226 | } 227 | i_ait = pick_range(tmp, 0, hid); 228 | i_aft = 
pick_range(tmp, hid, hid * 2); 229 | i_aot = pick_range(tmp, hid * 2, hid * 3); 230 | i_agt = pick_range(tmp, hid * 3, hid * 4); 231 | Expression i_it = logistic(i_ait); 232 | if (forget_bias != 0.0) 233 | tmp = logistic(i_aft + forget_bias); 234 | else 235 | tmp = logistic(i_aft); 236 | 237 | Expression i_ft = tmp; 238 | Expression i_ot = logistic(i_aot); 239 | Expression i_gt = tanh(i_agt); 240 | 241 | ct[i] = has_prev_state ? (cmult(i_ft, i_c_tm1) + cmult(i_it, i_gt)) : cmult(i_it, i_gt); 242 | if (ln_lstm) { 243 | const vector& ln_vars = ln_param_vars[i]; 244 | if (i==0) 245 | in = ht[i] = cmult(i_ot, tanh(layer_norm(ct[i], ln_vars[LN_GC], ln_vars[LN_BC]))); 246 | else 247 | in = ht[i] = cmult(i_ot, in+tanh(layer_norm(ct[i], ln_vars[LN_GC], ln_vars[LN_BC]))); 248 | } 249 | else { 250 | if (i==0) 251 | in = ht[i] = cmult(i_ot, tanh(ct[i])); 252 | else 253 | in = ht[i] = cmult(i_ot, in+tanh(ct[i])); 254 | } 255 | } 256 | return ht.back(); 257 | } 258 | 259 | void ResidualDilatedLSTMBuilder::copy(const RNNBuilder & rnn) { 260 | const ResidualDilatedLSTMBuilder & rnn_lstm = (const ResidualDilatedLSTMBuilder&)rnn; 261 | DYNET_ARG_CHECK(params.size() == rnn_lstm.params.size(), 262 | "Attempt to copy ResidualDilatedLSTMBuilder with different number of parameters " 263 | "(" << params.size() << " != " << rnn_lstm.params.size() << ")"); 264 | for (size_t i = 0; i < params.size(); ++i) 265 | for (size_t j = 0; j < params[i].size(); ++j) 266 | params[i][j] = rnn_lstm.params[i][j]; 267 | for (size_t i = 0; i < ln_params.size(); ++i) 268 | for (size_t j = 0; j < ln_params[i].size(); ++j) 269 | ln_params[i][j] = rnn_lstm.ln_params[i][j]; 270 | } 271 | 272 | void ResidualDilatedLSTMBuilder::set_dropout(float d) { 273 | DYNET_ARG_CHECK(d >= 0.f && d <= 1.f, 274 | "dropout rate must be a probability (>=0 and <=1)"); 275 | dropout_rate = d; 276 | dropout_rate_h = d; 277 | } 278 | 279 | void ResidualDilatedLSTMBuilder::set_dropout(float d, float d_h) { 280 | DYNET_ARG_CHECK(d >= 0.f && d <= 1.f && d_h >= 0.f && d_h <= 1.f, 281 | "dropout rate must be a probability (>=0 and <=1)"); 282 | dropout_rate = d; 283 | dropout_rate_h = d_h; 284 | } 285 | 286 | void ResidualDilatedLSTMBuilder::disable_dropout() { 287 | dropout_rate = 0.f; 288 | dropout_rate_h = 0.f; 289 | } 290 | 291 | 292 | 293 | 294 | //enum { _X2I, _H2I, _BI, _X2F, _H2F, _BF, _X2O, _H2O, _BO, _X2G, _H2G, _BG }; 295 | enum { _X2I_, _H2I_, _BI_, _XA1, _HA1, _SA1, _BA1, _A2, _B2 }; 296 | 297 | 298 | //*************************** 299 | 300 | 301 | 302 | AttentiveDilatedLSTMBuilder::AttentiveDilatedLSTMBuilder() : has_initial_state(false), layers(0), input_dim(0), hid(0), dropout_rate_h(0), weightnoise_std(0), dropout_masks_valid(false) { } 303 | 304 | AttentiveDilatedLSTMBuilder::AttentiveDilatedLSTMBuilder(vector max_dilations, 305 | unsigned input_dim, 306 | unsigned hidden_dim, 307 | unsigned attention_dim, 308 | ParameterCollection& model) 309 | : max_dilations(max_dilations), layers(unsigned(max_dilations.size())), 310 | input_dim(input_dim), hid(hidden_dim), attention_dim(attention_dim), weightnoise_std(0), dropout_masks_valid(false) { 311 | unsigned layer_input_dim = input_dim; 312 | local_model = model.add_subcollection("compact-vanilla-lstm-builder"); 313 | for (unsigned i = 0; i < layers; ++i) { 314 | // i 315 | Parameter p_Wx = local_model.add_parameters({ hidden_dim * 4, layer_input_dim }); 316 | Parameter p_Wh = local_model.add_parameters({ hidden_dim * 4, hidden_dim }); 317 | Parameter p_b = local_model.add_parameters({ hidden_dim * 4 
}, ParameterInitConst(0.f)); 318 | 319 | Parameter p_Wxa1 = local_model.add_parameters({ attention_dim, layer_input_dim }); 320 | Parameter p_Wha1 = local_model.add_parameters({ attention_dim, hidden_dim }); 321 | Parameter p_Wsa1 = local_model.add_parameters({ attention_dim, hidden_dim }); 322 | Parameter p_ba1 = local_model.add_parameters({ attention_dim }, ParameterInitConst(0.f)); 323 | 324 | Parameter p_Wa2 = local_model.add_parameters({ max_dilations[i], attention_dim }); 325 | Parameter p_ba2 = local_model.add_parameters({ max_dilations[i] }, ParameterInitConst(0.f)); 326 | 327 | layer_input_dim = hidden_dim; // output (hidden) from 1st layer is input to next 328 | 329 | vector ps = { p_Wx, p_Wh, p_b, p_Wxa1, p_Wha1, p_Wsa1, p_ba1, p_Wa2, p_ba2 }; 330 | params.push_back(ps); 331 | 332 | } // layers 333 | dropout_rate = 0.f; 334 | dropout_rate_h = 0.f; 335 | } 336 | 337 | void AttentiveDilatedLSTMBuilder::new_graph_impl(ComputationGraph& cg, bool update) { 338 | param_vars.clear(); 339 | for (unsigned i = 0; i < layers; ++i) { 340 | auto& p = params[i]; 341 | vector vars; 342 | for (unsigned j = 0; j < p.size(); ++j) { 343 | vars.push_back(update ? parameter(cg, p[j]) : const_parameter(cg, p[j])); 344 | } 345 | param_vars.push_back(vars); 346 | } 347 | 348 | _cg = &cg; 349 | } 350 | // layout: 0..layers = c 351 | // layers+1..2*layers = h 352 | void AttentiveDilatedLSTMBuilder::start_new_sequence_impl(const vector& hinit) { 353 | h.clear(); 354 | c.clear(); 355 | 356 | if (hinit.size() > 0) { 357 | DYNET_ARG_CHECK(layers * 2 == hinit.size(), 358 | "AttentiveDilatedLSTMBuilder must be initialized with 2 times as many expressions as layers " 359 | "(hidden state, and cell for each layer). However, for " << layers << " layers, " << 360 | hinit.size() << " expressions were passed in"); 361 | h0.resize(layers); 362 | c0.resize(layers); 363 | for (unsigned i = 0; i < layers; ++i) { 364 | c0[i] = hinit[i]; 365 | h0[i] = hinit[i + layers]; 366 | } 367 | has_initial_state = true; 368 | } 369 | else { 370 | has_initial_state = false; 371 | } 372 | 373 | dropout_masks_valid = false; 374 | } 375 | 376 | void AttentiveDilatedLSTMBuilder::set_dropout_masks(unsigned batch_size) { 377 | masks.clear(); 378 | for (unsigned i = 0; i < layers; ++i) { 379 | std::vector masks_i; 380 | unsigned idim = (i == 0) ? input_dim : hid; 381 | if (dropout_rate > 0.f || dropout_rate_h > 0.f) { 382 | float retention_rate = 1.f - dropout_rate; 383 | float retention_rate_h = 1.f - dropout_rate_h; 384 | float scale = 1.f / retention_rate; 385 | float scale_h = 1.f / retention_rate_h; 386 | // in 387 | masks_i.push_back(random_bernoulli(*_cg, Dim({ idim }, batch_size), retention_rate, scale)); 388 | // h 389 | masks_i.push_back(random_bernoulli(*_cg, Dim({ hid }, batch_size), retention_rate_h, scale_h)); 390 | masks.push_back(masks_i); 391 | } 392 | } 393 | dropout_masks_valid = true; 394 | } 395 | 396 | ParameterCollection & AttentiveDilatedLSTMBuilder::get_parameter_collection() { 397 | return local_model; 398 | } 399 | 400 | // TODO - Make this correct 401 | // Copied c from the previous step (otherwise c.size()< h.size()) 402 | // Also is creating a new step something we want? 403 | // wouldn't overwriting the current one be better? 
404 | Expression AttentiveDilatedLSTMBuilder::set_h_impl(int prev, const vector& h_new) { 405 | DYNET_ARG_CHECK(h_new.empty() || h_new.size() == layers, 406 | "AttentiveDilatedLSTMBuilder::set_h expects as many inputs as layers, but got " << 407 | h_new.size() << " inputs for " << layers << " layers"); 408 | const unsigned t = unsigned(h.size()); 409 | h.push_back(vector(layers)); 410 | c.push_back(vector(layers)); 411 | for (unsigned i = 0; i < layers; ++i) { 412 | Expression h_i = h_new[i]; 413 | Expression c_i = c[t - 1][i]; 414 | h[t][i] = h_i; 415 | c[t][i] = c_i; 416 | } 417 | return h[t].back(); 418 | } 419 | // Current implementation : s_new is either {new_c[0],...,new_c[n]} 420 | // or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} 421 | Expression AttentiveDilatedLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { 422 | DYNET_ARG_CHECK(s_new.size() == layers || s_new.size() == 2 * layers, 423 | "AttentiveDilatedLSTMBuilder::set_s expects either as many inputs or twice as many inputs as layers, but got " << s_new.size() << " inputs for " << layers << " layers"); 424 | bool only_c = s_new.size() == layers; 425 | const unsigned t = unsigned(c.size()); 426 | h.push_back(vector(layers)); 427 | c.push_back(vector(layers)); 428 | for (unsigned i = 0; i < layers; ++i) { 429 | Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; 430 | Expression c_i = s_new[i]; 431 | h[t][i] = h_i; 432 | c[t][i] = c_i; 433 | } 434 | return h[t].back(); 435 | } 436 | 437 | Expression AttentiveDilatedLSTMBuilder::add_input_impl(int prev, const Expression& x) { 438 | h.push_back(vector(layers)); 439 | c.push_back(vector(layers)); 440 | vector& ht = h.back(); 441 | vector& ct = c.back(); 442 | Expression in = x; 443 | if ((dropout_rate > 0.f || dropout_rate_h > 0.f) && !dropout_masks_valid) set_dropout_masks(x.dim().bd); 444 | for (unsigned i = 0; i < layers; ++i) { 445 | int dilation_offset= max_dilations[i]-1; 446 | const vector& vars = param_vars[i]; 447 | Expression i_h_tm1, i_c_tm1; 448 | if (prev < dilation_offset) { 449 | if (has_initial_state) { 450 | // initial value for h and c at timestep 0 in layer i 451 | // defaults to zero matrix input if not set in add_parameter_edges 452 | i_h_tm1 = h0[i]; 453 | i_c_tm1 = c0[i]; 454 | } 455 | else { 456 | i_h_tm1 = zeros(*_cg, Dim({ vars[_BI].dim()[0] / 4 }, x.dim().bd)); 457 | i_c_tm1 = i_h_tm1; 458 | } 459 | } 460 | else { 461 | if (dilation_offset>0) { 462 | //enum { _X2I, _H2I, _BI, _XA1, _HA1, _SA1, _BA1, _A2, _B2 }; 463 | Expression weights_ex=vars[_XA1]*in+ vars[_HA1]*h[prev][i]+ vars[_SA1]*c[prev][i]+ vars[_BA1]; 464 | weights_ex=tanh(weights_ex); 465 | weights_ex=vars[_A2]* weights_ex+ vars[_B2]; 466 | weights_ex =softmax(weights_ex); 467 | #if defined _DEBUG 468 | vector weights=as_vector(weights_ex.value()); 469 | #endif 470 | 471 | unsigned indx=0; 472 | Expression w_ex = pick(weights_ex, indx); 473 | Expression avg_h= cmult(h[prev][i], w_ex); 474 | for (indx=1; indx <= dilation_offset; indx++) {//dilation_offset==max_dilations[i]-1, so together with indx==0, we cover max_dilations[i] steps 475 | w_ex = pick(weights_ex, indx); 476 | avg_h = avg_h+cmult(h[prev- indx][i], w_ex); 477 | } 478 | i_h_tm1 = avg_h; 479 | } else { 480 | i_h_tm1 = h[prev- dilation_offset][i]; 481 | } 482 | i_c_tm1 = c[prev- dilation_offset][i]; 483 | } 484 | if (dropout_rate > 0.f || dropout_rate_h > 0.f) { 485 | // apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights) 486 | Expression gates_t = vanilla_lstm_gates_dropout({ in }, i_h_tm1, 
vars[_X2I], vars[_H2I], vars[_BI], masks[i][0], masks[i][1], weightnoise_std); 487 | ct[i] = vanilla_lstm_c(i_c_tm1, gates_t); 488 | in = ht[i] = vanilla_lstm_h(ct[i], gates_t); 489 | } 490 | else { 491 | Expression gates_t = vanilla_lstm_gates({ in }, i_h_tm1, vars[_X2I], vars[_H2I], vars[_BI], weightnoise_std); 492 | ct[i] = vanilla_lstm_c(i_c_tm1, gates_t); 493 | in = ht[i] = vanilla_lstm_h(ct[i], gates_t); 494 | } 495 | } 496 | return ht.back(); 497 | } 498 | 499 | void AttentiveDilatedLSTMBuilder::copy(const RNNBuilder & rnn) { 500 | const AttentiveDilatedLSTMBuilder & rnn_lstm = (const AttentiveDilatedLSTMBuilder&)rnn; 501 | DYNET_ARG_CHECK(params.size() == rnn_lstm.params.size(), 502 | "Attempt to copy AttentiveDilatedLSTMBuilder with different number of parameters " 503 | "(" << params.size() << " != " << rnn_lstm.params.size() << ")"); 504 | for (size_t i = 0; i < params.size(); ++i) 505 | for (size_t j = 0; j < params[i].size(); ++j) 506 | params[i][j] = rnn_lstm.params[i][j]; 507 | } 508 | 509 | void AttentiveDilatedLSTMBuilder::set_dropout(float d) { 510 | DYNET_ARG_CHECK(d >= 0.f && d <= 1.f, 511 | "dropout rate must be a probability (>=0 and <=1)"); 512 | dropout_rate = d; 513 | dropout_rate_h = d; 514 | } 515 | 516 | void AttentiveDilatedLSTMBuilder::set_dropout(float d, float d_h) { 517 | DYNET_ARG_CHECK(d >= 0.f && d <= 1.f && d_h >= 0.f && d_h <= 1.f, 518 | "dropout rate must be a probability (>=0 and <=1)"); 519 | dropout_rate = d; 520 | dropout_rate_h = d_h; 521 | } 522 | 523 | void AttentiveDilatedLSTMBuilder::disable_dropout() { 524 | dropout_rate = 0.f; 525 | dropout_rate_h = 0.f; 526 | } 527 | void AttentiveDilatedLSTMBuilder::set_weightnoise(float std) { 528 | DYNET_ARG_CHECK(std >= 0.f, "weight noise must have standard deviation >=0"); 529 | weightnoise_std = std; 530 | } 531 | 532 | //*/ 533 | 534 | DilatedLSTMBuilder::DilatedLSTMBuilder() : has_initial_state(false), layers(0), input_dim(0), hid(0), dropout_rate_h(0), weightnoise_std(0), dropout_masks_valid(false) { } 535 | 536 | DilatedLSTMBuilder::DilatedLSTMBuilder(vector dilations, 537 | unsigned input_dim, 538 | unsigned hidden_dim, 539 | ParameterCollection& model) 540 | : dilations(dilations), layers(unsigned(dilations.size())), 541 | input_dim(input_dim), hid(hidden_dim), weightnoise_std(0), dropout_masks_valid(false) { 542 | unsigned layer_input_dim = input_dim; 543 | local_model = model.add_subcollection("compact-vanilla-lstm-builder"); 544 | for (unsigned i = 0; i < layers; ++i) { 545 | // i 546 | Parameter p_Wx = local_model.add_parameters({ hidden_dim * 4, layer_input_dim }); 547 | Parameter p_Wh = local_model.add_parameters({ hidden_dim * 4, hidden_dim }); 548 | Parameter p_b = local_model.add_parameters({ hidden_dim * 4 }, ParameterInitConst(0.f)); 549 | 550 | layer_input_dim = hidden_dim; // output (hidden) from 1st layer is input to next 551 | 552 | vector ps = { p_Wx, p_Wh, p_b }; 553 | params.push_back(ps); 554 | 555 | } // layers 556 | dropout_rate = 0.f; 557 | dropout_rate_h = 0.f; 558 | } 559 | 560 | void DilatedLSTMBuilder::new_graph_impl(ComputationGraph& cg, bool update) { 561 | param_vars.clear(); 562 | for (unsigned i = 0; i < layers; ++i) { 563 | auto& p = params[i]; 564 | vector vars; 565 | for (unsigned j = 0; j < p.size(); ++j) { vars.push_back(update ? 
parameter(cg, p[j]) : const_parameter(cg, p[j])); } 566 | param_vars.push_back(vars); 567 | } 568 | 569 | _cg = &cg; 570 | } 571 | // layout: 0..layers = c 572 | // layers+1..2*layers = h 573 | void DilatedLSTMBuilder::start_new_sequence_impl(const vector& hinit) { 574 | h.clear(); 575 | c.clear(); 576 | 577 | if (hinit.size() > 0) { 578 | DYNET_ARG_CHECK(layers * 2 == hinit.size(), 579 | "DilatedLSTMBuilder must be initialized with 2 times as many expressions as layers " 580 | "(hidden state, and cell for each layer). However, for " << layers << " layers, " << 581 | hinit.size() << " expressions were passed in"); 582 | h0.resize(layers); 583 | c0.resize(layers); 584 | for (unsigned i = 0; i < layers; ++i) { 585 | c0[i] = hinit[i]; 586 | h0[i] = hinit[i + layers]; 587 | } 588 | has_initial_state = true; 589 | } else { 590 | has_initial_state = false; 591 | } 592 | 593 | dropout_masks_valid = false; 594 | } 595 | 596 | void DilatedLSTMBuilder::set_dropout_masks(unsigned batch_size) { 597 | masks.clear(); 598 | for (unsigned i = 0; i < layers; ++i) { 599 | std::vector masks_i; 600 | unsigned idim = (i == 0) ? input_dim : hid; 601 | if (dropout_rate > 0.f || dropout_rate_h > 0.f) { 602 | float retention_rate = 1.f - dropout_rate; 603 | float retention_rate_h = 1.f - dropout_rate_h; 604 | float scale = 1.f / retention_rate; 605 | float scale_h = 1.f / retention_rate_h; 606 | // in 607 | masks_i.push_back(random_bernoulli(*_cg, Dim({ idim }, batch_size), retention_rate, scale)); 608 | // h 609 | masks_i.push_back(random_bernoulli(*_cg, Dim({ hid }, batch_size), retention_rate_h, scale_h)); 610 | masks.push_back(masks_i); 611 | } 612 | } 613 | dropout_masks_valid = true; 614 | } 615 | 616 | ParameterCollection & DilatedLSTMBuilder::get_parameter_collection() { 617 | return local_model; 618 | } 619 | 620 | // TODO - Make this correct 621 | // Copied c from the previous step (otherwise c.size()< h.size()) 622 | // Also is creating a new step something we want? 623 | // wouldn't overwriting the current one be better? 624 | Expression DilatedLSTMBuilder::set_h_impl(int prev, const vector& h_new) { 625 | DYNET_ARG_CHECK(h_new.empty() || h_new.size() == layers, 626 | "DilatedLSTMBuilder::set_h expects as many inputs as layers, but got " << 627 | h_new.size() << " inputs for " << layers << " layers"); 628 | const unsigned t = unsigned(h.size()); 629 | h.push_back(vector(layers)); 630 | c.push_back(vector(layers)); 631 | for (unsigned i = 0; i < layers; ++i) { 632 | Expression h_i = h_new[i]; 633 | Expression c_i = c[t - 1][i]; 634 | h[t][i] = h_i; 635 | c[t][i] = c_i; 636 | } 637 | return h[t].back(); 638 | } 639 | // Current implementation : s_new is either {new_c[0],...,new_c[n]} 640 | // or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} 641 | Expression DilatedLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { 642 | DYNET_ARG_CHECK(s_new.size() == layers || s_new.size() == 2 * layers, 643 | "DilatedLSTMBuilder::set_s expects either as many inputs or twice as many inputs as layers, but got " << s_new.size() << " inputs for " << layers << " layers"); 644 | bool only_c = s_new.size() == layers; 645 | const unsigned t = unsigned(c.size()); 646 | h.push_back(vector(layers)); 647 | c.push_back(vector(layers)); 648 | for (unsigned i = 0; i < layers; ++i) { 649 | Expression h_i = only_c ? 
h[t - 1][i] : s_new[i + layers]; 650 | Expression c_i = s_new[i]; 651 | h[t][i] = h_i; 652 | c[t][i] = c_i; 653 | } 654 | return h[t].back(); 655 | } 656 | 657 | Expression DilatedLSTMBuilder::add_input_impl(int prev, const Expression& x) { 658 | h.push_back(vector(layers)); 659 | c.push_back(vector(layers)); 660 | vector& ht = h.back(); 661 | vector& ct = c.back(); 662 | Expression in = x; 663 | if ((dropout_rate > 0.f || dropout_rate_h > 0.f) && !dropout_masks_valid) set_dropout_masks(x.dim().bd); 664 | for (unsigned i = 0; i < layers; ++i) { 665 | int dilation_offset = dilations[i] - 1; 666 | const vector& vars = param_vars[i]; 667 | Expression i_h_tm1, i_c_tm1; 668 | if (prev < dilation_offset) { 669 | if (has_initial_state) { 670 | // initial value for h and c at timestep 0 in layer i 671 | // defaults to zero matrix input if not set in add_parameter_edges 672 | i_h_tm1 = h0[i]; 673 | i_c_tm1 = c0[i]; 674 | } else { 675 | i_h_tm1 = zeros(*_cg, Dim({ vars[_BI].dim()[0] / 4 }, x.dim().bd)); 676 | i_c_tm1 = i_h_tm1; 677 | } 678 | } else { // t > 0 679 | i_h_tm1 = h[prev - dilation_offset][i]; 680 | i_c_tm1 = c[prev - dilation_offset][i]; 681 | } 682 | if (dropout_rate > 0.f || dropout_rate_h > 0.f) { 683 | // apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights) 684 | Expression gates_t = vanilla_lstm_gates_dropout({ in }, i_h_tm1, vars[_X2I], vars[_H2I], vars[_BI], masks[i][0], masks[i][1], weightnoise_std); 685 | ct[i] = vanilla_lstm_c(i_c_tm1, gates_t); 686 | in = ht[i] = vanilla_lstm_h(ct[i], gates_t); 687 | } else { 688 | Expression gates_t = vanilla_lstm_gates({ in }, i_h_tm1, vars[_X2I], vars[_H2I], vars[_BI], weightnoise_std); 689 | ct[i] = vanilla_lstm_c(i_c_tm1, gates_t); 690 | in = ht[i] = vanilla_lstm_h(ct[i], gates_t); 691 | } 692 | } 693 | return ht.back(); 694 | } 695 | 696 | void DilatedLSTMBuilder::copy(const RNNBuilder & rnn) { 697 | const DilatedLSTMBuilder & rnn_lstm = (const DilatedLSTMBuilder&)rnn; 698 | DYNET_ARG_CHECK(params.size() == rnn_lstm.params.size(), 699 | "Attempt to copy DilatedLSTMBuilder with different number of parameters " 700 | "(" << params.size() << " != " << rnn_lstm.params.size() << ")"); 701 | for (size_t i = 0; i < params.size(); ++i) 702 | for (size_t j = 0; j < params[i].size(); ++j) 703 | params[i][j] = rnn_lstm.params[i][j]; 704 | } 705 | 706 | void DilatedLSTMBuilder::set_dropout(float d) { 707 | DYNET_ARG_CHECK(d >= 0.f && d <= 1.f, 708 | "dropout rate must be a probability (>=0 and <=1)"); 709 | dropout_rate = d; 710 | dropout_rate_h = d; 711 | } 712 | 713 | void DilatedLSTMBuilder::set_dropout(float d, float d_h) { 714 | DYNET_ARG_CHECK(d >= 0.f && d <= 1.f && d_h >= 0.f && d_h <= 1.f, 715 | "dropout rate must be a probability (>=0 and <=1)"); 716 | dropout_rate = d; 717 | dropout_rate_h = d_h; 718 | } 719 | 720 | void DilatedLSTMBuilder::disable_dropout() { 721 | dropout_rate = 0.f; 722 | dropout_rate_h = 0.f; 723 | } 724 | void DilatedLSTMBuilder::set_weightnoise(float std) { 725 | DYNET_ARG_CHECK(std >= 0.f, "weight noise must have standard deviation >=0"); 726 | weightnoise_std = std; 727 | } 728 | 729 | } // namespace dynet 730 | -------------------------------------------------------------------------------- /c++/slstm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * file slstm.h 3 | * header for my implementation of dilated LSTMs, based on Dynet LSTM builders 4 | - DilatedLSTMBuilder - standard Dilated LSTM 
(https://papers.nips.cc/paper/6613-dilated-recurrent-neural-networks.pdf) 5 | - ResidualDilatedLSTMBuilder - Dilated LSTM with special Residual shortcuts, after https://arxiv.org/abs/1701.03360 6 | - AttentiveDilatedLSTMBuilder - Dilated LSTM with Attention mechanism, as in the second stage of https://arxiv.org/abs/1704.02971 7 | * 8 | Slawek Smyl, Mar-May 2018 9 | */ 10 | 11 | #ifndef DYNET_SLSTMS_H_ 12 | #define DYNET_SLSTMS_H_ 13 | 14 | #include "dynet/dynet.h" 15 | #include "dynet/rnn.h" 16 | #include "dynet/expr.h" 17 | 18 | using namespace std; 19 | 20 | namespace dynet { 21 | 22 | //basd on VanillaLSTMBuilder 23 | struct ResidualDilatedLSTMBuilder : public RNNBuilder { 24 | /** 25 | * @brief Default Constructor 26 | */ 27 | ResidualDilatedLSTMBuilder(); 28 | /** 29 | * \brief Constructor for the ResidualDilatedLSTMBuilder 30 | * 31 | * \param dilations Vector of dilations 32 | * \param input_dim Dimention of the input \f$x_t\f$ 33 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 34 | * \param model ParameterCollection holding the parameters 35 | * \param ln_lstm Whether to use layer normalization 36 | * \param forget_bias value(float) to use as bias for the forget gate(default = 1.0) 37 | */ 38 | explicit ResidualDilatedLSTMBuilder(vector dilations, 39 | unsigned input_dim, 40 | unsigned hidden_dim, 41 | ParameterCollection& model, 42 | bool ln_lstm = false, 43 | float forget_bias = 1.f); 44 | 45 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 46 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 47 | std::vector final_s() const override { 48 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 49 | for (auto my_h : final_h()) ret.push_back(my_h); 50 | return ret; 51 | } 52 | unsigned num_h0_components() const override { return 2 * layers; } 53 | 54 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 55 | std::vector get_s(RNNPointer i) const override { 56 | std::vector ret = (i == -1 ? c0 : c[i]); 57 | for (auto my_h : get_h(i)) ret.push_back(my_h); 58 | return ret; 59 | } 60 | 61 | void copy(const RNNBuilder & params) override; 62 | 63 | /** 64 | * \brief Set the dropout rates to a unique value 65 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 66 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 67 | */ 68 | void set_dropout(float d); 69 | /** 70 | * \brief Set the dropout rates 71 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 72 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
73 | * The dynamics of the cell are then modified to : 74 | * 75 | * \f$ 76 | * \begin{split} 77 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 78 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 79 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 80 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 81 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 82 | h_t & = \tanh(c_t)\circ o_t\\ 83 | \end{split} 84 | * \f$ 85 | * 86 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 87 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 88 | * \param d_h Dropout rate \f$d_h\f$ for the output \f$h_t\f$ 89 | */ 90 | void set_dropout(float d, float d_r); 91 | /** 92 | * \brief Set all dropout rates to 0 93 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 94 | * 95 | */ 96 | void disable_dropout(); 97 | /** 98 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 99 | * \details If this function is not called on batched input, the same mask will be applied across 100 | * all batch elements. Use this to apply different masks to each batch element 101 | * 102 | * \param batch_size Batch size 103 | */ 104 | void set_dropout_masks(unsigned batch_size = 1); 105 | /** 106 | * \brief Get parameters in ResidualDilatedLSTMBuilder 107 | * \return list of points to ParameterStorage objects 108 | */ 109 | ParameterCollection & get_parameter_collection() override; 110 | protected: 111 | void new_graph_impl(ComputationGraph& cg, bool update) override; 112 | void start_new_sequence_impl(const std::vector& h0) override; 113 | Expression add_input_impl(int prev, const Expression& x) override; 114 | Expression set_h_impl(int prev, const std::vector& h_new) override; 115 | Expression set_s_impl(int prev, const std::vector& s_new) override; 116 | 117 | public: 118 | ParameterCollection local_model; 119 | // first index is layer, then ... 120 | std::vector> params; 121 | // first index is layer, then ... 122 | std::vector> ln_params; 123 | 124 | // first index is layer, then ... 125 | std::vector> param_vars; 126 | // first index is layer, then ... 127 | std::vector> ln_param_vars; 128 | 129 | // first index is layer, then ... 
130 | std::vector> masks; 131 | 132 | // first index is time, second is layer 133 | std::vector> h, c; 134 | 135 | // initial values of h and c at each layer 136 | // - both default to zero matrix input 137 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 138 | std::vector h0; 139 | std::vector c0; 140 | unsigned layers; 141 | unsigned input_dim, hid; 142 | float dropout_rate_h; 143 | bool ln_lstm; 144 | float forget_bias; 145 | bool dropout_masks_valid; 146 | vector dilations; //one int per layer 147 | 148 | private: 149 | ComputationGraph* _cg; // Pointer to current cg 150 | 151 | }; 152 | 153 | 154 | struct DilatedLSTMBuilder : public RNNBuilder { 155 | /** 156 | * @brief Default Constructor 157 | */ 158 | DilatedLSTMBuilder(); 159 | /** 160 | * \brief Constructor for the DilatedLSTMBuilder 161 | * 162 | * \param dilations Vector of dilations 163 | * \param input_dim Dimention of the input \f$x_t\f$ 164 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 165 | * \param model ParameterCollection holding the parameters 166 | */ 167 | explicit DilatedLSTMBuilder(vector dilations, 168 | unsigned input_dim, 169 | unsigned hidden_dim, 170 | ParameterCollection& model); 171 | 172 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 173 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 174 | std::vector final_s() const override { 175 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 176 | for (auto my_h : final_h()) ret.push_back(my_h); 177 | return ret; 178 | } 179 | unsigned num_h0_components() const override { return 2 * layers; } 180 | 181 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 182 | std::vector get_s(RNNPointer i) const override { 183 | std::vector ret = (i == -1 ? c0 : c[i]); 184 | for (auto my_h : get_h(i)) ret.push_back(my_h); 185 | return ret; 186 | } 187 | 188 | void copy(const RNNBuilder & params) override; 189 | 190 | /** 191 | * \brief Set the dropout rates to a unique value 192 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 193 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 194 | */ 195 | void set_dropout(float d); 196 | /** 197 | * \brief Set the dropout rates 198 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 199 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
200 | * The dynamics of the cell are then modified to : 201 | * 202 | * \f$ 203 | * \begin{split} 204 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 205 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 206 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 207 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 208 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 209 | h_t & = \tanh(c_t)\circ o_t\\ 210 | \end{split} 211 | * \f$ 212 | * 213 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 214 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 215 | */ 216 | void set_dropout(float d, float d_r); 217 | /** 218 | * \brief Set all dropout rates to 0 219 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 220 | * 221 | */ 222 | void disable_dropout(); 223 | /** 224 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 225 | * \details If this function is not called on batched input, the same mask will be applied across 226 | * all batch elements. Use this to apply different masks to each batch element 227 | * 228 | * \param batch_size Batch size 229 | */ 230 | void set_dropout_masks(unsigned batch_size = 1); 231 | 232 | void set_weightnoise(float std); 233 | ParameterCollection & get_parameter_collection() override; 234 | protected: 235 | void new_graph_impl(ComputationGraph& cg, bool update) override; 236 | void start_new_sequence_impl(const std::vector& h0) override; 237 | Expression add_input_impl(int prev, const Expression& x) override; 238 | Expression set_h_impl(int prev, const std::vector& h_new) override; 239 | Expression set_s_impl(int prev, const std::vector& s_new) override; 240 | 241 | public: 242 | ParameterCollection local_model; 243 | // first index is layer, then ... 244 | std::vector> params; 245 | 246 | // first index is layer, then ... 247 | std::vector> param_vars; 248 | 249 | // first index is layer, then ... 
250 | std::vector> masks; 251 | 252 | // first index is time, second is layer 253 | std::vector> h, c; 254 | 255 | // initial values of h and c at each layer 256 | // - both default to zero matrix input 257 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 258 | std::vector h0; 259 | std::vector c0; 260 | unsigned layers; 261 | unsigned input_dim, hid; 262 | float dropout_rate_h; 263 | float weightnoise_std; 264 | vector dilations; //one int per layer 265 | 266 | bool dropout_masks_valid; 267 | private: 268 | ComputationGraph* _cg; // Pointer to current cg 269 | 270 | }; 271 | 272 | 273 | struct AttentiveDilatedLSTMBuilder : public RNNBuilder { 274 | /** 275 | * @brief Default Constructor 276 | */ 277 | AttentiveDilatedLSTMBuilder(); 278 | /** 279 | * \brief Constructor for the AttentiveDilatedLSTMBuilder 280 | * 281 | * \param max_dilations Vector, maximum dilations (per layer) 282 | * \param input_dim Dimention of the input \f$x_t\f$ 283 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 284 | * \param model ParameterCollection holding the parameters 285 | */ 286 | explicit AttentiveDilatedLSTMBuilder(vector max_dilations, 287 | unsigned input_dim, 288 | unsigned hidden_dim, 289 | unsigned attention_dim, 290 | ParameterCollection& model); 291 | 292 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 293 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 294 | std::vector final_s() const override { 295 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 296 | for (auto my_h : final_h()) ret.push_back(my_h); 297 | return ret; 298 | } 299 | unsigned num_h0_components() const override { return 2 * layers; } 300 | 301 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 302 | std::vector get_s(RNNPointer i) const override { 303 | std::vector ret = (i == -1 ? c0 : c[i]); 304 | for (auto my_h : get_h(i)) ret.push_back(my_h); 305 | return ret; 306 | } 307 | 308 | void copy(const RNNBuilder & params) override; 309 | 310 | /** 311 | * \brief Set the dropout rates to a unique value 312 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 313 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 314 | */ 315 | void set_dropout(float d); 316 | /** 317 | * \brief Set the dropout rates 318 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 319 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
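 * (As a concrete illustration of the scaling, assuming a rate \f$d_x = 0.25\f$: each element of
 * \f$\mathbf{z_x}\f$ is 1 with probability \f$1-d_x = 0.75\f$ and 0 otherwise, and the surviving
 * inputs are multiplied by \f$\frac 1 {1-d_x} = \frac 4 3\f$, so the expected value of
 * \f$\frac 1 {1-d_x}\mathbf{z_x} \circ x_t\f$ is exactly \f$x_t\f$. The same drawn masks are then
 * reused at every time step of the sequence.)
 *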
320 | * The dynamics of the cell are then modified to : 321 | * 322 | * \f$ 323 | * \begin{split} 324 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 325 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 326 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 327 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 328 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 329 | h_t & = \tanh(c_t)\circ o_t\\ 330 | \end{split} 331 | * \f$ 332 | * 333 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 334 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 335 | */ 336 | void set_dropout(float d, float d_r); 337 | /** 338 | * \brief Set all dropout rates to 0 339 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 340 | * 341 | */ 342 | void disable_dropout(); 343 | /** 344 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 345 | * \details If this function is not called on batched input, the same mask will be applied across 346 | * all batch elements. Use this to apply different masks to each batch element 347 | * 348 | * \param batch_size Batch size 349 | */ 350 | void set_dropout_masks(unsigned batch_size = 1); 351 | 352 | void set_weightnoise(float std); 353 | ParameterCollection & get_parameter_collection() override; 354 | protected: 355 | void new_graph_impl(ComputationGraph& cg, bool update) override; 356 | void start_new_sequence_impl(const std::vector& h0) override; 357 | Expression add_input_impl(int prev, const Expression& x) override; 358 | Expression set_h_impl(int prev, const std::vector& h_new) override; 359 | Expression set_s_impl(int prev, const std::vector& s_new) override; 360 | 361 | public: 362 | ParameterCollection local_model; 363 | // first index is layer, then ... 364 | std::vector> params; 365 | 366 | // first index is layer, then ... 367 | std::vector> param_vars; 368 | 369 | // first index is layer, then ... 
370 | std::vector> masks; 371 | 372 | // first index is time, second is layer 373 | std::vector> h, c; 374 | 375 | // initial values of h and c at each layer 376 | // - both default to zero matrix input 377 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 378 | std::vector h0; 379 | std::vector c0; 380 | unsigned layers; 381 | unsigned input_dim, hid; 382 | unsigned attention_dim; 383 | float dropout_rate_h; 384 | float weightnoise_std; 385 | vector max_dilations; //one int per layer 386 | 387 | bool dropout_masks_valid; 388 | private: 389 | ComputationGraph* _cg; // Pointer to current cg 390 | 391 | }; 392 | } // namespace dynet 393 | 394 | #endif 395 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M4.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "M41", "M41\M41.vcxproj", "{928301A0-F01A-48F6-A499-851B3CE8BD4E}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "M42", "M42\M42.vcxproj", "{A16B5466-E680-43F6-A884-A4A01EB78E50}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "M43", "M43\M43.vcxproj", "{BE951571-3F3A-4048-BAA3-0C05F38CFF42}" 11 | EndProject 12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "M44", "M44\M44.vcxproj", "{7A192E0C-8F58-4D65-998E-3A7010AB5F87}" 13 | EndProject 14 | Global 15 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 16 | Debug|x64 = Debug|x64 17 | Debug|x86 = Debug|x86 18 | RelWithDebug|x64 = RelWithDebug|x64 19 | RelWithDebug|x86 = RelWithDebug|x86 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.Debug|x64.ActiveCfg = Debug|x64 23 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.Debug|x64.Build.0 = Debug|x64 24 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.Debug|x86.ActiveCfg = Debug|Win32 25 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.Debug|x86.Build.0 = Debug|Win32 26 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.RelWithDebug|x64.ActiveCfg = RelWithDebug|x64 27 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.RelWithDebug|x64.Build.0 = RelWithDebug|x64 28 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.RelWithDebug|x86.ActiveCfg = RelWithDebug|Win32 29 | {928301A0-F01A-48F6-A499-851B3CE8BD4E}.RelWithDebug|x86.Build.0 = RelWithDebug|Win32 30 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.Debug|x64.ActiveCfg = Debug|x64 31 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.Debug|x64.Build.0 = Debug|x64 32 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.Debug|x86.ActiveCfg = Debug|Win32 33 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.Debug|x86.Build.0 = Debug|Win32 34 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.RelWithDebug|x64.ActiveCfg = RelWithDebug|x64 35 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.RelWithDebug|x64.Build.0 = RelWithDebug|x64 36 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.RelWithDebug|x86.ActiveCfg = RelWithDebug|Win32 37 | {A16B5466-E680-43F6-A884-A4A01EB78E50}.RelWithDebug|x86.Build.0 = RelWithDebug|Win32 38 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42}.Debug|x64.ActiveCfg = Debug|x64 39 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42}.Debug|x64.Build.0 = Debug|x64 40 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42}.Debug|x86.ActiveCfg = Debug|Win32 41 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42}.Debug|x86.Build.0 = Debug|Win32 42 | 
{BE951571-3F3A-4048-BAA3-0C05F38CFF42}.RelWithDebug|x64.ActiveCfg = RelWithDebug|x64 43 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42}.RelWithDebug|x64.Build.0 = RelWithDebug|x64 44 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42}.RelWithDebug|x86.ActiveCfg = RelWithDebug|Win32 45 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42}.RelWithDebug|x86.Build.0 = RelWithDebug|Win32 46 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.Debug|x64.ActiveCfg = Debug|x64 47 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.Debug|x64.Build.0 = Debug|x64 48 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.Debug|x86.ActiveCfg = Debug|Win32 49 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.Debug|x86.Build.0 = Debug|Win32 50 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.RelWithDebug|x64.ActiveCfg = RelWithDebug|x64 51 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.RelWithDebug|x64.Build.0 = RelWithDebug|x64 52 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.RelWithDebug|x86.ActiveCfg = RelWithDebug|Win32 53 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87}.RelWithDebug|x86.Build.0 = RelWithDebug|Win32 54 | EndGlobalSection 55 | GlobalSection(SolutionProperties) = preSolution 56 | HideSolutionNode = FALSE 57 | EndGlobalSection 58 | EndGlobal 59 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M41/M41.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | RelWithDebug 22 | Win32 23 | 24 | 25 | RelWithDebug 26 | x64 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | {928301A0-F01A-48F6-A499-851B3CE8BD4E} 38 | Win32Proj 39 | M41 40 | 8.1 41 | 42 | 43 | 44 | Application 45 | true 46 | v140 47 | Unicode 48 | 49 | 50 | Application 51 | true 52 | v140 53 | Unicode 54 | 55 | 56 | Application 57 | false 58 | v140 59 | true 60 | Unicode 61 | 62 | 63 | Application 64 | true 65 | v140 66 | Unicode 67 | Sequential 68 | 69 | 70 | Application 71 | true 72 | v140 73 | Unicode 74 | Sequential 75 | 76 | 77 | Application 78 | false 79 | v140 80 | true 81 | Unicode 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | true 109 | 110 | 111 | true 112 | 113 | 114 | true 115 | 116 | 117 | true 118 | 119 | 120 | false 121 | 122 | 123 | false 124 | 125 | 126 | 127 | 128 | 129 | Level3 130 | Disabled 131 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 132 | 133 | 134 | Console 135 | true 136 | 137 | 138 | 139 | 140 | 141 | 142 | Level3 143 | Disabled 144 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 145 | 146 | 147 | Console 148 | true 149 | 150 | 151 | 152 | 153 | NotUsing 154 | Level1 155 | Disabled 156 | WIN32;_WINDOWS;EIGEN_USE_MKL_ALL;EIGEN_FAST_MATH;NOMINMAX;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 157 | E:\progs2\dynet;E:\progs\Eigen; 158 | 159 | 160 | Console 161 | true 162 | E:\progs2\dynet\buildMKL\dynet\Debug 163 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 164 | 165 | 166 | 167 | 168 | NotUsing 169 | Level1 170 | MaxSpeed 171 | WIN32;_WINDOWS;EIGEN_FAST_MATH;EIGEN_USE_MKL_ALL;NOMINMAX;NDEBUG_;CONSOLE;%(PreprocessorDefinitions) 172 | E:\progs2\dynet;E:\progs\Eigen; 173 | AnySuitable 174 | true 175 | Speed 176 | AdvancedVectorExtensions 177 | Default 178 | MultiThreadedDLL 179 | ProgramDatabase 180 | true 181 | false 182 | 183 | 184 | 
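<!-- The include and library directories referenced in this project (E:\progs2\dynet, E:\progs\Eigen,
     E:\progs2\dynet\buildMKL\dynet\Debug and ...\RelWithDebInfo) are local to the author's machine and
     have to be repointed at the local dynet, Eigen and MKL-enabled dynet builds before compiling. -->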
Console 185 | true 186 | E:\progs2\dynet\buildMKL\dynet\RelWithDebInfo 187 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 188 | 189 | 190 | 191 | 192 | Level3 193 | 194 | 195 | MaxSpeed 196 | true 197 | true 198 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 199 | 200 | 201 | Console 202 | true 203 | true 204 | true 205 | 206 | 207 | 208 | 209 | Level3 210 | 211 | 212 | MaxSpeed 213 | true 214 | true 215 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 216 | 217 | 218 | Console 219 | true 220 | true 221 | true 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M41/slstm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * file slstm.h 3 | * header for my implementation of dilated LSTMs, based on Dynet LSTM builders 4 | - DilatedLSTMBuilder - standard Dilated LSTM (https://papers.nips.cc/paper/6613-dilated-recurrent-neural-networks.pdf) 5 | - ResidualDilatedLSTMBuilder - Dilated LSTM with special Residual shortcuts, after https://arxiv.org/abs/1701.03360 6 | - AttentiveDilatedLSTMBuilder - Dilated LSTM with Attention mechanism, as in the second stage of https://arxiv.org/abs/1704.02971 7 | * 8 | Slawek Smyl, Mar-May 2018 9 | */ 10 | 11 | #ifndef DYNET_SLSTMS_H_ 12 | #define DYNET_SLSTMS_H_ 13 | 14 | #include "dynet/dynet.h" 15 | #include "dynet/rnn.h" 16 | #include "dynet/expr.h" 17 | 18 | using namespace std; 19 | 20 | namespace dynet { 21 | 22 | //basd on VanillaLSTMBuilder 23 | struct ResidualDilatedLSTMBuilder : public RNNBuilder { 24 | /** 25 | * @brief Default Constructor 26 | */ 27 | ResidualDilatedLSTMBuilder(); 28 | /** 29 | * \brief Constructor for the ResidualDilatedLSTMBuilder 30 | * 31 | * \param dilations Vector of dilations 32 | * \param input_dim Dimention of the input \f$x_t\f$ 33 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 34 | * \param model ParameterCollection holding the parameters 35 | * \param ln_lstm Whether to use layer normalization 36 | * \param forget_bias value(float) to use as bias for the forget gate(default = 1.0) 37 | */ 38 | explicit ResidualDilatedLSTMBuilder(vector dilations, 39 | unsigned input_dim, 40 | unsigned hidden_dim, 41 | ParameterCollection& model, 42 | bool ln_lstm = false, 43 | float forget_bias = 1.f); 44 | 45 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 46 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 47 | std::vector final_s() const override { 48 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 49 | for (auto my_h : final_h()) ret.push_back(my_h); 50 | return ret; 51 | } 52 | unsigned num_h0_components() const override { return 2 * layers; } 53 | 54 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 55 | std::vector get_s(RNNPointer i) const override { 56 | std::vector ret = (i == -1 ? c0 : c[i]); 57 | for (auto my_h : get_h(i)) ret.push_back(my_h); 58 | return ret; 59 | } 60 | 61 | void copy(const RNNBuilder & params) override; 62 | 63 | /** 64 | * \brief Set the dropout rates to a unique value 65 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 
66 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 67 | */ 68 | void set_dropout(float d); 69 | /** 70 | * \brief Set the dropout rates 71 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 72 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 73 | * The dynamics of the cell are then modified to : 74 | * 75 | * \f$ 76 | * \begin{split} 77 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 78 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 79 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 80 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 81 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 82 | h_t & = \tanh(c_t)\circ o_t\\ 83 | \end{split} 84 | * \f$ 85 | * 86 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 87 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 88 | * \param d_h Dropout rate \f$d_h\f$ for the output \f$h_t\f$ 89 | */ 90 | void set_dropout(float d, float d_r); 91 | /** 92 | * \brief Set all dropout rates to 0 93 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 94 | * 95 | */ 96 | void disable_dropout(); 97 | /** 98 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 99 | * \details If this function is not called on batched input, the same mask will be applied across 100 | * all batch elements. Use this to apply different masks to each batch element 101 | * 102 | * \param batch_size Batch size 103 | */ 104 | void set_dropout_masks(unsigned batch_size = 1); 105 | /** 106 | * \brief Get parameters in ResidualDilatedLSTMBuilder 107 | * \return list of points to ParameterStorage objects 108 | */ 109 | ParameterCollection & get_parameter_collection() override; 110 | protected: 111 | void new_graph_impl(ComputationGraph& cg, bool update) override; 112 | void start_new_sequence_impl(const std::vector& h0) override; 113 | Expression add_input_impl(int prev, const Expression& x) override; 114 | Expression set_h_impl(int prev, const std::vector& h_new) override; 115 | Expression set_s_impl(int prev, const std::vector& s_new) override; 116 | 117 | public: 118 | ParameterCollection local_model; 119 | // first index is layer, then ... 120 | std::vector> params; 121 | // first index is layer, then ... 122 | std::vector> ln_params; 123 | 124 | // first index is layer, then ... 125 | std::vector> param_vars; 126 | // first index is layer, then ... 127 | std::vector> ln_param_vars; 128 | 129 | // first index is layer, then ... 
130 | std::vector> masks; 131 | 132 | // first index is time, second is layer 133 | std::vector> h, c; 134 | 135 | // initial values of h and c at each layer 136 | // - both default to zero matrix input 137 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 138 | std::vector h0; 139 | std::vector c0; 140 | unsigned layers; 141 | unsigned input_dim, hid; 142 | float dropout_rate_h; 143 | bool ln_lstm; 144 | float forget_bias; 145 | bool dropout_masks_valid; 146 | vector dilations; //one int per layer 147 | 148 | private: 149 | ComputationGraph* _cg; // Pointer to current cg 150 | 151 | }; 152 | 153 | 154 | struct DilatedLSTMBuilder : public RNNBuilder { 155 | /** 156 | * @brief Default Constructor 157 | */ 158 | DilatedLSTMBuilder(); 159 | /** 160 | * \brief Constructor for the DilatedLSTMBuilder 161 | * 162 | * \param dilations Vector of dilations 163 | * \param input_dim Dimention of the input \f$x_t\f$ 164 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 165 | * \param model ParameterCollection holding the parameters 166 | */ 167 | explicit DilatedLSTMBuilder(vector dilations, 168 | unsigned input_dim, 169 | unsigned hidden_dim, 170 | ParameterCollection& model); 171 | 172 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 173 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 174 | std::vector final_s() const override { 175 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 176 | for (auto my_h : final_h()) ret.push_back(my_h); 177 | return ret; 178 | } 179 | unsigned num_h0_components() const override { return 2 * layers; } 180 | 181 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 182 | std::vector get_s(RNNPointer i) const override { 183 | std::vector ret = (i == -1 ? c0 : c[i]); 184 | for (auto my_h : get_h(i)) ret.push_back(my_h); 185 | return ret; 186 | } 187 | 188 | void copy(const RNNBuilder & params) override; 189 | 190 | /** 191 | * \brief Set the dropout rates to a unique value 192 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 193 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 194 | */ 195 | void set_dropout(float d); 196 | /** 197 | * \brief Set the dropout rates 198 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 199 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
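 * (On naming: in the declaration below the second argument is called `d_r`; it is the recurrent
 * dropout rate applied to the hidden state, written \f$d_h\f$ in the equations that follow and
 * stored in the member `dropout_rate_h`.)
 *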
200 | * The dynamics of the cell are then modified to : 201 | * 202 | * \f$ 203 | * \begin{split} 204 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 205 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 206 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 207 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 208 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 209 | h_t & = \tanh(c_t)\circ o_t\\ 210 | \end{split} 211 | * \f$ 212 | * 213 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 214 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 215 | */ 216 | void set_dropout(float d, float d_r); 217 | /** 218 | * \brief Set all dropout rates to 0 219 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 220 | * 221 | */ 222 | void disable_dropout(); 223 | /** 224 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 225 | * \details If this function is not called on batched input, the same mask will be applied across 226 | * all batch elements. Use this to apply different masks to each batch element 227 | * 228 | * \param batch_size Batch size 229 | */ 230 | void set_dropout_masks(unsigned batch_size = 1); 231 | 232 | void set_weightnoise(float std); 233 | ParameterCollection & get_parameter_collection() override; 234 | protected: 235 | void new_graph_impl(ComputationGraph& cg, bool update) override; 236 | void start_new_sequence_impl(const std::vector& h0) override; 237 | Expression add_input_impl(int prev, const Expression& x) override; 238 | Expression set_h_impl(int prev, const std::vector& h_new) override; 239 | Expression set_s_impl(int prev, const std::vector& s_new) override; 240 | 241 | public: 242 | ParameterCollection local_model; 243 | // first index is layer, then ... 244 | std::vector> params; 245 | 246 | // first index is layer, then ... 247 | std::vector> param_vars; 248 | 249 | // first index is layer, then ... 
250 | std::vector> masks; 251 | 252 | // first index is time, second is layer 253 | std::vector> h, c; 254 | 255 | // initial values of h and c at each layer 256 | // - both default to zero matrix input 257 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 258 | std::vector h0; 259 | std::vector c0; 260 | unsigned layers; 261 | unsigned input_dim, hid; 262 | float dropout_rate_h; 263 | float weightnoise_std; 264 | vector dilations; //one int per layer 265 | 266 | bool dropout_masks_valid; 267 | private: 268 | ComputationGraph* _cg; // Pointer to current cg 269 | 270 | }; 271 | 272 | 273 | struct AttentiveDilatedLSTMBuilder : public RNNBuilder { 274 | /** 275 | * @brief Default Constructor 276 | */ 277 | AttentiveDilatedLSTMBuilder(); 278 | /** 279 | * \brief Constructor for the AttentiveDilatedLSTMBuilder 280 | * 281 | * \param max_dilations Vector, maximum dilations (per layer) 282 | * \param input_dim Dimention of the input \f$x_t\f$ 283 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 284 | * \param model ParameterCollection holding the parameters 285 | */ 286 | explicit AttentiveDilatedLSTMBuilder(vector max_dilations, 287 | unsigned input_dim, 288 | unsigned hidden_dim, 289 | unsigned attention_dim, 290 | ParameterCollection& model); 291 | 292 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 293 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 294 | std::vector final_s() const override { 295 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 296 | for (auto my_h : final_h()) ret.push_back(my_h); 297 | return ret; 298 | } 299 | unsigned num_h0_components() const override { return 2 * layers; } 300 | 301 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 302 | std::vector get_s(RNNPointer i) const override { 303 | std::vector ret = (i == -1 ? c0 : c[i]); 304 | for (auto my_h : get_h(i)) ret.push_back(my_h); 305 | return ret; 306 | } 307 | 308 | void copy(const RNNBuilder & params) override; 309 | 310 | /** 311 | * \brief Set the dropout rates to a unique value 312 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 313 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 314 | */ 315 | void set_dropout(float d); 316 | /** 317 | * \brief Set the dropout rates 318 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 319 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
320 | * The dynamics of the cell are then modified to : 321 | * 322 | * \f$ 323 | * \begin{split} 324 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 325 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 326 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 327 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 328 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 329 | h_t & = \tanh(c_t)\circ o_t\\ 330 | \end{split} 331 | * \f$ 332 | * 333 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 334 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 335 | */ 336 | void set_dropout(float d, float d_r); 337 | /** 338 | * \brief Set all dropout rates to 0 339 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 340 | * 341 | */ 342 | void disable_dropout(); 343 | /** 344 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 345 | * \details If this function is not called on batched input, the same mask will be applied across 346 | * all batch elements. Use this to apply different masks to each batch element 347 | * 348 | * \param batch_size Batch size 349 | */ 350 | void set_dropout_masks(unsigned batch_size = 1); 351 | 352 | void set_weightnoise(float std); 353 | ParameterCollection & get_parameter_collection() override; 354 | protected: 355 | void new_graph_impl(ComputationGraph& cg, bool update) override; 356 | void start_new_sequence_impl(const std::vector& h0) override; 357 | Expression add_input_impl(int prev, const Expression& x) override; 358 | Expression set_h_impl(int prev, const std::vector& h_new) override; 359 | Expression set_s_impl(int prev, const std::vector& s_new) override; 360 | 361 | public: 362 | ParameterCollection local_model; 363 | // first index is layer, then ... 364 | std::vector> params; 365 | 366 | // first index is layer, then ... 367 | std::vector> param_vars; 368 | 369 | // first index is layer, then ... 
370 | std::vector> masks; 371 | 372 | // first index is time, second is layer 373 | std::vector> h, c; 374 | 375 | // initial values of h and c at each layer 376 | // - both default to zero matrix input 377 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 378 | std::vector h0; 379 | std::vector c0; 380 | unsigned layers; 381 | unsigned input_dim, hid; 382 | unsigned attention_dim; 383 | float dropout_rate_h; 384 | float weightnoise_std; 385 | vector max_dilations; //one int per layer 386 | 387 | bool dropout_masks_valid; 388 | private: 389 | ComputationGraph* _cg; // Pointer to current cg 390 | 391 | }; 392 | } // namespace dynet 393 | 394 | #endif 395 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M42/M42.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | RelWithDebug 22 | Win32 23 | 24 | 25 | RelWithDebug 26 | x64 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | {A16B5466-E680-43F6-A884-A4A01EB78E50} 38 | Win32Proj 39 | M42 40 | 8.1 41 | 42 | 43 | 44 | Application 45 | true 46 | v140 47 | Unicode 48 | 49 | 50 | Application 51 | true 52 | v140 53 | Unicode 54 | 55 | 56 | Application 57 | false 58 | v140 59 | true 60 | Unicode 61 | 62 | 63 | Application 64 | true 65 | v140 66 | Unicode 67 | Sequential 68 | 69 | 70 | Application 71 | true 72 | v140 73 | Unicode 74 | Sequential 75 | 76 | 77 | Application 78 | false 79 | v140 80 | true 81 | Unicode 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | true 109 | 110 | 111 | true 112 | 113 | 114 | true 115 | 116 | 117 | true 118 | 119 | 120 | false 121 | 122 | 123 | false 124 | 125 | 126 | 127 | 128 | 129 | Level3 130 | Disabled 131 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 132 | 133 | 134 | Console 135 | true 136 | 137 | 138 | 139 | 140 | 141 | 142 | Level3 143 | Disabled 144 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 145 | 146 | 147 | Console 148 | true 149 | 150 | 151 | 152 | 153 | NotUsing 154 | Level1 155 | Disabled 156 | WIN32;_WINDOWS;EIGEN_USE_MKL_ALL;EIGEN_FAST_MATH;NOMINMAX;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 157 | E:\progs2\dynet;E:\progs\Eigen; 158 | 159 | 160 | Console 161 | true 162 | E:\progs2\dynet\buildMKL\dynet\Debug 163 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 164 | 165 | 166 | 167 | 168 | NotUsing 169 | Level1 170 | MaxSpeed 171 | WIN32;_WINDOWS;EIGEN_FAST_MATH;EIGEN_USE_MKL_ALL;NOMINMAX;NDEBUG_;CONSOLE;%(PreprocessorDefinitions) 172 | E:\progs2\dynet;E:\progs\Eigen; 173 | AnySuitable 174 | true 175 | Speed 176 | AdvancedVectorExtensions 177 | Default 178 | MultiThreadedDLL 179 | ProgramDatabase 180 | true 181 | false 182 | 183 | 184 | Console 185 | true 186 | E:\progs2\dynet\buildMKL\dynet\RelWithDebInfo 187 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 188 | 189 | 190 | 191 | 192 | Level3 193 | 194 | 195 | MaxSpeed 196 | true 197 | true 198 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 199 | 200 | 201 | Console 202 | true 203 | true 204 | true 205 | 206 | 207 | 208 | 209 | 
Level3 210 | 211 | 212 | MaxSpeed 213 | true 214 | true 215 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 216 | 217 | 218 | Console 219 | true 220 | true 221 | true 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M42/M42.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | 26 | 27 | Header Files 28 | 29 | 30 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M42/slstm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * file slstm.h 3 | * header for my implementation of dilated LSTMs, based on Dynet LSTM builders 4 | - DilatedLSTMBuilder - standard Dilated LSTM (https://papers.nips.cc/paper/6613-dilated-recurrent-neural-networks.pdf) 5 | - ResidualDilatedLSTMBuilder - Dilated LSTM with special Residual shortcuts, after https://arxiv.org/abs/1701.03360 6 | - AttentiveDilatedLSTMBuilder - Dilated LSTM with Attention mechanism, as in the second stage of https://arxiv.org/abs/1704.02971 7 | * 8 | Slawek Smyl, Mar-May 2018 9 | */ 10 | 11 | #ifndef DYNET_SLSTMS_H_ 12 | #define DYNET_SLSTMS_H_ 13 | 14 | #include "dynet/dynet.h" 15 | #include "dynet/rnn.h" 16 | #include "dynet/expr.h" 17 | 18 | using namespace std; 19 | 20 | namespace dynet { 21 | 22 | //basd on VanillaLSTMBuilder 23 | struct ResidualDilatedLSTMBuilder : public RNNBuilder { 24 | /** 25 | * @brief Default Constructor 26 | */ 27 | ResidualDilatedLSTMBuilder(); 28 | /** 29 | * \brief Constructor for the ResidualDilatedLSTMBuilder 30 | * 31 | * \param dilations Vector of dilations 32 | * \param input_dim Dimention of the input \f$x_t\f$ 33 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 34 | * \param model ParameterCollection holding the parameters 35 | * \param ln_lstm Whether to use layer normalization 36 | * \param forget_bias value(float) to use as bias for the forget gate(default = 1.0) 37 | */ 38 | explicit ResidualDilatedLSTMBuilder(vector dilations, 39 | unsigned input_dim, 40 | unsigned hidden_dim, 41 | ParameterCollection& model, 42 | bool ln_lstm = false, 43 | float forget_bias = 1.f); 44 | 45 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 46 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 47 | std::vector final_s() const override { 48 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 49 | for (auto my_h : final_h()) ret.push_back(my_h); 50 | return ret; 51 | } 52 | unsigned num_h0_components() const override { return 2 * layers; } 53 | 54 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 55 | std::vector get_s(RNNPointer i) const override { 56 | std::vector ret = (i == -1 ? 
c0 : c[i]); 57 | for (auto my_h : get_h(i)) ret.push_back(my_h); 58 | return ret; 59 | } 60 | 61 | void copy(const RNNBuilder & params) override; 62 | 63 | /** 64 | * \brief Set the dropout rates to a unique value 65 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 66 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 67 | */ 68 | void set_dropout(float d); 69 | /** 70 | * \brief Set the dropout rates 71 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 72 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 73 | * The dynamics of the cell are then modified to : 74 | * 75 | * \f$ 76 | * \begin{split} 77 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 78 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 79 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 80 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 81 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 82 | h_t & = \tanh(c_t)\circ o_t\\ 83 | \end{split} 84 | * \f$ 85 | * 86 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 87 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 88 | * \param d_h Dropout rate \f$d_h\f$ for the output \f$h_t\f$ 89 | */ 90 | void set_dropout(float d, float d_r); 91 | /** 92 | * \brief Set all dropout rates to 0 93 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 94 | * 95 | */ 96 | void disable_dropout(); 97 | /** 98 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 99 | * \details If this function is not called on batched input, the same mask will be applied across 100 | * all batch elements. Use this to apply different masks to each batch element 101 | * 102 | * \param batch_size Batch size 103 | */ 104 | void set_dropout_masks(unsigned batch_size = 1); 105 | /** 106 | * \brief Get parameters in ResidualDilatedLSTMBuilder 107 | * \return list of points to ParameterStorage objects 108 | */ 109 | ParameterCollection & get_parameter_collection() override; 110 | protected: 111 | void new_graph_impl(ComputationGraph& cg, bool update) override; 112 | void start_new_sequence_impl(const std::vector& h0) override; 113 | Expression add_input_impl(int prev, const Expression& x) override; 114 | Expression set_h_impl(int prev, const std::vector& h_new) override; 115 | Expression set_s_impl(int prev, const std::vector& s_new) override; 116 | 117 | public: 118 | ParameterCollection local_model; 119 | // first index is layer, then ... 120 | std::vector> params; 121 | // first index is layer, then ... 122 | std::vector> ln_params; 123 | 124 | // first index is layer, then ... 125 | std::vector> param_vars; 126 | // first index is layer, then ... 127 | std::vector> ln_param_vars; 128 | 129 | // first index is layer, then ... 
130 | std::vector> masks; 131 | 132 | // first index is time, second is layer 133 | std::vector> h, c; 134 | 135 | // initial values of h and c at each layer 136 | // - both default to zero matrix input 137 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 138 | std::vector h0; 139 | std::vector c0; 140 | unsigned layers; 141 | unsigned input_dim, hid; 142 | float dropout_rate_h; 143 | bool ln_lstm; 144 | float forget_bias; 145 | bool dropout_masks_valid; 146 | vector dilations; //one int per layer 147 | 148 | private: 149 | ComputationGraph* _cg; // Pointer to current cg 150 | 151 | }; 152 | 153 | 154 | struct DilatedLSTMBuilder : public RNNBuilder { 155 | /** 156 | * @brief Default Constructor 157 | */ 158 | DilatedLSTMBuilder(); 159 | /** 160 | * \brief Constructor for the DilatedLSTMBuilder 161 | * 162 | * \param dilations Vector of dilations 163 | * \param input_dim Dimention of the input \f$x_t\f$ 164 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 165 | * \param model ParameterCollection holding the parameters 166 | */ 167 | explicit DilatedLSTMBuilder(vector dilations, 168 | unsigned input_dim, 169 | unsigned hidden_dim, 170 | ParameterCollection& model); 171 | 172 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 173 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 174 | std::vector final_s() const override { 175 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 176 | for (auto my_h : final_h()) ret.push_back(my_h); 177 | return ret; 178 | } 179 | unsigned num_h0_components() const override { return 2 * layers; } 180 | 181 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 182 | std::vector get_s(RNNPointer i) const override { 183 | std::vector ret = (i == -1 ? c0 : c[i]); 184 | for (auto my_h : get_h(i)) ret.push_back(my_h); 185 | return ret; 186 | } 187 | 188 | void copy(const RNNBuilder & params) override; 189 | 190 | /** 191 | * \brief Set the dropout rates to a unique value 192 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 193 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 194 | */ 195 | void set_dropout(float d); 196 | /** 197 | * \brief Set the dropout rates 198 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 199 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
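 * (These rates are only meaningful during training; for validation or forecasting one would
 * typically call `disable_dropout()` -- equivalently `set_dropout(0.f)` -- before unrolling.)
 *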
200 | * The dynamics of the cell are then modified to : 201 | * 202 | * \f$ 203 | * \begin{split} 204 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 205 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 206 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 207 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 208 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 209 | h_t & = \tanh(c_t)\circ o_t\\ 210 | \end{split} 211 | * \f$ 212 | * 213 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 214 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 215 | */ 216 | void set_dropout(float d, float d_r); 217 | /** 218 | * \brief Set all dropout rates to 0 219 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 220 | * 221 | */ 222 | void disable_dropout(); 223 | /** 224 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 225 | * \details If this function is not called on batched input, the same mask will be applied across 226 | * all batch elements. Use this to apply different masks to each batch element 227 | * 228 | * \param batch_size Batch size 229 | */ 230 | void set_dropout_masks(unsigned batch_size = 1); 231 | 232 | void set_weightnoise(float std); 233 | ParameterCollection & get_parameter_collection() override; 234 | protected: 235 | void new_graph_impl(ComputationGraph& cg, bool update) override; 236 | void start_new_sequence_impl(const std::vector& h0) override; 237 | Expression add_input_impl(int prev, const Expression& x) override; 238 | Expression set_h_impl(int prev, const std::vector& h_new) override; 239 | Expression set_s_impl(int prev, const std::vector& s_new) override; 240 | 241 | public: 242 | ParameterCollection local_model; 243 | // first index is layer, then ... 244 | std::vector> params; 245 | 246 | // first index is layer, then ... 247 | std::vector> param_vars; 248 | 249 | // first index is layer, then ... 
250 | std::vector> masks; 251 | 252 | // first index is time, second is layer 253 | std::vector> h, c; 254 | 255 | // initial values of h and c at each layer 256 | // - both default to zero matrix input 257 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 258 | std::vector h0; 259 | std::vector c0; 260 | unsigned layers; 261 | unsigned input_dim, hid; 262 | float dropout_rate_h; 263 | float weightnoise_std; 264 | vector dilations; //one int per layer 265 | 266 | bool dropout_masks_valid; 267 | private: 268 | ComputationGraph* _cg; // Pointer to current cg 269 | 270 | }; 271 | 272 | 273 | struct AttentiveDilatedLSTMBuilder : public RNNBuilder { 274 | /** 275 | * @brief Default Constructor 276 | */ 277 | AttentiveDilatedLSTMBuilder(); 278 | /** 279 | * \brief Constructor for the AttentiveDilatedLSTMBuilder 280 | * 281 | * \param max_dilations Vector, maximum dilations (per layer) 282 | * \param input_dim Dimention of the input \f$x_t\f$ 283 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 284 | * \param model ParameterCollection holding the parameters 285 | */ 286 | explicit AttentiveDilatedLSTMBuilder(vector max_dilations, 287 | unsigned input_dim, 288 | unsigned hidden_dim, 289 | unsigned attention_dim, 290 | ParameterCollection& model); 291 | 292 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 293 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 294 | std::vector final_s() const override { 295 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 296 | for (auto my_h : final_h()) ret.push_back(my_h); 297 | return ret; 298 | } 299 | unsigned num_h0_components() const override { return 2 * layers; } 300 | 301 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 302 | std::vector get_s(RNNPointer i) const override { 303 | std::vector ret = (i == -1 ? c0 : c[i]); 304 | for (auto my_h : get_h(i)) ret.push_back(my_h); 305 | return ret; 306 | } 307 | 308 | void copy(const RNNBuilder & params) override; 309 | 310 | /** 311 | * \brief Set the dropout rates to a unique value 312 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 313 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 314 | */ 315 | void set_dropout(float d); 316 | /** 317 | * \brief Set the dropout rates 318 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 319 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
320 | * The dynamics of the cell are then modified to : 321 | * 322 | * \f$ 323 | * \begin{split} 324 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 325 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 326 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 327 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 328 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 329 | h_t & = \tanh(c_t)\circ o_t\\ 330 | \end{split} 331 | * \f$ 332 | * 333 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 334 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 335 | */ 336 | void set_dropout(float d, float d_r); 337 | /** 338 | * \brief Set all dropout rates to 0 339 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 340 | * 341 | */ 342 | void disable_dropout(); 343 | /** 344 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 345 | * \details If this function is not called on batched input, the same mask will be applied across 346 | * all batch elements. Use this to apply different masks to each batch element 347 | * 348 | * \param batch_size Batch size 349 | */ 350 | void set_dropout_masks(unsigned batch_size = 1); 351 | 352 | void set_weightnoise(float std); 353 | ParameterCollection & get_parameter_collection() override; 354 | protected: 355 | void new_graph_impl(ComputationGraph& cg, bool update) override; 356 | void start_new_sequence_impl(const std::vector& h0) override; 357 | Expression add_input_impl(int prev, const Expression& x) override; 358 | Expression set_h_impl(int prev, const std::vector& h_new) override; 359 | Expression set_s_impl(int prev, const std::vector& s_new) override; 360 | 361 | public: 362 | ParameterCollection local_model; 363 | // first index is layer, then ... 364 | std::vector> params; 365 | 366 | // first index is layer, then ... 367 | std::vector> param_vars; 368 | 369 | // first index is layer, then ... 
370 | std::vector> masks; 371 | 372 | // first index is time, second is layer 373 | std::vector> h, c; 374 | 375 | // initial values of h and c at each layer 376 | // - both default to zero matrix input 377 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 378 | std::vector h0; 379 | std::vector c0; 380 | unsigned layers; 381 | unsigned input_dim, hid; 382 | unsigned attention_dim; 383 | float dropout_rate_h; 384 | float weightnoise_std; 385 | vector max_dilations; //one int per layer 386 | 387 | bool dropout_masks_valid; 388 | private: 389 | ComputationGraph* _cg; // Pointer to current cg 390 | 391 | }; 392 | } // namespace dynet 393 | 394 | #endif 395 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M43/M43.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | Source Files 23 | 24 | 25 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M43/M43.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | RelWithDebug 22 | Win32 23 | 24 | 25 | RelWithDebug 26 | x64 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | {BE951571-3F3A-4048-BAA3-0C05F38CFF42} 38 | Win32Proj 39 | M43 40 | 8.1 41 | 42 | 43 | 44 | Application 45 | true 46 | v140 47 | Unicode 48 | 49 | 50 | Application 51 | true 52 | v140 53 | Unicode 54 | 55 | 56 | Application 57 | false 58 | v140 59 | true 60 | Unicode 61 | 62 | 63 | Application 64 | true 65 | v140 66 | Unicode 67 | Sequential 68 | 69 | 70 | Application 71 | true 72 | v140 73 | Unicode 74 | Sequential 75 | 76 | 77 | Application 78 | false 79 | v140 80 | true 81 | Unicode 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | true 109 | 110 | 111 | true 112 | 113 | 114 | true 115 | 116 | 117 | true 118 | 119 | 120 | false 121 | 122 | 123 | false 124 | 125 | 126 | 127 | 128 | 129 | Level3 130 | Disabled 131 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 132 | 133 | 134 | Console 135 | true 136 | 137 | 138 | 139 | 140 | 141 | 142 | Level3 143 | Disabled 144 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 145 | 146 | 147 | Console 148 | true 149 | 150 | 151 | 152 | 153 | NotUsing 154 | Level1 155 | Disabled 156 | WIN32;_WINDOWS;EIGEN_USE_MKL_ALL;EIGEN_FAST_MATH;NOMINMAX;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 157 | E:\progs2\dynet;E:\progs\Eigen; 158 | 159 | 160 | Console 161 | true 162 | E:\progs2\dynet\buildMKL\dynet\Debug 163 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 164 | 165 | 166 | 167 | 168 | NotUsing 169 | Level1 170 | MaxSpeed 171 | WIN32;_WINDOWS;EIGEN_FAST_MATH;EIGEN_USE_MKL_ALL;NOMINMAX;NDEBUG_;CONSOLE;%(PreprocessorDefinitions) 172 | E:\progs2\dynet;E:\progs\Eigen; 173 | AnySuitable 174 | true 175 | Speed 176 | 
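<!-- "AdvancedVectorExtensions" corresponds to the MSVC /arch:AVX code-generation option, so this
     optimized configuration assumes an AVX-capable CPU; change or remove it on older hardware. -->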
AdvancedVectorExtensions 177 | Default 178 | MultiThreadedDLL 179 | ProgramDatabase 180 | true 181 | false 182 | 183 | 184 | Console 185 | true 186 | E:\progs2\dynet\buildMKL\dynet\RelWithDebInfo 187 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 188 | 189 | 190 | 191 | 192 | Level3 193 | 194 | 195 | MaxSpeed 196 | true 197 | true 198 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 199 | 200 | 201 | Console 202 | true 203 | true 204 | true 205 | 206 | 207 | 208 | 209 | Level3 210 | 211 | 212 | MaxSpeed 213 | true 214 | true 215 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 216 | 217 | 218 | Console 219 | true 220 | true 221 | true 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M43/slstm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * file slstm.h 3 | * header for my implementation of dilated LSTMs, based on Dynet LSTM builders 4 | - DilatedLSTMBuilder - standard Dilated LSTM (https://papers.nips.cc/paper/6613-dilated-recurrent-neural-networks.pdf) 5 | - ResidualDilatedLSTMBuilder - Dilated LSTM with special Residual shortcuts, after https://arxiv.org/abs/1701.03360 6 | - AttentiveDilatedLSTMBuilder - Dilated LSTM with Attention mechanism, as in the second stage of https://arxiv.org/abs/1704.02971 7 | * 8 | Slawek Smyl, Mar-May 2018 9 | */ 10 | 11 | #ifndef DYNET_SLSTMS_H_ 12 | #define DYNET_SLSTMS_H_ 13 | 14 | #include "dynet/dynet.h" 15 | #include "dynet/rnn.h" 16 | #include "dynet/expr.h" 17 | 18 | using namespace std; 19 | 20 | namespace dynet { 21 | 22 | //basd on VanillaLSTMBuilder 23 | struct ResidualDilatedLSTMBuilder : public RNNBuilder { 24 | /** 25 | * @brief Default Constructor 26 | */ 27 | ResidualDilatedLSTMBuilder(); 28 | /** 29 | * \brief Constructor for the ResidualDilatedLSTMBuilder 30 | * 31 | * \param dilations Vector of dilations 32 | * \param input_dim Dimention of the input \f$x_t\f$ 33 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 34 | * \param model ParameterCollection holding the parameters 35 | * \param ln_lstm Whether to use layer normalization 36 | * \param forget_bias value(float) to use as bias for the forget gate(default = 1.0) 37 | */ 38 | explicit ResidualDilatedLSTMBuilder(vector dilations, 39 | unsigned input_dim, 40 | unsigned hidden_dim, 41 | ParameterCollection& model, 42 | bool ln_lstm = false, 43 | float forget_bias = 1.f); 44 | 45 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 46 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 47 | std::vector final_s() const override { 48 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 49 | for (auto my_h : final_h()) ret.push_back(my_h); 50 | return ret; 51 | } 52 | unsigned num_h0_components() const override { return 2 * layers; } 53 | 54 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 55 | std::vector get_s(RNNPointer i) const override { 56 | std::vector ret = (i == -1 ? 
c0 : c[i]); 57 | for (auto my_h : get_h(i)) ret.push_back(my_h); 58 | return ret; 59 | } 60 | 61 | void copy(const RNNBuilder & params) override; 62 | 63 | /** 64 | * \brief Set the dropout rates to a unique value 65 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 66 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 67 | */ 68 | void set_dropout(float d); 69 | /** 70 | * \brief Set the dropout rates 71 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 72 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 73 | * The dynamics of the cell are then modified to : 74 | * 75 | * \f$ 76 | * \begin{split} 77 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 78 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 79 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 80 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 81 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 82 | h_t & = \tanh(c_t)\circ o_t\\ 83 | \end{split} 84 | * \f$ 85 | * 86 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 87 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 88 | * \param d_h Dropout rate \f$d_h\f$ for the output \f$h_t\f$ 89 | */ 90 | void set_dropout(float d, float d_r); 91 | /** 92 | * \brief Set all dropout rates to 0 93 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 94 | * 95 | */ 96 | void disable_dropout(); 97 | /** 98 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 99 | * \details If this function is not called on batched input, the same mask will be applied across 100 | * all batch elements. Use this to apply different masks to each batch element 101 | * 102 | * \param batch_size Batch size 103 | */ 104 | void set_dropout_masks(unsigned batch_size = 1); 105 | /** 106 | * \brief Get parameters in ResidualDilatedLSTMBuilder 107 | * \return list of points to ParameterStorage objects 108 | */ 109 | ParameterCollection & get_parameter_collection() override; 110 | protected: 111 | void new_graph_impl(ComputationGraph& cg, bool update) override; 112 | void start_new_sequence_impl(const std::vector& h0) override; 113 | Expression add_input_impl(int prev, const Expression& x) override; 114 | Expression set_h_impl(int prev, const std::vector& h_new) override; 115 | Expression set_s_impl(int prev, const std::vector& s_new) override; 116 | 117 | public: 118 | ParameterCollection local_model; 119 | // first index is layer, then ... 120 | std::vector> params; 121 | // first index is layer, then ... 122 | std::vector> ln_params; 123 | 124 | // first index is layer, then ... 125 | std::vector> param_vars; 126 | // first index is layer, then ... 127 | std::vector> ln_param_vars; 128 | 129 | // first index is layer, then ... 
130 | std::vector> masks; 131 | 132 | // first index is time, second is layer 133 | std::vector> h, c; 134 | 135 | // initial values of h and c at each layer 136 | // - both default to zero matrix input 137 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 138 | std::vector h0; 139 | std::vector c0; 140 | unsigned layers; 141 | unsigned input_dim, hid; 142 | float dropout_rate_h; 143 | bool ln_lstm; 144 | float forget_bias; 145 | bool dropout_masks_valid; 146 | vector dilations; //one int per layer 147 | 148 | private: 149 | ComputationGraph* _cg; // Pointer to current cg 150 | 151 | }; 152 | 153 | 154 | struct DilatedLSTMBuilder : public RNNBuilder { 155 | /** 156 | * @brief Default Constructor 157 | */ 158 | DilatedLSTMBuilder(); 159 | /** 160 | * \brief Constructor for the DilatedLSTMBuilder 161 | * 162 | * \param dilations Vector of dilations 163 | * \param input_dim Dimention of the input \f$x_t\f$ 164 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 165 | * \param model ParameterCollection holding the parameters 166 | */ 167 | explicit DilatedLSTMBuilder(vector dilations, 168 | unsigned input_dim, 169 | unsigned hidden_dim, 170 | ParameterCollection& model); 171 | 172 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 173 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 174 | std::vector final_s() const override { 175 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 176 | for (auto my_h : final_h()) ret.push_back(my_h); 177 | return ret; 178 | } 179 | unsigned num_h0_components() const override { return 2 * layers; } 180 | 181 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 182 | std::vector get_s(RNNPointer i) const override { 183 | std::vector ret = (i == -1 ? c0 : c[i]); 184 | for (auto my_h : get_h(i)) ret.push_back(my_h); 185 | return ret; 186 | } 187 | 188 | void copy(const RNNBuilder & params) override; 189 | 190 | /** 191 | * \brief Set the dropout rates to a unique value 192 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 193 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 194 | */ 195 | void set_dropout(float d); 196 | /** 197 | * \brief Set the dropout rates 198 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 199 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
200 | * The dynamics of the cell are then modified to : 201 | * 202 | * \f$ 203 | * \begin{split} 204 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 205 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 206 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 207 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 208 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 209 | h_t & = \tanh(c_t)\circ o_t\\ 210 | \end{split} 211 | * \f$ 212 | * 213 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 214 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 215 | */ 216 | void set_dropout(float d, float d_r); 217 | /** 218 | * \brief Set all dropout rates to 0 219 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 220 | * 221 | */ 222 | void disable_dropout(); 223 | /** 224 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 225 | * \details If this function is not called on batched input, the same mask will be applied across 226 | * all batch elements. Use this to apply different masks to each batch element 227 | * 228 | * \param batch_size Batch size 229 | */ 230 | void set_dropout_masks(unsigned batch_size = 1); 231 | 232 | void set_weightnoise(float std); 233 | ParameterCollection & get_parameter_collection() override; 234 | protected: 235 | void new_graph_impl(ComputationGraph& cg, bool update) override; 236 | void start_new_sequence_impl(const std::vector& h0) override; 237 | Expression add_input_impl(int prev, const Expression& x) override; 238 | Expression set_h_impl(int prev, const std::vector& h_new) override; 239 | Expression set_s_impl(int prev, const std::vector& s_new) override; 240 | 241 | public: 242 | ParameterCollection local_model; 243 | // first index is layer, then ... 244 | std::vector> params; 245 | 246 | // first index is layer, then ... 247 | std::vector> param_vars; 248 | 249 | // first index is layer, then ... 
250 | std::vector> masks; 251 | 252 | // first index is time, second is layer 253 | std::vector> h, c; 254 | 255 | // initial values of h and c at each layer 256 | // - both default to zero matrix input 257 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 258 | std::vector h0; 259 | std::vector c0; 260 | unsigned layers; 261 | unsigned input_dim, hid; 262 | float dropout_rate_h; 263 | float weightnoise_std; 264 | vector dilations; //one int per layer 265 | 266 | bool dropout_masks_valid; 267 | private: 268 | ComputationGraph* _cg; // Pointer to current cg 269 | 270 | }; 271 | 272 | 273 | struct AttentiveDilatedLSTMBuilder : public RNNBuilder { 274 | /** 275 | * @brief Default Constructor 276 | */ 277 | AttentiveDilatedLSTMBuilder(); 278 | /** 279 | * \brief Constructor for the AttentiveDilatedLSTMBuilder 280 | * 281 | * \param max_dilations Vector, maximum dilations (per layer) 282 | * \param input_dim Dimention of the input \f$x_t\f$ 283 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 284 | * \param model ParameterCollection holding the parameters 285 | */ 286 | explicit AttentiveDilatedLSTMBuilder(vector max_dilations, 287 | unsigned input_dim, 288 | unsigned hidden_dim, 289 | unsigned attention_dim, 290 | ParameterCollection& model); 291 | 292 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 293 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 294 | std::vector final_s() const override { 295 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 296 | for (auto my_h : final_h()) ret.push_back(my_h); 297 | return ret; 298 | } 299 | unsigned num_h0_components() const override { return 2 * layers; } 300 | 301 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 302 | std::vector get_s(RNNPointer i) const override { 303 | std::vector ret = (i == -1 ? c0 : c[i]); 304 | for (auto my_h : get_h(i)) ret.push_back(my_h); 305 | return ret; 306 | } 307 | 308 | void copy(const RNNBuilder & params) override; 309 | 310 | /** 311 | * \brief Set the dropout rates to a unique value 312 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 313 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 314 | */ 315 | void set_dropout(float d); 316 | /** 317 | * \brief Set the dropout rates 318 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 319 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
320 | * The dynamics of the cell are then modified to : 321 | * 322 | * \f$ 323 | * \begin{split} 324 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 325 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 326 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 327 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 328 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 329 | h_t & = \tanh(c_t)\circ o_t\\ 330 | \end{split} 331 | * \f$ 332 | * 333 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 334 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 335 | */ 336 | void set_dropout(float d, float d_r); 337 | /** 338 | * \brief Set all dropout rates to 0 339 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 340 | * 341 | */ 342 | void disable_dropout(); 343 | /** 344 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 345 | * \details If this function is not called on batched input, the same mask will be applied across 346 | * all batch elements. Use this to apply different masks to each batch element 347 | * 348 | * \param batch_size Batch size 349 | */ 350 | void set_dropout_masks(unsigned batch_size = 1); 351 | 352 | void set_weightnoise(float std); 353 | ParameterCollection & get_parameter_collection() override; 354 | protected: 355 | void new_graph_impl(ComputationGraph& cg, bool update) override; 356 | void start_new_sequence_impl(const std::vector& h0) override; 357 | Expression add_input_impl(int prev, const Expression& x) override; 358 | Expression set_h_impl(int prev, const std::vector& h_new) override; 359 | Expression set_s_impl(int prev, const std::vector& s_new) override; 360 | 361 | public: 362 | ParameterCollection local_model; 363 | // first index is layer, then ... 364 | std::vector> params; 365 | 366 | // first index is layer, then ... 367 | std::vector> param_vars; 368 | 369 | // first index is layer, then ... 
370 | std::vector> masks; 371 | 372 | // first index is time, second is layer 373 | std::vector> h, c; 374 | 375 | // initial values of h and c at each layer 376 | // - both default to zero matrix input 377 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 378 | std::vector h0; 379 | std::vector c0; 380 | unsigned layers; 381 | unsigned input_dim, hid; 382 | unsigned attention_dim; 383 | float dropout_rate_h; 384 | float weightnoise_std; 385 | vector max_dilations; //one int per layer 386 | 387 | bool dropout_masks_valid; 388 | private: 389 | ComputationGraph* _cg; // Pointer to current cg 390 | 391 | }; 392 | } // namespace dynet 393 | 394 | #endif 395 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M44/M44.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | Source Files 23 | 24 | 25 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M44/M44.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | RelWithDebug 22 | Win32 23 | 24 | 25 | RelWithDebug 26 | x64 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | {7A192E0C-8F58-4D65-998E-3A7010AB5F87} 38 | Win32Proj 39 | M44 40 | 8.1 41 | 42 | 43 | 44 | Application 45 | true 46 | v140 47 | Unicode 48 | 49 | 50 | Application 51 | true 52 | v140 53 | Unicode 54 | 55 | 56 | Application 57 | false 58 | v140 59 | true 60 | Unicode 61 | 62 | 63 | Application 64 | true 65 | v140 66 | Unicode 67 | Sequential 68 | 69 | 70 | Application 71 | true 72 | v140 73 | Unicode 74 | Sequential 75 | 76 | 77 | Application 78 | false 79 | v140 80 | true 81 | Unicode 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | true 109 | 110 | 111 | true 112 | 113 | 114 | true 115 | 116 | 117 | true 118 | 119 | 120 | false 121 | 122 | 123 | false 124 | 125 | 126 | 127 | 128 | 129 | Level3 130 | Disabled 131 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 132 | 133 | 134 | Console 135 | true 136 | 137 | 138 | 139 | 140 | 141 | 142 | Level3 143 | Disabled 144 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 145 | 146 | 147 | Console 148 | true 149 | 150 | 151 | 152 | 153 | NotUsing 154 | Level1 155 | Disabled 156 | WIN32;_WINDOWS;EIGEN_USE_MKL_ALL;EIGEN_FAST_MATH;NOMINMAX;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 157 | E:\progs2\dynet;E:\progs\Eigen; 158 | 159 | 160 | Console 161 | true 162 | E:\progs2\dynet\buildMKL\dynet\Debug 163 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 164 | 165 | 166 | 167 | 168 | NotUsing 169 | Level1 170 | MaxSpeed 171 | WIN32;_WINDOWS;EIGEN_FAST_MATH;EIGEN_USE_MKL_ALL;NOMINMAX;NDEBUG_;CONSOLE;%(PreprocessorDefinitions) 172 | E:\progs2\dynet;E:\progs\Eigen; 173 | AnySuitable 174 | true 175 | Speed 176 | 
AdvancedVectorExtensions 177 | Default 178 | MultiThreadedDLL 179 | ProgramDatabase 180 | true 181 | false 182 | 183 | 184 | Console 185 | true 186 | E:\progs2\dynet\buildMKL\dynet\RelWithDebInfo 187 | dynet.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 188 | 189 | 190 | 191 | 192 | Level3 193 | 194 | 195 | MaxSpeed 196 | true 197 | true 198 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 199 | 200 | 201 | Console 202 | true 203 | true 204 | true 205 | 206 | 207 | 208 | 209 | Level3 210 | 211 | 212 | MaxSpeed 213 | true 214 | true 215 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 216 | 217 | 218 | Console 219 | true 220 | true 221 | true 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/M44/slstm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * file slstm.h 3 | * header for my implementation of dilated LSTMs, based on Dynet LSTM builders 4 | - DilatedLSTMBuilder - standard Dilated LSTM (https://papers.nips.cc/paper/6613-dilated-recurrent-neural-networks.pdf) 5 | - ResidualDilatedLSTMBuilder - Dilated LSTM with special Residual shortcuts, after https://arxiv.org/abs/1701.03360 6 | - AttentiveDilatedLSTMBuilder - Dilated LSTM with Attention mechanism, as in the second stage of https://arxiv.org/abs/1704.02971 7 | * 8 | Slawek Smyl, Mar-May 2018 9 | */ 10 | 11 | #ifndef DYNET_SLSTMS_H_ 12 | #define DYNET_SLSTMS_H_ 13 | 14 | #include "dynet/dynet.h" 15 | #include "dynet/rnn.h" 16 | #include "dynet/expr.h" 17 | 18 | using namespace std; 19 | 20 | namespace dynet { 21 | 22 | //basd on VanillaLSTMBuilder 23 | struct ResidualDilatedLSTMBuilder : public RNNBuilder { 24 | /** 25 | * @brief Default Constructor 26 | */ 27 | ResidualDilatedLSTMBuilder(); 28 | /** 29 | * \brief Constructor for the ResidualDilatedLSTMBuilder 30 | * 31 | * \param dilations Vector of dilations 32 | * \param input_dim Dimention of the input \f$x_t\f$ 33 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 34 | * \param model ParameterCollection holding the parameters 35 | * \param ln_lstm Whether to use layer normalization 36 | * \param forget_bias value(float) to use as bias for the forget gate(default = 1.0) 37 | */ 38 | explicit ResidualDilatedLSTMBuilder(vector dilations, 39 | unsigned input_dim, 40 | unsigned hidden_dim, 41 | ParameterCollection& model, 42 | bool ln_lstm = false, 43 | float forget_bias = 1.f); 44 | 45 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 46 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 47 | std::vector final_s() const override { 48 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 49 | for (auto my_h : final_h()) ret.push_back(my_h); 50 | return ret; 51 | } 52 | unsigned num_h0_components() const override { return 2 * layers; } 53 | 54 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 55 | std::vector get_s(RNNPointer i) const override { 56 | std::vector ret = (i == -1 ? 
c0 : c[i]); 57 | for (auto my_h : get_h(i)) ret.push_back(my_h); 58 | return ret; 59 | } 60 | 61 | void copy(const RNNBuilder & params) override; 62 | 63 | /** 64 | * \brief Set the dropout rates to a unique value 65 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 66 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 67 | */ 68 | void set_dropout(float d); 69 | /** 70 | * \brief Set the dropout rates 71 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 72 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 73 | * The dynamics of the cell are then modified to : 74 | * 75 | * \f$ 76 | * \begin{split} 77 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 78 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 79 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 80 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 81 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 82 | h_t & = \tanh(c_t)\circ o_t\\ 83 | \end{split} 84 | * \f$ 85 | * 86 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 87 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 88 | * \param d_h Dropout rate \f$d_h\f$ for the output \f$h_t\f$ 89 | */ 90 | void set_dropout(float d, float d_r); 91 | /** 92 | * \brief Set all dropout rates to 0 93 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 94 | * 95 | */ 96 | void disable_dropout(); 97 | /** 98 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 99 | * \details If this function is not called on batched input, the same mask will be applied across 100 | * all batch elements. Use this to apply different masks to each batch element 101 | * 102 | * \param batch_size Batch size 103 | */ 104 | void set_dropout_masks(unsigned batch_size = 1); 105 | /** 106 | * \brief Get parameters in ResidualDilatedLSTMBuilder 107 | * \return list of points to ParameterStorage objects 108 | */ 109 | ParameterCollection & get_parameter_collection() override; 110 | protected: 111 | void new_graph_impl(ComputationGraph& cg, bool update) override; 112 | void start_new_sequence_impl(const std::vector& h0) override; 113 | Expression add_input_impl(int prev, const Expression& x) override; 114 | Expression set_h_impl(int prev, const std::vector& h_new) override; 115 | Expression set_s_impl(int prev, const std::vector& s_new) override; 116 | 117 | public: 118 | ParameterCollection local_model; 119 | // first index is layer, then ... 120 | std::vector> params; 121 | // first index is layer, then ... 122 | std::vector> ln_params; 123 | 124 | // first index is layer, then ... 125 | std::vector> param_vars; 126 | // first index is layer, then ... 127 | std::vector> ln_param_vars; 128 | 129 | // first index is layer, then ... 
130 | std::vector> masks; 131 | 132 | // first index is time, second is layer 133 | std::vector> h, c; 134 | 135 | // initial values of h and c at each layer 136 | // - both default to zero matrix input 137 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 138 | std::vector h0; 139 | std::vector c0; 140 | unsigned layers; 141 | unsigned input_dim, hid; 142 | float dropout_rate_h; 143 | bool ln_lstm; 144 | float forget_bias; 145 | bool dropout_masks_valid; 146 | vector dilations; //one int per layer 147 | 148 | private: 149 | ComputationGraph* _cg; // Pointer to current cg 150 | 151 | }; 152 | 153 | 154 | struct DilatedLSTMBuilder : public RNNBuilder { 155 | /** 156 | * @brief Default Constructor 157 | */ 158 | DilatedLSTMBuilder(); 159 | /** 160 | * \brief Constructor for the DilatedLSTMBuilder 161 | * 162 | * \param dilations Vector of dilations 163 | * \param input_dim Dimention of the input \f$x_t\f$ 164 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 165 | * \param model ParameterCollection holding the parameters 166 | */ 167 | explicit DilatedLSTMBuilder(vector dilations, 168 | unsigned input_dim, 169 | unsigned hidden_dim, 170 | ParameterCollection& model); 171 | 172 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 173 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 174 | std::vector final_s() const override { 175 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 176 | for (auto my_h : final_h()) ret.push_back(my_h); 177 | return ret; 178 | } 179 | unsigned num_h0_components() const override { return 2 * layers; } 180 | 181 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 182 | std::vector get_s(RNNPointer i) const override { 183 | std::vector ret = (i == -1 ? c0 : c[i]); 184 | for (auto my_h : get_h(i)) ret.push_back(my_h); 185 | return ret; 186 | } 187 | 188 | void copy(const RNNBuilder & params) override; 189 | 190 | /** 191 | * \brief Set the dropout rates to a unique value 192 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 193 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 194 | */ 195 | void set_dropout(float d); 196 | /** 197 | * \brief Set the dropout rates 198 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 199 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
200 | * The dynamics of the cell are then modified to : 201 | * 202 | * \f$ 203 | * \begin{split} 204 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 205 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 206 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 207 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 208 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 209 | h_t & = \tanh(c_t)\circ o_t\\ 210 | \end{split} 211 | * \f$ 212 | * 213 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 214 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 215 | */ 216 | void set_dropout(float d, float d_r); 217 | /** 218 | * \brief Set all dropout rates to 0 219 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 220 | * 221 | */ 222 | void disable_dropout(); 223 | /** 224 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 225 | * \details If this function is not called on batched input, the same mask will be applied across 226 | * all batch elements. Use this to apply different masks to each batch element 227 | * 228 | * \param batch_size Batch size 229 | */ 230 | void set_dropout_masks(unsigned batch_size = 1); 231 | 232 | void set_weightnoise(float std); 233 | ParameterCollection & get_parameter_collection() override; 234 | protected: 235 | void new_graph_impl(ComputationGraph& cg, bool update) override; 236 | void start_new_sequence_impl(const std::vector& h0) override; 237 | Expression add_input_impl(int prev, const Expression& x) override; 238 | Expression set_h_impl(int prev, const std::vector& h_new) override; 239 | Expression set_s_impl(int prev, const std::vector& s_new) override; 240 | 241 | public: 242 | ParameterCollection local_model; 243 | // first index is layer, then ... 244 | std::vector> params; 245 | 246 | // first index is layer, then ... 247 | std::vector> param_vars; 248 | 249 | // first index is layer, then ... 
250 | std::vector> masks; 251 | 252 | // first index is time, second is layer 253 | std::vector> h, c; 254 | 255 | // initial values of h and c at each layer 256 | // - both default to zero matrix input 257 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 258 | std::vector h0; 259 | std::vector c0; 260 | unsigned layers; 261 | unsigned input_dim, hid; 262 | float dropout_rate_h; 263 | float weightnoise_std; 264 | vector dilations; //one int per layer 265 | 266 | bool dropout_masks_valid; 267 | private: 268 | ComputationGraph* _cg; // Pointer to current cg 269 | 270 | }; 271 | 272 | 273 | struct AttentiveDilatedLSTMBuilder : public RNNBuilder { 274 | /** 275 | * @brief Default Constructor 276 | */ 277 | AttentiveDilatedLSTMBuilder(); 278 | /** 279 | * \brief Constructor for the AttentiveDilatedLSTMBuilder 280 | * 281 | * \param max_dilations Vector, maximum dilations (per layer) 282 | * \param input_dim Dimention of the input \f$x_t\f$ 283 | * \param hidden_dim Dimention of the hidden states \f$h_t\f$ and \f$c_t\f$ 284 | * \param model ParameterCollection holding the parameters 285 | */ 286 | explicit AttentiveDilatedLSTMBuilder(vector max_dilations, 287 | unsigned input_dim, 288 | unsigned hidden_dim, 289 | unsigned attention_dim, 290 | ParameterCollection& model); 291 | 292 | Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); } 293 | std::vector final_h() const override { return (h.size() == 0 ? h0 : h.back()); } 294 | std::vector final_s() const override { 295 | std::vector ret = (c.size() == 0 ? c0 : c.back()); 296 | for (auto my_h : final_h()) ret.push_back(my_h); 297 | return ret; 298 | } 299 | unsigned num_h0_components() const override { return 2 * layers; } 300 | 301 | std::vector get_h(RNNPointer i) const override { return (i == -1 ? h0 : h[i]); } 302 | std::vector get_s(RNNPointer i) const override { 303 | std::vector ret = (i == -1 ? c0 : c[i]); 304 | for (auto my_h : get_h(i)) ret.push_back(my_h); 305 | return ret; 306 | } 307 | 308 | void copy(const RNNBuilder & params) override; 309 | 310 | /** 311 | * \brief Set the dropout rates to a unique value 312 | * \details This has the same effect as `set_dropout(d,d_h)` except that all the dropout rates are set to the same value. 313 | * \param d Dropout rate to be applied on all of \f$x,h\f$ 314 | */ 315 | void set_dropout(float d); 316 | /** 317 | * \brief Set the dropout rates 318 | * \details The dropout implemented here is the variational dropout with tied weights introduced in [Gal, 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) 319 | * More specifically, dropout masks \f$\mathbf{z_x}\sim \mathrm{Bernoulli}(1-d_x)\f$,\f$\mathbf{z_h}\sim \mathrm{Bernoulli}(1-d_h)\f$ are sampled at the start of each sequence. 
320 | * The dynamics of the cell are then modified to : 321 | * 322 | * \f$ 323 | * \begin{split} 324 | i_t & =\sigma(W_{ix}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ih}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_i)\\ 325 | f_t & = \sigma(W_{fx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{fh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_f)\\ 326 | o_t & = \sigma(W_{ox}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{oh}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_o)\\ 327 | \tilde{c_t} & = \tanh(W_{cx}(\frac 1 {1-d_x}\mathbf{z_x} \circ x_t)+W_{ch}(\frac 1 {1-d_h}\mathbf{z_h} \circ h_{t-1})+b_c)\\ 328 | c_t & = c_{t-1}\circ f_t + \tilde{c_t}\circ i_t\\ 329 | h_t & = \tanh(c_t)\circ o_t\\ 330 | \end{split} 331 | * \f$ 332 | * 333 | * For more detail as to why scaling is applied, see the "Unorthodox" section of the documentation 334 | * \param d Dropout rate \f$d_x\f$ for the input \f$x_t\f$ 335 | */ 336 | void set_dropout(float d, float d_r); 337 | /** 338 | * \brief Set all dropout rates to 0 339 | * \details This is equivalent to `set_dropout(0)` or `set_dropout(0,0,0)` 340 | * 341 | */ 342 | void disable_dropout(); 343 | /** 344 | * \brief Set dropout masks at the beginning of a sequence for a specific batch size 345 | * \details If this function is not called on batched input, the same mask will be applied across 346 | * all batch elements. Use this to apply different masks to each batch element 347 | * 348 | * \param batch_size Batch size 349 | */ 350 | void set_dropout_masks(unsigned batch_size = 1); 351 | 352 | void set_weightnoise(float std); 353 | ParameterCollection & get_parameter_collection() override; 354 | protected: 355 | void new_graph_impl(ComputationGraph& cg, bool update) override; 356 | void start_new_sequence_impl(const std::vector<Expression>& h0) override; 357 | Expression add_input_impl(int prev, const Expression& x) override; 358 | Expression set_h_impl(int prev, const std::vector<Expression>& h_new) override; 359 | Expression set_s_impl(int prev, const std::vector<Expression>& s_new) override; 360 | 361 | public: 362 | ParameterCollection local_model; 363 | // first index is layer, then ... 364 | std::vector<std::vector<Parameter>> params; 365 | 366 | // first index is layer, then ... 367 | std::vector<std::vector<Expression>> param_vars; 368 | 369 | // first index is layer, then ... 370 | std::vector<std::vector<Expression>> masks; 371 | 372 | // first index is time, second is layer 373 | std::vector<std::vector<Expression>> h, c; 374 | 375 | // initial values of h and c at each layer 376 | // - both default to zero matrix input 377 | bool has_initial_state; // if this is false, treat h0 and c0 as 0 378 | std::vector<Expression> h0; 379 | std::vector<Expression> c0; 380 | unsigned layers; 381 | unsigned input_dim, hid; 382 | unsigned attention_dim; 383 | float dropout_rate_h; 384 | float weightnoise_std; 385 | vector<unsigned> max_dilations; //one int per layer 386 | 387 | bool dropout_masks_valid; 388 | private: 389 | ComputationGraph* _cg; // Pointer to current cg 390 | 391 | }; 392 | } // namespace dynet 393 | 394 | #endif 395 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/readme.txt: -------------------------------------------------------------------------------- 1 | This is a Visual Studio 15 solution, with 4 projects, one for each .cc file. 2 | Two targets are defined: Debug and RelWithDebug, which is Release with debug info; the latter is the one I normally used. 3 | You will need to update the include and link paths to point to your installation of Dynet.
4 | In the x64\RelWithDebug directory you will find two example scripts for running the executables 5 | in conjunction with one program started interactively inside VS. -------------------------------------------------------------------------------- /c++/windows_VisualStudio/x64/RelWithDebug/readme.txt: -------------------------------------------------------------------------------- 1 | These are example run scripts. They are meant to be run on a 6-core computer and assume that the program 2 | M41.exe has been started interactively in Visual Studio, so they add 5 processes. 3 | run61.cmd should be run for ES_RNN and ES_RNN_PI, so for the Monthly and Quarterly series, 4 | although for Monthly you probably want to use a computer with more cores, unless you are fine with waiting a week or so :-) 5 | run61_e.cmd is for ES_RNN_E and ES_RNN_E_PI, so for all other cases. -------------------------------------------------------------------------------- /c++/windows_VisualStudio/x64/RelWithDebug/run61.cmd: -------------------------------------------------------------------------------- 1 | start M41 10 2 2 | start M41 11 1 5 3 | start M41 11 2 5 4 | start M41 12 1 10 5 | start M41 12 2 10 6 | -------------------------------------------------------------------------------- /c++/windows_VisualStudio/x64/RelWithDebug/run61_e.cmd: -------------------------------------------------------------------------------- 1 | start M41 5 2 | start M41 10 3 | start M41 15 4 | start M41 20 5 | start M41 25 6 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- 1 | ES-RNN programs, related scripts, and docs. 2 | M4 Forecasting Competition, 2018 3 | Slawek Smyl, Uber. 4 | 5 | The programs are in C++ and use Dynet - a Dynamic Graph NN system (https://github.com/clab/dynet) 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /sql/createM72nn_SQLServer.sql: -------------------------------------------------------------------------------- 1 | USE [slawek] 2 | GO 3 | 4 | /****** Object: Table [dbo].[M72nn] Script Date: 6/2/2018 9:37:26 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | SET ANSI_PADDING ON 12 | GO 13 | 14 | CREATE TABLE [dbo].[M72nn]( 15 | [run] [varchar](164) NOT NULL, 16 | [LBack] [smallint] NOT NULL, 17 | [iBig] [smallint] NOT NULL, 18 | [series] [varchar](20) NOT NULL, 19 | [epoch] [smallint] NOT NULL, 20 | [actual1] [real] NULL, 21 | [forec1] [real] NULL, 22 | [actual2] [real] NULL, 23 | [forec2] [real] NULL, 24 | [actual3] [real] NULL, 25 | [forec3] [real] NULL, 26 | [actual4] [real] NULL, 27 | [forec4] [real] NULL, 28 | [actual5] [real] NULL, 29 | [forec5] [real] NULL, 30 | [actual6] [real] NULL, 31 | [forec6] [real] NULL, 32 | [actual7] [real] NULL, 33 | [forec7] [real] NULL, 34 | [actual8] [real] NULL, 35 | [forec8] [real] NULL, 36 | [actual9] [real] NULL, 37 | [forec9] [real] NULL, 38 | [actual10] [real] NULL, 39 | [forec10] [real] NULL, 40 | [actual11] [real] NULL, 41 | [forec11] [real] NULL, 42 | [actual12] [real] NULL, 43 | [forec12] [real] NULL, 44 | [actual13] [real] NULL, 45 | [forec13] [real] NULL, 46 | [actual14] [real] NULL, 47 | [forec14] [real] NULL, 48 | [actual15] [real] NULL, 49 | [forec15] [real] NULL, 50 | [actual16] [real] NULL, 51 | [forec16] [real] NULL, 52 | [actual17] [real] NULL, 53 | [forec17] [real] NULL, 54 | [actual18] [real] NULL, 55 | [forec18] [real] NULL, 56 | [actual19] [real] NULL, 57 |
[forec19] [real] NULL, 58 | [actual20] [real] NULL, 59 | [forec20] [real] NULL, 60 | [actual21] [real] NULL, 61 | [forec21] [real] NULL, 62 | [actual22] [real] NULL, 63 | [forec22] [real] NULL, 64 | [actual23] [real] NULL, 65 | [forec23] [real] NULL, 66 | [actual24] [real] NULL, 67 | [forec24] [real] NULL, 68 | [actual25] [real] NULL, 69 | [forec25] [real] NULL, 70 | [actual26] [real] NULL, 71 | [forec26] [real] NULL, 72 | [actual27] [real] NULL, 73 | [forec27] [real] NULL, 74 | [actual28] [real] NULL, 75 | [forec28] [real] NULL, 76 | [actual29] [real] NULL, 77 | [forec29] [real] NULL, 78 | [actual30] [real] NULL, 79 | [forec30] [real] NULL, 80 | [actual31] [real] NULL, 81 | [forec31] [real] NULL, 82 | [actual32] [real] NULL, 83 | [forec32] [real] NULL, 84 | [actual33] [real] NULL, 85 | [forec33] [real] NULL, 86 | [actual34] [real] NULL, 87 | [forec34] [real] NULL, 88 | [actual35] [real] NULL, 89 | [forec35] [real] NULL, 90 | [actual36] [real] NULL, 91 | [forec36] [real] NULL, 92 | [actual37] [real] NULL, 93 | [forec37] [real] NULL, 94 | [actual38] [real] NULL, 95 | [forec38] [real] NULL, 96 | [actual39] [real] NULL, 97 | [forec39] [real] NULL, 98 | [actual40] [real] NULL, 99 | [forec40] [real] NULL, 100 | [actual41] [real] NULL, 101 | [forec41] [real] NULL, 102 | [actual42] [real] NULL, 103 | [forec42] [real] NULL, 104 | [actual43] [real] NULL, 105 | [forec43] [real] NULL, 106 | [actual44] [real] NULL, 107 | [forec44] [real] NULL, 108 | [actual45] [real] NULL, 109 | [forec45] [real] NULL, 110 | [actual46] [real] NULL, 111 | [forec46] [real] NULL, 112 | [actual47] [real] NULL, 113 | [forec47] [real] NULL, 114 | [actual48] [real] NULL, 115 | [forec48] [real] NULL, 116 | [trainingError] [real] NULL, 117 | [variable] [varchar](20) NOT NULL, 118 | [n] [smallint] NOT NULL, 119 | [dateTimeOfPrediction] [datetime] NOT NULL, 120 | CONSTRAINT [M72nn_pk] PRIMARY KEY CLUSTERED 121 | ( 122 | [run] ASC, 123 | [LBack] ASC, 124 | [iBig] ASC, 125 | [series] ASC, 126 | [epoch] ASC 127 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] 128 | ) ON [PRIMARY] 129 | 130 | GO 131 | 132 | SET ANSI_PADDING OFF 133 | GO 134 | 135 | 136 | -------------------------------------------------------------------------------- /sql/createM72nn_mysql.txt: -------------------------------------------------------------------------------- 1 | CREATE TABLE M72nn( 2 | run varchar(160) NOT NULL, 3 | LBack smallint NOT NULL, 4 | iBig smallint NOT NULL, 5 | series varchar(20) NOT NULL, 6 | epoch smallint NOT NULL, 7 | actual1 float NULL, 8 | forec1 float NULL, 9 | actual2 float NULL, 10 | forec2 float NULL, 11 | actual3 float NULL, 12 | forec3 float NULL, 13 | actual4 float NULL, 14 | forec4 float NULL, 15 | actual5 float NULL, 16 | forec5 float NULL, 17 | actual6 float NULL, 18 | forec6 float NULL, 19 | actual7 float NULL, 20 | forec7 float NULL, 21 | actual8 float NULL, 22 | forec8 float NULL, 23 | actual9 float NULL, 24 | forec9 float NULL, 25 | actual10 float NULL, 26 | forec10 float NULL, 27 | actual11 float NULL, 28 | forec11 float NULL, 29 | actual12 float NULL, 30 | forec12 float NULL, 31 | actual13 float NULL, 32 | forec13 float NULL, 33 | actual14 float NULL, 34 | forec14 float NULL, 35 | actual15 float NULL, 36 | forec15 float NULL, 37 | actual16 float NULL, 38 | forec16 float NULL, 39 | actual17 float NULL, 40 | forec17 float NULL, 41 | actual18 float NULL, 42 | forec18 float NULL, 43 | trainingError float NULL, 44 | variable varchar(20) 
NOT NULL, 45 | n smallint NOT NULL, 46 | dateTimeOfPrediction datetime NOT NULL, 47 | CONSTRAINT M72nn_pk PRIMARY KEY CLUSTERED 48 | ( 49 | run ASC, 50 | LBack ASC, 51 | iBig ASC, 52 | series ASC, 53 | epoch ASC)); 54 | 55 | -------------------------------------------------------------------------------- /sql/readme.txt: -------------------------------------------------------------------------------- 1 | I provide just two example table-creation scripts, one for SQL Server and one for MySQL. 2 | The MySQL table is limited to an output vector of 18 values, so it would not work for the Hourly runs (whose horizon is 48). 3 | Anyway, starting to use the database is a large investment of time: apart from installation, you also need to create auxiliary tables with MASE, and a lot of queries. 4 | I do not have time to do all of it here, and I suspect there will be little interest in ODBC, so this is all you get :-) 5 | --------------------------------------------------------------------------------
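Example usage of the dilated LSTM builders (illustrative sketch, not part of the repository). The snippet below shows how the DilatedLSTMBuilder declared in slstm.h might be driven from a small stand-alone Dynet program. The dilations {1, 3, 9}, the dimensions, the sequence length, and the input values are made-up examples, and the element type of the dilation vector (unsigned) is inferred, since the header listings above lost their template arguments; the real network assembly and training loops live in the accompanying .cc programs.

// Illustrative only - a minimal Dynet driver for DilatedLSTMBuilder from slstm.h.
// Assumptions: the dilation vector holds unsigned values; dimensions, sequence
// length, and input values are dummies, not the settings used in the M4 runs.
#include "dynet/dynet.h"
#include "dynet/expr.h"
#include "slstm.h"
#include <vector>

using namespace dynet;

int main(int argc, char** argv) {
  dynet::initialize(argc, argv);                 // standard Dynet start-up

  ParameterCollection model;
  const unsigned input_dim = 4, hidden_dim = 8;
  std::vector<unsigned> dilations = {1, 3, 9};   // one dilation per stacked layer

  DilatedLSTMBuilder rnn(dilations, input_dim, hidden_dim, model);

  ComputationGraph cg;
  rnn.new_graph(cg);                             // attach the builder's parameters to this graph
  rnn.start_new_sequence();                      // h0 and c0 default to zero

  std::vector<float> x_vals(input_dim, 0.5f);    // dummy values for one time step
  Expression h_t;
  for (int t = 0; t < 12; ++t) {
    Expression x_t = input(cg, {input_dim}, x_vals);
    h_t = rnn.add_input(x_t);                    // top-layer hidden state at time t
  }
  // In the real programs h_t would feed further layers and a loss; here we just evaluate it.
  std::vector<float> h_out = as_vector(cg.forward(h_t));
  return 0;
}

The ResidualDilatedLSTMBuilder and AttentiveDilatedLSTMBuilder declared in the same header are driven the same way; only their constructors differ (optional layer normalization and forget-gate bias for the former, an additional attention dimension for the latter).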