├── README.md ├── codes ├── dpe │ ├── linelib.cpp │ ├── linelib.h │ ├── main.cpp │ ├── makefile │ └── run.sh ├── evaluation │ ├── cal_result.py │ ├── concat.cpp │ ├── final_score_wt.py │ ├── gen_cand_eval.cpp │ ├── gen_model.py │ ├── infer.cpp │ ├── liblinear │ │ ├── COPYRIGHT │ │ ├── Makefile │ │ ├── Makefile.win │ │ ├── README │ │ ├── blas │ │ │ ├── Makefile │ │ │ ├── blas.h │ │ │ ├── blasp.h │ │ │ ├── daxpy.c │ │ │ ├── ddot.c │ │ │ ├── dnrm2.c │ │ │ └── dscal.c │ │ ├── heart_scale │ │ ├── linear.cpp │ │ ├── linear.def │ │ ├── linear.h │ │ ├── matlab │ │ │ ├── Makefile │ │ │ ├── README │ │ │ ├── libsvmread.c │ │ │ ├── libsvmwrite.c │ │ │ ├── linear_model_matlab.c │ │ │ ├── linear_model_matlab.h │ │ │ ├── make.m │ │ │ ├── predict.c │ │ │ └── train.c │ │ ├── predict.c │ │ ├── python │ │ │ ├── Makefile │ │ │ ├── README │ │ │ ├── liblinear.py │ │ │ └── liblinearutil.py │ │ ├── train.c │ │ ├── tron.cpp │ │ ├── tron.h │ │ └── windows │ │ │ ├── liblinear.dll │ │ │ ├── libsvmread.mexw64 │ │ │ ├── libsvmwrite.mexw64 │ │ │ ├── predict.exe │ │ │ ├── predict.mexw64 │ │ │ ├── train.exe │ │ │ └── train.mexw64 │ ├── make.sh │ ├── pair2bow.cpp │ ├── run.sh │ ├── score_syn.py │ └── vocab.cpp └── preprocess │ ├── data2net.cpp │ ├── gen_index.py │ ├── gen_pattern.cpp │ ├── label.py │ ├── make.sh │ ├── netww2netwe.py │ ├── run.sh │ ├── vocab.cpp │ └── vocab.py ├── data ├── README.md └── label.set └── run.sh /README.md: -------------------------------------------------------------------------------- 1 | # DPE 2 | This is an implementation of the DPE model proposed in the KDD 2017 paper ["Automatic Synonym Discovery with Knowledge Bases"](https://arxiv.org/abs/1706.08186). 3 | 4 | Given a corpus and a knowledge base, DPE will automatically discover missing entity synonyms from the corpus. Specifically, DPE leverages the idea of distant supervision and collects existing entity synonyms in knowledge bases as training seeds. 
The collected seeds are then used to train the DPE model, which aims at predicting whether two strings are synonymous or not. DPE has two modules: the distributional module predicts synonym relation from the corpus-level statistics, while the pattern module considers local contexts for prediction. At the inference stage, both modules will collaborate to discover high-quality entity synonyms. 5 | 6 | We provide the codes for data preprocessing, model training and model evaluation in the "codes" folder. Also, we provide the Wiki-Freebase dataset in the "data" folder. 7 | 8 | ## Install 9 | Our codes rely on two external packages, which are the Eigen package and the GSL package. 10 | 11 | #### Eigen 12 | The [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) package is used for matrix and vector operations. To compile our codes, users need to download the package. 13 | 14 | #### GSL 15 | The [GSL](https://www.gnu.org/software/gsl/) package is used for random number generation. Users need to download and install the package. 16 | 17 | ## Compile 18 | After installing the two packages, users need to modify the package paths in "codes/dpe/makefile". Then users may go to every folder and use the makefile to compile the codes. 19 | 20 | ## Running 21 | To run the DPE model and evaluate it on the Wiki-Freebase dataset, users may directly use the example script (run.sh) we provide. By running this script, the program will first generate all the training data for DPE, such as the co-occurrence network of strings. Then it will learn the string embeddings as well as the distributional score function of the distributional module and the pattern classifier of the pattern module. Finally, the distributional module and the pattern module will mutually collaborate for synonym prediction. 
22 | 23 | Compiling, training and evaluating DPE on the Wiki-Freebase dataset: 24 | ``` 25 | ./run.sh 26 | ``` 27 | 28 | ## Contact: 29 | If you have any questions about the codes and data, please feel free to contact us. 30 | ``` 31 | Meng Qu, qumn123@gmail.com 32 | ``` 33 | 34 | ## Citation 35 | ``` 36 | @article{qu2017automatic, 37 | title={Automatic Synonym Discovery with Knowledge Bases}, 38 | author={Qu, Meng and Ren, Xiang and Han, Jiawei}, 39 | journal={arXiv preprint arXiv:1706.08186}, 40 | year={2017} 41 | } 42 | ``` 43 | -------------------------------------------------------------------------------- /codes/dpe/linelib.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define MAX_STRING 500 14 | #define EXP_TABLE_SIZE 1000 15 | #define MAX_EXP 6 16 | const int neg_table_size = 1e8; 17 | const int hash_table_size = 30000000; 18 | 19 | typedef float real; 20 | 21 | typedef Eigen::Matrix< real, Eigen::Dynamic, 22 | Eigen::Dynamic, Eigen::RowMajor | Eigen::AutoAlign > 23 | BLPMatrix; 24 | 25 | typedef Eigen::Matrix< real, 1, Eigen::Dynamic, 26 | Eigen::RowMajor | Eigen::AutoAlign > 27 | BLPVector; 28 | 29 | struct struct_node { 30 | char *word; 31 | }; 32 | 33 | struct hin_nb { 34 | int nb_id; 35 | double eg_wei; 36 | char eg_tp; 37 | }; 38 | 39 | struct triple 40 | { 41 | int h, r, t; 42 | friend bool operator < (triple t1, triple t2) 43 | { 44 | if (t1.h == t2.h) 45 | { 46 | if (t1.r == t2.r) return t1.t < t2.t; 47 | return t1.r < t2.r; 48 | } 49 | return t1.h < t2.h; 50 | } 51 | }; 52 | 53 | class sampler 54 | { 55 | long long n; 56 | long long *alias; 57 | double *prob; 58 | 59 | public: 60 | sampler(); 61 | ~sampler(); 62 | 63 | void init(long long ndata, double *p); 64 | long long draw(double ran1, double ran2); 65 | }; 66 | 67 | class line_node; 68 | class line_hin; 
69 | class line_adjacency; 70 | class line_trainer_line; 71 | class line_trainer_norm; 72 | class line_trainer_reg; 73 | class line_triple; 74 | class line_regularizer_norm; 75 | class line_regularizer_line; 76 | class line_trainer_feature; 77 | 78 | class line_node 79 | { 80 | protected: 81 | struct struct_node *node; 82 | int node_size, node_max_size, vector_size; 83 | char node_file[MAX_STRING]; 84 | int *node_hash; 85 | real *_vec; 86 | Eigen::Map vec; 87 | 88 | int get_hash(char *word); 89 | int add_node(char *word); 90 | public: 91 | 92 | line_node(); 93 | ~line_node(); 94 | 95 | friend class line_hin; 96 | friend class line_adjacency; 97 | friend class line_trainer_line; 98 | friend class line_trainer_norm; 99 | friend class line_trainer_reg; 100 | friend class line_triple; 101 | friend class line_regularizer_norm; 102 | friend class line_regularizer_line; 103 | 104 | friend class line_trainer_feature; 105 | 106 | void init(const char *file_name, int vector_dim); 107 | int search(char *word); 108 | void output(const char *file_name, int binary); 109 | int get_node_size(); 110 | real *get_vec(); 111 | 112 | //friend void linelib_output_batch(char *file_name, int binary, line_node **array_line_node, int cnt); 113 | }; 114 | 115 | class line_hin 116 | { 117 | protected: 118 | char hin_file[MAX_STRING]; 119 | 120 | line_node *node_u, *node_v; 121 | std::vector *hin; 122 | long long hin_size; 123 | 124 | public: 125 | line_hin(); 126 | ~line_hin(); 127 | 128 | friend class line_adjacency; 129 | friend class line_trainer_line; 130 | friend class line_trainer_norm; 131 | friend class line_trainer_reg; 132 | 133 | void init(const char *file_name, line_node *p_u, line_node *p_v, bool with_type = 1); 134 | }; 135 | 136 | class line_adjacency 137 | { 138 | protected: 139 | line_hin *phin; 140 | 141 | int adjmode; 142 | char edge_tp; 143 | 144 | double *u_wei; 145 | sampler smp_u; 146 | 147 | int *u_nb_cnt; int **u_nb_id; double **u_nb_wei; 148 | sampler *smp_u_nb; 149 
| 150 | int *v_nb_cnt; int **v_nb_id; double **v_nb_wei; 151 | sampler *smp_v_nb; 152 | 153 | public: 154 | line_adjacency(); 155 | ~line_adjacency(); 156 | 157 | friend class line_trainer_line; 158 | friend class line_trainer_norm; 159 | friend class line_trainer_reg; 160 | friend class line_regularizer_norm; 161 | friend class line_regularizer_line; 162 | 163 | void init(line_hin *p_hin, char edge_type, int mode); 164 | int sample(int u, double (*func_rand_num)()); 165 | int sample_head(double (*func_rand_num)()); 166 | }; 167 | 168 | class line_trainer_line 169 | { 170 | protected: 171 | line_hin *phin; 172 | 173 | int *u_nb_cnt; int **u_nb_id; double **u_nb_wei; 174 | double *u_wei, *v_wei; 175 | sampler smp_u, *smp_u_nb; 176 | real *expTable; 177 | int *neg_table; 178 | 179 | char edge_tp; 180 | 181 | void train_uv(int u, int v, real lr, int neg_samples, real *_error_vec, unsigned long long &rand_index); 182 | void train_uv_od3(int u, int v, real lr, int neg_samples, real *_error_vec, unsigned long long &rand_index); 183 | void train_uv_od3_attention(int u, int v, real lr, int neg_samples, real *_error_vec, unsigned long long &rand_index, real lr2, real *_para); 184 | public: 185 | line_trainer_line(); 186 | ~line_trainer_line(); 187 | 188 | void init(line_hin *p_hin, char edge_type); 189 | void copy_neg_table(line_trainer_line *p_trainer_line); 190 | void train_sample(real lr, int neg_samples, real *_error_vec, double (*func_rand_num)(), unsigned long long &rand_index); 191 | void train_sample_od3(real lr, int neg_samples, real *_error_vec, double (*func_rand_num)(), unsigned long long &rand_index); 192 | void train_sample_od3_attention(real lr, int neg_samples, real *_error_vec, double (*func_rand_num)(), unsigned long long &rand_index, real lr2, real *_para); 193 | void train_sample_depth(real lr, int neg_samples, real *_error_vec, double (*func_rand_num)(), unsigned long long &rand_index, int depth, line_adjacency *p_adjacency, char pst); 194 | }; 195 | 
196 | class line_trainer_norm 197 | { 198 | protected: 199 | line_hin *phin; 200 | 201 | int *u_nb_cnt; int **u_nb_id; double **u_nb_wei; 202 | double *u_wei, *v_wei; 203 | sampler smp_u, *smp_u_nb; 204 | 205 | char edge_tp; 206 | 207 | void train_uv(int u, int v, real lr, real margin, int dis_type, real *_error_vec, double randv); 208 | 209 | real *_para; 210 | Eigen::Map para; 211 | public: 212 | line_trainer_norm(); 213 | ~line_trainer_norm(); 214 | 215 | void init(line_hin *p_hin, char edge_type); 216 | void train_sample(real lr, real margin, int dis_type, real *_error_vec, double (*func_rand_num)()); 217 | void train_sample_depth(real lr, real margin, int dis_type, real *_error_vec, double (*func_rand_num)(), int depth, line_adjacency *p_adjacency, char pst); 218 | void output(const char *file_name, int binary); 219 | }; 220 | 221 | class line_trainer_reg 222 | { 223 | protected: 224 | line_hin *phin; 225 | 226 | int *u_nb_cnt; int **u_nb_id; double **u_nb_wei; 227 | double *u_wei, *v_wei; 228 | sampler smp_u, *smp_u_nb; 229 | 230 | char edge_tp; 231 | 232 | void train_uv(int u, int v, real lr); 233 | public: 234 | line_trainer_reg(); 235 | ~line_trainer_reg(); 236 | 237 | void init(line_hin *p_hin, char edge_type); 238 | void train_sample(real lr, double (*func_rand_num)()); 239 | void train_sample_depth(real lr, double (*func_rand_num)(), int depth, line_adjacency *p_adjacency, char pst); 240 | }; 241 | 242 | class line_triple 243 | { 244 | protected: 245 | line_node *node_h, *node_t, *node_r; 246 | long long triple_size; 247 | int *triple_h, *triple_t, *triple_r; 248 | char triple_file[MAX_STRING]; 249 | std::set appear; 250 | 251 | void train_ht(real lr, int dis_type, int h, int t, int r, int nh, int nt, int nr); 252 | 253 | public: 254 | line_triple(); 255 | ~line_triple(); 256 | 257 | void init(const char *file_name, line_node *p_h, line_node *p_t, line_node *p_r); 258 | void train_sample(real lr, real margin, int dis_type, double (*func_rand_num)()); 
259 | long long get_triple_size(); 260 | }; 261 | 262 | class line_regularizer_line 263 | { 264 | protected: 265 | line_node *node; 266 | real *expTable; 267 | 268 | void train_uv(real lr, int u, int v, int neg_samples, real *_error_vec, double (*func_rand_num)()); 269 | public: 270 | line_regularizer_line(); 271 | ~line_regularizer_line(); 272 | 273 | void init(line_node *p_node); 274 | void train_sample(real lr, int neg_samples, real *_error_vec, double (*func_rand_num)(), int depth, line_adjacency *p_adjacency); 275 | }; 276 | 277 | class line_regularizer_norm 278 | { 279 | protected: 280 | line_node *node; 281 | 282 | void train_uv(real lr, int dis_type, int u, int v); 283 | void train_uv_neg(real lr, int dis_type, int u, int v, int n); 284 | public: 285 | line_regularizer_norm(); 286 | ~line_regularizer_norm(); 287 | 288 | void init(line_node *p_node); 289 | void train_sample(real lr, int dis_type, double (*func_rand_num)(), int depth, line_adjacency *p_adjacency); 290 | void train_sample_neg(real lr, real margin, int dis_type, double (*func_rand_num)(), int depth, line_adjacency *p_adjacency); 291 | }; 292 | 293 | class line_trainer_feature 294 | { 295 | protected: 296 | char data_file[MAX_STRING]; 297 | line_node *lab, *fea; 298 | std::vector data_lab; 299 | std::vector< std::vector > data_fea; 300 | int data_size; 301 | 302 | void ReadWord(char *word, FILE *fin); 303 | void Softmax(BLPMatrix &mat); 304 | public: 305 | line_trainer_feature(); 306 | ~line_trainer_feature(); 307 | 308 | void init(const char *file_name, line_node *p_label, line_node *p_feature); 309 | void train_sample(real lr, double (*func_rand_num)()); 310 | }; 311 | 312 | 313 | 314 | 315 | 316 | -------------------------------------------------------------------------------- /codes/dpe/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "linelib.h" 9 
| 10 | char string_set[MAX_STRING], label_set[MAX_STRING], occur_net[MAX_STRING], syn_net[MAX_STRING], pattern[MAX_STRING], output_string[MAX_STRING], output_pat[MAX_STRING], output_dis[MAX_STRING]; 11 | int binary = 0, num_threads = 1, vector_size = 100, negative = 5; 12 | long long samples = 1, edge_count_actual; 13 | real alpha = 0.01, beta = 0.0001, starting_alpha; 14 | 15 | const gsl_rng_type * gsl_T; 16 | gsl_rng * gsl_r; 17 | 18 | line_node node_w, node_c, node_l; 19 | line_hin hin_wc, hin_ww_syn; 20 | line_trainer_line trainer_wc; 21 | line_trainer_norm trainer_ww_syn; 22 | line_trainer_feature trainer_pat; 23 | 24 | double func_rand_num() 25 | { 26 | return gsl_rng_uniform(gsl_r); 27 | } 28 | 29 | void *training_thread(void *id) 30 | { 31 | long long edge_count = 0, last_edge_count = 0; 32 | unsigned long long next_random = (long long)id; 33 | real *error_vec = (real *)calloc(vector_size, sizeof(real)); 34 | 35 | while (1) 36 | { 37 | //judge for exit 38 | if (edge_count > samples / num_threads + 2) break; 39 | 40 | if (edge_count - last_edge_count > 1000) 41 | { 42 | edge_count_actual += edge_count - last_edge_count; 43 | last_edge_count = edge_count; 44 | printf("%cAlpha: %f Progress: %.3lf%%", 13, alpha, (real)edge_count_actual / (real)(samples + 1) * 100); 45 | fflush(stdout); 46 | } 47 | 48 | for (int k = 0; k != 100; k++) trainer_wc.train_sample_od3(alpha, negative, error_vec, func_rand_num, next_random); 49 | trainer_ww_syn.train_sample(alpha, 1, 2, error_vec, func_rand_num); 50 | trainer_pat.train_sample(alpha, func_rand_num); 51 | 52 | edge_count += 102; 53 | } 54 | free(error_vec); 55 | pthread_exit(NULL); 56 | } 57 | 58 | void TrainModel() 59 | { 60 | long a; 61 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 62 | starting_alpha = alpha; 63 | 64 | gsl_rng_env_setup(); 65 | gsl_T = gsl_rng_rand48; 66 | gsl_r = gsl_rng_alloc(gsl_T); 67 | gsl_rng_set(gsl_r, 314159265); 68 | 69 | node_w.init(string_set, vector_size); 70 | 
node_c.init(string_set, vector_size); 71 | node_l.init(label_set, vector_size); 72 | 73 | hin_wc.init(occur_net, &node_w, &node_c, 0); 74 | hin_ww_syn.init(syn_net, &node_w, &node_w, 0); 75 | 76 | trainer_wc.init(&hin_wc, 0); 77 | trainer_ww_syn.init(&hin_ww_syn, 0); 78 | trainer_pat.init(pattern, &node_l, &node_w); 79 | 80 | clock_t start = clock(); 81 | printf("Training:"); 82 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, training_thread, (void *)a); 83 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 84 | printf("\n"); 85 | clock_t finish = clock(); 86 | printf("Total time: %lf\n", (double)(finish - start) / CLOCKS_PER_SEC); 87 | 88 | node_w.output(output_string, binary); 89 | node_l.output(output_pat, 0); 90 | trainer_ww_syn.output(output_dis, 0); 91 | } 92 | 93 | int ArgPos(char *str, int argc, char **argv) { 94 | int a; 95 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 96 | if (a == argc - 1) { 97 | printf("Argument missing for %s\n", str); 98 | exit(1); 99 | } 100 | return a; 101 | } 102 | return -1; 103 | } 104 | 105 | int main(int argc, char **argv) { 106 | int i; 107 | if (argc == 1) { 108 | printf("DPE\n\n"); 109 | printf("Options:\n"); 110 | printf("Parameters for training:\n"); 111 | printf("\t-string-set \n"); 112 | printf("\t\tA dictionary of all strings.\n"); 113 | printf("\t-label-set \n"); 114 | printf("\t\tThe set of all pattern labels.\n"); 115 | printf("\t-occur-net \n"); 116 | printf("\t\tThe co-occurren network between strings\n"); 117 | printf("\t-syn-net \n"); 118 | printf("\t\tThe set of all synonymous string pairs used in training.\n"); 119 | printf("\t-pattern \n"); 120 | printf("\t\tThe set of all training patterns.\n"); 121 | printf("\t-output-string \n"); 122 | printf("\t\tOutput file of the string embeddings.\n"); 123 | printf("\t-output-pat \n"); 124 | printf("\t\tOutput file of \n"); 125 | printf("\t-binary \n"); 126 | printf("\t\tSave the resulting vectors in binary moded; default is 0 
(off)\n"); 127 | printf("\t-size \n"); 128 | printf("\t\tSet size of word vectors; default is 100\n"); 129 | printf("\t-negative \n"); 130 | printf("\t\tNumber of negative examples; default is 5, common values are 5 - 10 (0 = not used)\n"); 131 | printf("\t-samples \n"); 132 | printf("\t\tSet the number of training samples as Million\n"); 133 | printf("\t-threads \n"); 134 | printf("\t\tUse threads (default 1)\n"); 135 | printf("\t-alpha \n"); 136 | printf("\t\tSet the starting learning rate; default is 0.025\n"); 137 | return 0; 138 | } 139 | if ((i = ArgPos((char *)"-string-set", argc, argv)) > 0) strcpy(string_set, argv[i + 1]); 140 | if ((i = ArgPos((char *)"-label-set", argc, argv)) > 0) strcpy(label_set, argv[i + 1]); 141 | if ((i = ArgPos((char *)"-occur-net", argc, argv)) > 0) strcpy(occur_net, argv[i + 1]); 142 | if ((i = ArgPos((char *)"-syn-net", argc, argv)) > 0) strcpy(syn_net, argv[i + 1]); 143 | if ((i = ArgPos((char *)"-pattern", argc, argv)) > 0) strcpy(pattern, argv[i + 1]); 144 | if ((i = ArgPos((char *)"-output-string", argc, argv)) > 0) strcpy(output_string, argv[i + 1]); 145 | if ((i = ArgPos((char *)"-output-pat", argc, argv)) > 0) strcpy(output_pat, argv[i + 1]); 146 | if ((i = ArgPos((char *)"-output-dis", argc, argv)) > 0) strcpy(output_dis, argv[i + 1]); 147 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 148 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) vector_size = atoi(argv[i + 1]); 149 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 150 | if ((i = ArgPos((char *)"-samples", argc, argv)) > 0) samples = (long long)(atof(argv[i + 1])*1000000); 151 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 152 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 153 | TrainModel(); 154 | return 0; 155 | } -------------------------------------------------------------------------------- 
/codes/dpe/makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CFLAGS = -pthread -O2 -march=native -Wall -funroll-loops -Wno-unused-result 3 | LFLAGS = -lgsl -lgslcblas -lm 4 | INCLUDES = -I/usr/include -I../../eigen-3.2.5 5 | LIBS = -L/usr/local/lib 6 | # CFLAGS and INCLUDES are used when compiling; LIBS and LFLAGS only when linking. 7 | 8 | dpe : linelib.o main.o 9 | $(CC) $(CFLAGS) -o dpe linelib.o main.o $(LIBS) $(LFLAGS) 10 | 11 | linelib.o : linelib.cpp linelib.h 12 | $(CC) $(CFLAGS) $(INCLUDES) -c linelib.cpp 13 | 14 | main.o : main.cpp linelib.h 15 | $(CC) $(CFLAGS) $(INCLUDES) -c main.cpp 16 | 17 | clean : 18 | rm -rf *.o dpe 19 | -------------------------------------------------------------------------------- /codes/dpe/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | string_set="../../data/string.set" 4 | occur_net="../../data/net.txt" 5 | syn_net="../../data/pairs.txt" 6 | pattern="../../data/pattern.txt" 7 | label_set="../../data/label.set" 8 | 9 | output_string="word.emb" 10 | output_pat="pat.txt" 11 | output_dis="dis.txt" 12 | 13 | size=100 14 | negative=5 15 | samples=10000 16 | alpha=0.01 17 | threads=30 18 | 19 | ./dpe -string-set ${string_set} -label-set ${label_set} -occur-net ${occur_net} -syn-net ${syn_net} -pattern ${pattern} -output-string ${output_string} -output-pat ${output_pat} -output-dis ${output_dis} -binary 1 -size ${size} -negative ${negative} -samples ${samples} -alpha ${alpha} -threads ${threads} 20 | -------------------------------------------------------------------------------- /codes/evaluation/cal_result.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | eval_file = sys.argv[1] 5 | score_file = sys.argv[2] 6 | k_max = int(sys.argv[3]) 7 | 8 | check = {} 9 | eid2train = {} 10 | eid2test = {} 11 | 12 | fi = open(eval_file, 'r') 13 | while True: 14 | eid = 
fi.readline() 15 | trains = fi.readline() 16 | tests = fi.readline() 17 | 18 | if not eid: 19 | break 20 | if not trains: 21 | break 22 | if not tests: 23 | break 24 | 25 | eid = eid.strip() 26 | if check.get(eid, None) == None: 27 | check[eid] = {} 28 | 29 | trainl = trains.strip().split() 30 | testl = tests.strip().split() 31 | ntestl = [] 32 | for ent in testl: 33 | if ent in trainl: 34 | continue 35 | ntestl.append(ent) 36 | testl = ntestl 37 | 38 | eid2train[eid] = trainl 39 | eid2test[eid] = testl 40 | 41 | validl = testl 42 | 43 | for ent in validl: 44 | #word = ent[0:ent.find('||')] 45 | word = ent 46 | check[eid][word] = 1 47 | fi.close() 48 | 49 | eid2score = {} 50 | 51 | fi = open(score_file, 'r') 52 | for line in fi: 53 | eid = line.strip().split()[0] 54 | ent = line.strip().split()[1] 55 | val = float(line.strip().split()[2]) 56 | if eid2score.get(eid, None) == None: 57 | eid2score[eid] = {} 58 | eid2score[eid][ent] = val 59 | fi.close() 60 | 61 | sh = [0.0 for i in range(k_max)] 62 | sp = [0.0 for i in range(k_max)] 63 | sr = [0.0 for i in range(k_max)] 64 | 65 | for eid, dic in eid2score.items(): 66 | if check.get(eid, None) == None: 67 | continue 68 | 69 | dic = sorted(dic.items(), key = lambda x:x[1], reverse = True) 70 | 71 | ch = [0.0 for i in range(k_max)] 72 | cp = [0.0 for i in range(k_max)] 73 | cr = [0.0 for i in range(k_max)] 74 | nhit = 0 75 | nprec = 0 76 | remain = len(eid2test[eid]) 77 | #print eid, len(eid2test[eid]) 78 | #exit(0) 79 | for k in range(k_max): 80 | ent = dic[k][0] 81 | #word = ent[0:ent.find('||')] 82 | word = ent 83 | 84 | #print eid, ent, check[eid].get(word, 0) 85 | 86 | if check[eid].get(word, 0) == 1: 87 | nhit += 1 88 | ch[k] = nhit 89 | if remain > 0: 90 | nprec += 1 91 | if check[eid].get(word, 0) == 1: 92 | remain -= 1 93 | cp[k] = nprec 94 | cr[k] = len(eid2test[eid]) 95 | 96 | for k in range(k_max): 97 | sh[k] += ch[k] 98 | sp[k] += cp[k] 99 | sr[k] += cr[k] 100 | 101 | for k in range(k_max): 102 | print 'P@' 
+ str(k+1), sh[k] / sp[k] 103 | #print sh[k] / sp[k] 104 | for k in range(k_max): 105 | print 'R@' + str(k+1), sh[k] / sr[k] 106 | #print sh[k] / sr[k] 107 | 108 | -------------------------------------------------------------------------------- /codes/evaluation/concat.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define MAX_STRING 100 8 | #define EXP_TABLE_SIZE 1000 9 | #define MAX_EXP 6 10 | #define MAX_SENTENCE_LENGTH 1000 11 | #define MAX_CODE_LENGTH 40 12 | 13 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 14 | 15 | typedef float real; // Precision of float numbers 16 | 17 | struct vocab_word { 18 | long long cn; 19 | char word[MAX_STRING]; 20 | }; 21 | 22 | char vector_file1[MAX_STRING], vector_file2[MAX_STRING], output_file[MAX_STRING]; 23 | struct vocab_word *vocab; 24 | int binary = 0, debug_mode = 2; 25 | int *vocab_hash; 26 | long long vocab_max_size = 1000, vocab_size = 0; 27 | long long vector_size1, vector_size2; 28 | real *syn0, *syn1; 29 | 30 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 31 | void ReadWord(char *word, FILE *fin) { 32 | int a = 0, ch; 33 | while (!feof(fin)) { 34 | ch = fgetc(fin); 35 | if (ch == 13) continue; 36 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 37 | if (a > 0) { 38 | if (ch == '\n') ungetc(ch, fin); 39 | break; 40 | } 41 | if (ch == '\n') { 42 | strcpy(word, (char *)""); 43 | return; 44 | } 45 | else continue; 46 | } 47 | word[a] = ch; 48 | a++; 49 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 50 | } 51 | word[a] = 0; 52 | } 53 | 54 | // Returns hash value of a word 55 | int GetWordHash(char *word) { 56 | unsigned long long a, hash = 0; 57 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 58 | hash = hash % vocab_hash_size; 59 | return hash; 60 | } 61 | 62 | // Returns position of a word in 
the vocabulary; if the word is not found, returns -1 63 | int SearchVocab(char *word) { 64 | unsigned int hash = GetWordHash(word); 65 | while (1) { 66 | if (vocab_hash[hash] == -1) return -1; 67 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 68 | hash = (hash + 1) % vocab_hash_size; 69 | } 70 | return -1; 71 | } 72 | 73 | // Reads a word and returns its index in the vocabulary 74 | int ReadWordIndex(FILE *fin) { 75 | char word[MAX_STRING]; 76 | ReadWord(word, fin); 77 | if (feof(fin)) return -1; 78 | return SearchVocab(word); 79 | } 80 | 81 | // Adds a word to the vocabulary 82 | int AddWordToVocab(char *word, int cn, int id) { 83 | unsigned int hash; 84 | strcpy(vocab[id].word, word); 85 | vocab[id].cn = cn; 86 | hash = GetWordHash(word); 87 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 88 | vocab_hash[hash] = id; 89 | return id; 90 | } 91 | // Loads both vector files: file 1 defines the vocabulary and fills syn0; file 2 fills syn1, and rows of words that are not in the vocabulary go to the spare row at index vocab_size. 92 | void LearnVocabFromTrainFile() 93 | { 94 | char ch, word[MAX_STRING]; 95 | float f_num; 96 | long long l, n2 = 0; // n2: number of rows declared in the header of file 2 97 | 98 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 99 | for (long long a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 100 | 101 | FILE *fi = fopen(vector_file1, "rb"); 102 | if (fi == NULL) { 103 | printf("Vector file 1 not found\n"); 104 | exit(1); 105 | } 106 | fscanf(fi, "%lld %lld", &vocab_size, &vector_size1); 107 | vocab = (struct vocab_word *)malloc(vocab_size * sizeof(struct vocab_word)); 108 | syn0 = (real *)calloc(vocab_size * vector_size1, sizeof(real)); 109 | for (long long k = 0; k != vocab_size; k++) 110 | { 111 | fscanf(fi, "%99s", word); // field width = MAX_STRING - 1, so an over-long token cannot overflow word[] 112 | ch = fgetc(fi); 113 | AddWordToVocab(word, 0, k); 114 | l = k * vector_size1; 115 | for (int c = 0; c != vector_size1; c++) 116 | { 117 | fread(&f_num, sizeof(float), 1, fi); 118 | syn0[c + l] = (real)f_num; 119 | } 120 | } 121 | fclose(fi); 122 | 123 | fi = fopen(vector_file2, "rb"); 124 | if (fi == NULL) { 125 | printf("Vector file 2 not found\n"); 126 | exit(1); 127 | } 128 | fscanf(fi, 
"%lld %lld", &n2, &vector_size2); 129 | syn1 = (real *)calloc((vocab_size + 1) * vector_size2, sizeof(real)); 130 | for (long long k = 0; k != n2; k++) // iterate over the rows file 2 actually declares, not over file 1's vocabulary size 131 | { 132 | fscanf(fi, "%99s", word); // field width = MAX_STRING - 1 133 | ch = fgetc(fi); 134 | int i = SearchVocab(word); 135 | if (i == -1) l = vocab_size * vector_size2; 136 | else l = i * vector_size2; 137 | for (int c = 0; c != vector_size2; c++) // each row of file 2 holds vector_size2 values 138 | { 139 | fread(&f_num, sizeof(float), 1, fi); 140 | syn1[c + l] = (real)f_num; 141 | } 142 | } 143 | fclose(fi); 144 | 145 | if (debug_mode>0) 146 | { 147 | printf("Vocab size: %lld\n", vocab_size); 148 | printf("Vector size 1: %lld\n", vector_size1); 149 | printf("Vector size 2: %lld\n", vector_size2); 150 | } 151 | } 152 | 153 | // Writes every vocabulary word followed by its L2-normalized file-1 vector and its L2-normalized file-2 vector. 154 | void TrainModel() { 155 | long long a, b; 156 | double len; 157 | 158 | LearnVocabFromTrainFile(); 159 | 160 | FILE *fo; 161 | fo = fopen(output_file, "wb"); if (fo == NULL) { printf("Cannot open output file\n"); exit(1); } 162 | fprintf(fo, "%lld %lld\n", vocab_size, vector_size1 + vector_size2); 163 | for (a = 0; a < vocab_size; a++) { 164 | fprintf(fo, "%s ", vocab[a].word); 165 | 166 | len = 0; 167 | for (b = 0; b < vector_size1; b++) len += syn0[b + a * vector_size1] * syn0[b + a * vector_size1]; 168 | len = sqrt(len); 169 | if (len != 0) for (b = 0; b < vector_size1; b++) syn0[b + a * vector_size1] /= len; 170 | 171 | len = 0; 172 | for (b = 0; b < vector_size2; b++) len += syn1[b + a * vector_size2] * syn1[b + a * vector_size2]; 173 | len = sqrt(len); 174 | if (len != 0) for (b = 0; b < vector_size2; b++) syn1[b + a * vector_size2] /= len; 175 | 176 | if (binary) 177 | { 178 | for (b = 0; b < vector_size1; b++) 179 | fwrite(&syn0[a * vector_size1 + b], sizeof(real), 1, fo); 180 | for (b = 0; b < vector_size2; b++) 181 | fwrite(&syn1[a * vector_size2 + b], sizeof(real), 1, fo); 182 | } 183 | else 184 | { 185 | for (b = 0; b < vector_size1; b++) 186 | fprintf(fo, "%lf ", syn0[a * vector_size1 + b]); 187 | for (b = 0; b < vector_size2; b++) 188 | fprintf(fo, "%lf ", syn1[a * vector_size2 + b]); // second half comes from syn1, matching the binary branch 189 | } 190 | 
fprintf(fo, "\n"); 191 | } 192 | fclose(fo); 193 | } 194 | 195 | int ArgPos(char *str, int argc, char **argv) { 196 | int a; 197 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 198 | if (a == argc - 1) { 199 | printf("Argument missing for %s\n", str); 200 | exit(1); 201 | } 202 | return a; 203 | } 204 | return -1; 205 | } 206 | // Parses -input1/-input2/-output/-debug/-binary and runs TrainModel(). The file-name buffers are zero-initialized globals of MAX_STRING bytes, so copying at most MAX_STRING - 1 bytes keeps them NUL-terminated. 207 | int main(int argc, char **argv) { 208 | int i; 209 | output_file[0] = 0; 210 | if ((i = ArgPos((char *)"-input1", argc, argv)) > 0) strncpy(vector_file1, argv[i + 1], MAX_STRING - 1); 211 | if ((i = ArgPos((char *)"-input2", argc, argv)) > 0) strncpy(vector_file2, argv[i + 1], MAX_STRING - 1); 212 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strncpy(output_file, argv[i + 1], MAX_STRING - 1); 213 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 214 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 215 | // vocab and vocab_hash are allocated inside LearnVocabFromTrainFile(); allocating them here as well only leaked the first allocations. 216 | 217 | TrainModel(); 218 | return 0; 219 | } 220 | -------------------------------------------------------------------------------- /codes/evaluation/final_score_wt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | embed_rst = sys.argv[1] 5 | bow_rst = sys.argv[2] 6 | output_file = sys.argv[3] 7 | wt = float(sys.argv[4]) 8 | 9 | eid2score = {} 10 | fi = open(embed_rst, 'r') 11 | for line in fi: 12 | eid = line.strip().split()[0] 13 | ent = line.strip().split()[1] 14 | val = float(line.strip().split()[3]) 15 | if eid2score.get(eid, None) == None: 16 | eid2score[eid] = {} 17 | eid2score[eid][ent] = val 18 | fi.close() 19 | 20 | fi = open(bow_rst, 'r') 21 | for line in fi: 22 | eid = line.strip().split()[0] 23 | val = float(line.strip().split()[2]) 24 | ent = line.strip().split()[3] 25 | eid2score[eid][ent] += wt * val 26 | fi.close() 27 | 28 | fo = open(output_file, 'w') 29 | for eid, dic 
in eid2score.items(): 30 | dic = sorted(dic.items(), reverse = True, key = lambda x:x[1]) 31 | for ent, val in dic: 32 | fo.write(eid + '\t' + ent + '\t' + str(val) + '\n') 33 | fo.close() 34 | 35 | -------------------------------------------------------------------------------- /codes/evaluation/gen_cand_eval.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define MAX_STRING 1000 12 | #define EXP_TABLE_SIZE 1000 13 | #define MAX_EXP 6 14 | #define MAX_SENTENCE_LENGTH 1000 15 | #define MAX_CODE_LENGTH 40 16 | 17 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 18 | 19 | typedef float real; // Precision of float numbers 20 | 21 | typedef Eigen::Matrix< real, Eigen::Dynamic, 22 | Eigen::Dynamic, Eigen::RowMajor | Eigen::AutoAlign > 23 | BLPMatrix; 24 | 25 | typedef Eigen::Matrix< real, 1, Eigen::Dynamic, 26 | Eigen::RowMajor | Eigen::AutoAlign > 27 | BLPVector; 28 | 29 | struct vocab_word 30 | { 31 | char word[MAX_STRING]; 32 | int flag; 33 | }; 34 | 35 | struct entry 36 | { 37 | char ent[MAX_STRING]; 38 | std::vector train; 39 | std::set check, right; 40 | }; 41 | 42 | struct pair 43 | { 44 | int id; 45 | real vl; 46 | }; 47 | 48 | struct kmax_list 49 | { 50 | pair *list; 51 | int k_max, list_size; 52 | 53 | void init(int k) 54 | { 55 | k_max = k; 56 | list = (pair *)malloc((k_max + 1) * sizeof(pair)); 57 | list_size = 0; 58 | for (int k = 0; k != k_max + 1; k++) 59 | { 60 | list[k].id = -1; 61 | list[k].vl = -1; 62 | } 63 | } 64 | 65 | void clear() 66 | { 67 | list_size = 0; 68 | for (int k = 0; k != k_max + 1; k++) 69 | { 70 | list[k].id = -1; 71 | list[k].vl = -1; 72 | } 73 | } 74 | 75 | void add(pair pr) 76 | { 77 | list[list_size].id = pr.id; 78 | list[list_size].vl = pr.vl; 79 | 80 | for (int k = list_size - 1; k >= 0; k--) 81 | { 82 | if (list[k].vl < 
list[k + 1].vl) 83 | { 84 | int tmp_id = list[k].id; 85 | real tmp_vl = list[k].vl; 86 | list[k].id = list[k + 1].id; 87 | list[k].vl = list[k + 1].vl; 88 | list[k + 1].id = tmp_id; 89 | list[k + 1].vl = tmp_vl; 90 | } 91 | else 92 | break; 93 | } 94 | 95 | if (list_size < k_max) list_size++; 96 | } 97 | }; 98 | 99 | char data_file[MAX_STRING], vector_file[MAX_STRING], output_cand_file[MAX_STRING], output_pair_file[MAX_STRING]; 100 | struct vocab_word *vocab; 101 | int *vocab_hash; 102 | int vocab_max_size = 1000, vocab_size = 0, vector_size = 0, data_size = 0, k_max = 5, filter = 0; 103 | 104 | BLPMatrix vec; 105 | std::vector data; 106 | 107 | real score(int wid, std::vector &train) 108 | { 109 | real sc = 0; 110 | int size = (int)(train.size()); 111 | for (int k = 0; k != size; k++) 112 | { 113 | int tid = train[k]; 114 | real f = vec.row(wid) * vec.row(tid).transpose(); 115 | //if (f > sc) sc = f; 116 | sc += f; 117 | } 118 | if (size == 0) sc = 0; 119 | else sc /= size; 120 | return sc; 121 | } 122 | 123 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 124 | void ReadWord(char *word, FILE *fin) { 125 | int a = 0, ch; 126 | while (!feof(fin)) { 127 | ch = fgetc(fin); 128 | if (ch == 13) continue; 129 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 130 | if (a > 0) { 131 | if (ch == '\n') ungetc(ch, fin); 132 | break; 133 | } 134 | if (ch == '\n') { 135 | strcpy(word, (char *)""); 136 | return; 137 | } 138 | else continue; 139 | } 140 | word[a] = ch; 141 | a++; 142 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 143 | } 144 | word[a] = 0; 145 | } 146 | 147 | // Returns hash value of a word 148 | int GetWordHash(char *word) { 149 | unsigned long long a, hash = 0; 150 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 151 | hash = hash % vocab_hash_size; 152 | return hash; 153 | } 154 | 155 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 156 | int 
SearchVocab(char *word) { 157 | unsigned int hash = GetWordHash(word); 158 | while (1) { 159 | if (vocab_hash[hash] == -1) return -1; 160 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 161 | hash = (hash + 1) % vocab_hash_size; 162 | } 163 | return -1; 164 | } 165 | 166 | // Adds a word to the vocabulary 167 | int AddWordToVocab(char *word, int id) { 168 | unsigned int hash; 169 | strcpy(vocab[id].word, word); 170 | vocab[id].flag = 0; 171 | hash = GetWordHash(word); 172 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 173 | vocab_hash[hash] = id; 174 | return id; 175 | } 176 | 177 | void ReadVector() 178 | { 179 | FILE *fi; 180 | char ch, word[MAX_STRING]; 181 | real f; 182 | 183 | fi = fopen(vector_file, "rb"); 184 | if (fi == NULL) { 185 | printf("Vector file not found\n"); 186 | exit(1); 187 | } 188 | 189 | fscanf(fi, "%d %d", &vocab_size, &vector_size); 190 | 191 | vocab = (struct vocab_word *)malloc(vocab_size*sizeof(struct vocab_word)); 192 | vec.resize(vocab_size, vector_size); 193 | 194 | for (int k = 0; k != vocab_size; k++) 195 | { 196 | fscanf(fi, "%s", word); 197 | ch = fgetc(fi); 198 | AddWordToVocab(word, k); 199 | for (int c = 0; c != vector_size; c++) 200 | { 201 | fread(&f, sizeof(real), 1, fi); 202 | vec(k, c) = f; 203 | } 204 | 205 | vec.row(k) /= vec.row(k).norm(); 206 | } 207 | 208 | printf("Vocab size: %d\n", vocab_size); 209 | printf("Vector size: %d\n", vector_size); 210 | 211 | fclose(fi); 212 | } 213 | 214 | void ReadData() 215 | { 216 | FILE *fi; 217 | char ent[MAX_STRING], word[MAX_STRING]; 218 | int wid; 219 | entry curentry; 220 | 221 | fi = fopen(data_file, "rb"); 222 | while (1) 223 | { 224 | if (fscanf(fi, "%s", ent) != 1) break; 225 | ReadWord(word, fi); 226 | 227 | curentry.train.clear(); 228 | curentry.check.clear(); 229 | curentry.right.clear(); 230 | 231 | while (1) 232 | { 233 | ReadWord(word, fi); 234 | if (strcmp(word, "") == 0) break; 235 | wid = SearchVocab(word); 236 | if 
(wid == -1) continue; 237 | curentry.train.push_back(wid); 238 | curentry.check.insert(wid); 239 | curentry.right.insert(wid); 240 | } 241 | 242 | while (1) 243 | { 244 | ReadWord(word, fi); 245 | if (strcmp(word, "") == 0) break; 246 | wid = SearchVocab(word); 247 | if (wid == -1) continue; 248 | curentry.right.insert(wid); 249 | } 250 | 251 | if (curentry.check.empty() || curentry.right.empty()) continue; 252 | 253 | strcpy(curentry.ent, ent); 254 | 255 | data.push_back(curentry); 256 | } 257 | 258 | data_size = (int)(data.size()); 259 | 260 | printf("Data size: %d\n", data_size); 261 | } 262 | 263 | void Evaluate() 264 | { 265 | pair pr; 266 | std::vector train; 267 | std::set check, right; 268 | 269 | kmax_list rklist; 270 | rklist.init(k_max); 271 | 272 | FILE *foc = fopen(output_cand_file, "wb"); 273 | FILE *fop = fopen(output_pair_file, "wb"); 274 | 275 | for (int data_id = 0; data_id != data_size; data_id++) 276 | { 277 | if (data_id % 10 == 0) 278 | { 279 | printf("%cProgress: %.2f%%", 13, 100.0 * data_id / data_size); 280 | fflush(stdout); 281 | } 282 | 283 | train = data[data_id].train; 284 | check = data[data_id].check; 285 | right = data[data_id].right; 286 | 287 | // calculate average 288 | rklist.clear(); 289 | for (int k = 0; k != vocab_size; k++) 290 | { 291 | if (filter) 292 | { 293 | if (check.count(k)) continue; 294 | } 295 | 296 | int pst = -1; 297 | int length = strlen(vocab[k].word); 298 | for (int i = 0; i != length; i++) if (vocab[k].word[i] == '|') 299 | { 300 | pst = i; 301 | break; 302 | } 303 | if (pst != -1) 304 | { 305 | pst += 2; 306 | length -= 2; 307 | if (length - pst != strlen(data[data_id].ent)) continue; 308 | 309 | char curent[MAX_STRING]; 310 | for (int i = 0; i != length - pst; i++) curent[i] = vocab[k].word[pst + i]; 311 | curent[length - pst] = 0; 312 | 313 | if (strcmp(data[data_id].ent, curent) != 0) continue; 314 | } 315 | 316 | real f = score(k, train); 317 | 318 | pr.id = k; 319 | pr.vl = f; 320 | rklist.add(pr); 321 
| } 322 | 323 | for (int k = 0; k != k_max; k++) 324 | { 325 | int wid = rklist.list[k].id; 326 | real vl = rklist.list[k].vl; 327 | fprintf(foc, "%s\t%s\t%d\t%lf\n", data[data_id].ent, vocab[wid].word, k, vl); 328 | } 329 | 330 | for (int i = 0; i != (int)(train.size()); i++) 331 | { 332 | int tid = train[i]; 333 | for (int k = 0; k != k_max; k++) 334 | { 335 | int wid = rklist.list[k].id; 336 | fprintf(fop, "0\t%s\t%s\t%s\t%d\t%d\n", vocab[tid].word, vocab[wid].word, data[data_id].ent, i, k); 337 | } 338 | } 339 | } 340 | printf("\n"); 341 | fclose(foc); 342 | fclose(fop); 343 | } 344 | 345 | void TrainModel() 346 | { 347 | ReadVector(); 348 | ReadData(); 349 | Evaluate(); 350 | } 351 | 352 | int ArgPos(char *str, int argc, char **argv) { 353 | int a; 354 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 355 | if (a == argc - 1) { 356 | printf("Argument missing for %s\n", str); 357 | exit(1); 358 | } 359 | return a; 360 | } 361 | return -1; 362 | } 363 | 364 | int main(int argc, char **argv) { 365 | int i; 366 | if ((i = ArgPos((char *)"-data", argc, argv)) > 0) strcpy(data_file, argv[i + 1]); 367 | if ((i = ArgPos((char *)"-vector", argc, argv)) > 0) strcpy(vector_file, argv[i + 1]); 368 | if ((i = ArgPos((char *)"-output-cand", argc, argv)) > 0) strcpy(output_cand_file, argv[i + 1]); 369 | if ((i = ArgPos((char *)"-output-pair", argc, argv)) > 0) strcpy(output_pair_file, argv[i + 1]); 370 | if ((i = ArgPos((char *)"-k-max", argc, argv)) > 0) k_max = atoi(argv[i + 1]); 371 | if ((i = ArgPos((char *)"-filter", argc, argv)) > 0) filter = atoi(argv[i + 1]); 372 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 373 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 374 | for (long long a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 375 | TrainModel(); 376 | return 0; 377 | } 378 | -------------------------------------------------------------------------------- /codes/evaluation/gen_model.py: 
"""Write a LIBLINEAR model file whose weights are the first vector of a text
embedding file.

Usage: gen_model.py VECTOR_FILE UNUSED_FILE OUTPUT_MODEL

VECTOR_FILE starts with a header line whose second field is the vector
dimension, followed by one "<name> v1 v2 ..." line per vector.  The second
positional argument is accepted for backward compatibility but never read.
"""
import sys
import os


def write_model(vector_path, model_path):
    """Create model_path from the first vector stored in vector_path.

    The model header declares solver L2R_LR with two classes (labels 0 and 1),
    `nr_feature` taken from the vector file header and a bias of -0.00001;
    one weight per line follows the 'w' marker.
    """
    with open(vector_path, 'r') as fi:
        dims = int(fi.readline().split()[1])
        # First field of the row is the vector's name, the rest are its values.
        weights = [float(v) for v in fi.readline().split()[1:]]

    with open(model_path, 'w') as fo:
        fo.write('solver_type L2R_LR' + '\n')
        fo.write('nr_class 2' + '\n')
        fo.write('label 0 1' + '\n')
        fo.write('nr_feature ' + str(dims) + '\n')
        fo.write('bias -0.00001' + '\n')
        fo.write('w' + '\n')
        for w in weights:
            fo.write(str(w) + '\n')


def main(argv):
    """Command-line entry point; argv has the same layout as sys.argv."""
    # argv[2] is intentionally ignored: the original script opened that file
    # but never read from it.
    write_model(argv[1], argv[3])


if __name__ == '__main__':
    main(sys.argv)
a = 0, ch; 36 | while (!feof(fin)) { 37 | ch = fgetc(fin); 38 | if (ch == 13) continue; 39 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) 40 | { 41 | if (a > 0) break; 42 | else continue; 43 | } 44 | word[a] = ch; 45 | a++; 46 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 47 | } 48 | word[a] = 0; 49 | } 50 | 51 | // Returns hash value of a word 52 | int GetWordHash(char *word) { 53 | unsigned long long a, hash = 0; 54 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 55 | hash = hash % vocab_hash_size; 56 | return hash; 57 | } 58 | 59 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 60 | int SearchVocab(char *word) { 61 | unsigned int hash = GetWordHash(word); 62 | while (1) { 63 | if (vocab_hash[hash] == -1) return -1; 64 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 65 | hash = (hash + 1) % vocab_hash_size; 66 | } 67 | return -1; 68 | } 69 | 70 | // Reads a word and returns its index in the vocabulary 71 | int ReadWordIndex(FILE *fin) { 72 | char word[MAX_STRING]; 73 | ReadWord(word, fin); 74 | if (feof(fin)) return -1; 75 | return SearchVocab(word); 76 | } 77 | 78 | // Adds a word to the vocabulary 79 | int AddWordToVocab(char *word, int id) { 80 | unsigned int hash, length = strlen(word) + 1; 81 | if (length > MAX_STRING) length = MAX_STRING; 82 | vocab[id].word = (char *)calloc(length, sizeof(char)); 83 | strcpy(vocab[id].word, word); 84 | vocab[id].cn = 0; 85 | hash = GetWordHash(word); 86 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 87 | vocab_hash[hash] = id; 88 | return id; 89 | } 90 | 91 | void LearnVocabFromTrainFile() 92 | { 93 | FILE *fi = fopen(vector_file, "rb"); 94 | if (fi == NULL) { 95 | printf("Vector file not found\n"); 96 | exit(1); 97 | } 98 | char ch, word[MAX_STRING]; 99 | real f_num; 100 | 101 | fscanf(fi, "%lld %lld", &vocab_size, &layer1_size); 102 | 103 | vocab = (struct vocab_word 
*)malloc(vocab_size*sizeof(struct vocab_word)); 104 | syn0 = (double *)malloc(vocab_size*layer1_size*sizeof(double)); 105 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 106 | for (long long a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 107 | 108 | for (long long k = 0; k != vocab_size; k++) 109 | { 110 | fscanf(fi, "%s", word); 111 | ch = fgetc(fi); 112 | AddWordToVocab(word, k); 113 | for (int c = 0; c != layer1_size; c++) 114 | { 115 | fread(&f_num, sizeof(real), 1, fi); 116 | syn0[c + k * layer1_size] = (double)f_num; 117 | } 118 | } 119 | if (debug_mode>0) 120 | { 121 | printf("Vocab size: %lld\n", vocab_size); 122 | printf("Vector size: %lld\n", layer1_size); 123 | } 124 | 125 | fclose(fi); 126 | } 127 | 128 | void ReadNet() 129 | { 130 | char word1[MAX_STRING], word2[MAX_STRING], str[MAX_STRING * 10]; 131 | FILE *fin; 132 | 133 | fin = fopen(train_file, "rb"); 134 | if (fin == NULL) 135 | { 136 | printf("ERROR: biterm data file not found!\n"); 137 | exit(1); 138 | } 139 | nedges = 0; 140 | while (1) 141 | { 142 | if (fgets(str, sizeof(str), fin) == NULL) break; 143 | if (str[0] == '\r' || str[0] == '\n') break; 144 | nedges++; 145 | } 146 | fclose(fin); 147 | 148 | edge_weight = (double *)malloc(nedges*sizeof(double)); 149 | edge_from = (int *)malloc(nedges*sizeof(int)); 150 | edge_to = (int *)malloc(nedges*sizeof(int)); 151 | 152 | if (edge_weight == NULL || edge_from == NULL || edge_to == NULL) 153 | { 154 | printf("Memory allocation failed\n"); 155 | exit(1); 156 | } 157 | 158 | fin = fopen(train_file, "rb"); 159 | int n1, n2; 160 | long long pst = 0; 161 | double curweight; 162 | for (long long k = 0; k != nedges; k++) 163 | { 164 | if (k % 10000 == 0) 165 | { 166 | printf("%cRead network: %.3lf%%", 13, k / (double)(nedges + 1) * 100); 167 | fflush(stdout); 168 | } 169 | fscanf(fin, "%s %s %lf", word1, word2, &curweight); 170 | n1 = (int)(SearchVocab(word1)); 171 | n2 = (int)(SearchVocab(word2)); 172 | if (n1 < 0 || n2 < 0) continue; 
173 | edge_from[pst] = n1; 174 | edge_to[pst] = n2; 175 | edge_weight[pst] = curweight; 176 | vocab[n1].cn += curweight; 177 | pst++; 178 | } 179 | nedges = pst; 180 | printf("\n"); 181 | fclose(fin); 182 | if (debug_mode > 0) { 183 | printf("Edge size: %lld\n", nedges); 184 | } 185 | } 186 | 187 | void TrainModel() { 188 | long a, b; 189 | long long u, v, lu, lv; 190 | double w, p; 191 | FILE *fo; 192 | printf("Starting training using file %s\n", train_file); 193 | LearnVocabFromTrainFile(); 194 | if (output_file[0] == 0) return; 195 | ReadNet(); 196 | 197 | syn1 = (double *)malloc(vocab_size*layer1_size*sizeof(double)); 198 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) syn1[a * layer1_size + b] = 0; 199 | for (long long k = 0; k != nedges; k++) 200 | { 201 | u = edge_from[k]; 202 | v = edge_to[k]; 203 | w = edge_weight[k]; 204 | p = w / vocab[u].cn; 205 | 206 | lu = u * layer1_size; 207 | lv = v * layer1_size; 208 | 209 | for (int c = 0; c != layer1_size; c++) 210 | syn1[lu + c] += syn0[lv + c] * p; 211 | } 212 | 213 | for (a = 0; a != vocab_size; a++) 214 | { 215 | double len = 0; 216 | for (b = 0; b != layer1_size; b++) len += syn1[a * layer1_size + b] * syn1[a * layer1_size + b]; 217 | len = sqrt(len); 218 | if (len != 0) for (b = 0; b != layer1_size; b++) syn1[a * layer1_size + b] /= len; 219 | //else for (b = 0; b != layer1_size; b++) syn1[a * layer1_size + b] = 0.000001; 220 | } 221 | 222 | real f_num; 223 | fo = fopen(output_file, "wb"); 224 | // Save the word vectors 225 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 226 | for (a = 0; a < vocab_size; a++) { 227 | fprintf(fo, "%s ", vocab[a].word); 228 | if (binary) for (b = 0; b < layer1_size; b++) 229 | { 230 | f_num = (real)(syn1[a * layer1_size + b]); 231 | fwrite(&f_num, sizeof(real), 1, fo); 232 | } 233 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn1[a * layer1_size + b]); 234 | fprintf(fo, "\n"); 235 | } 236 | fclose(fo); 237 | } 238 | 239 | int 
ArgPos(char *str, int argc, char **argv) { 240 | int a; 241 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 242 | if (a == argc - 1) { 243 | printf("Argument missing for %s\n", str); 244 | exit(1); 245 | } 246 | return a; 247 | } 248 | return -1; 249 | } 250 | 251 | int main(int argc, char **argv) { 252 | int i; 253 | output_file[0] = 0; 254 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 255 | if ((i = ArgPos((char *)"-vector", argc, argv)) > 0) strcpy(vector_file, argv[i + 1]); 256 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 257 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 258 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 259 | TrainModel(); 260 | return 0; 261 | } -------------------------------------------------------------------------------- /codes/evaluation/liblinear/COPYRIGHT: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2007-2015 The LIBLINEAR Project. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | 3. Neither name of copyright holders nor the names of its contributors 17 | may be used to endorse or promote products derived from this software 18 | without specific prior written permission. 
# Build rules for the LIBLINEAR command-line tools and shared library.
#   make / make all : build the `train` and `predict` executables
#   make lib        : build liblinear.so.$(SHVER)
#   make clean      : remove build products (also in blas/ and matlab/)
CXX ?= g++
CC ?= gcc
CFLAGS = -Wall -Wconversion -O3 -fPIC
LIBS = blas/blas.a
SHVER = 3
OS = $(shell uname)
#LIBS = -lblas

# These targets never produce a file of the same name.
.PHONY: all lib clean

all: train predict

lib: linear.o tron.o blas/blas.a
	if [ "$(OS)" = "Darwin" ]; then \
		SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,liblinear.so.$(SHVER)"; \
	else \
		SHARED_LIB_FLAG="-shared -Wl,-soname,liblinear.so.$(SHVER)"; \
	fi; \
	$(CXX) $${SHARED_LIB_FLAG} linear.o tron.o blas/blas.a -o liblinear.so.$(SHVER)

train: tron.o linear.o train.c blas/blas.a
	$(CXX) $(CFLAGS) -o train train.c tron.o linear.o $(LIBS)

predict: tron.o linear.o predict.c blas/blas.a
	$(CXX) $(CFLAGS) -o predict predict.c tron.o linear.o $(LIBS)

tron.o: tron.cpp tron.h
	$(CXX) $(CFLAGS) -c -o tron.o tron.cpp

linear.o: linear.cpp linear.h
	$(CXX) $(CFLAGS) -c -o linear.o linear.cpp

# Use $(MAKE) for recursion so -j, -n and a non-default make binary propagate.
blas/blas.a: blas/*.c blas/*.h
	$(MAKE) -C blas OPTFLAGS='$(CFLAGS)' CC='$(CC)';

clean:
	$(MAKE) -C blas clean
	$(MAKE) -C matlab clean
	rm -f *~ tron.o linear.o train predict liblinear.so.$(SHVER)
/* Constant times a vector plus a vector:  sy := sy + sa * sx.
 *
 *   n     number of elements to process (nothing happens when *n <= 0)
 *   sa    scalar multiplier (nothing happens when *sa == 0.0)
 *   sx    input vector, read with stride *incx
 *   incx  stride of sx; a negative stride walks the vector backwards
 *   sy    vector updated in place, accessed with stride *incy
 *   incy  stride of sy; a negative stride walks the vector backwards
 *
 * Always returns 0.
 *
 * Uses an unrolled loop when both increments equal one.
 * jack dongarra, linpack, 3/11/78.
 * modified 12/3/93, array(1) declarations changed to array(*)
 *
 * The `register` storage class was dropped from the local scalar: it is
 * ill-formed in C++17, and this file is explicitly set up to be compiled as
 * C++ as well as C.
 */
int daxpy_(int *n, double *sa, double *sx, int *incx, double *sy,
           int *incy)
{
  long int i, m, ix, iy, nn, iincx, iincy;
  double ssa;

  /* Dereference inputs */
  nn = *n;
  ssa = *sa;
  iincx = *incx;
  iincy = *incy;

  if( nn > 0 && ssa != 0.0 )
  {
    if (iincx == 1 && iincy == 1) /* code for both increments equal to 1 */
    {
      m = nn-3;
      for (i = 0; i < m; i += 4)
      {
        sy[i] += ssa * sx[i];
        sy[i+1] += ssa * sx[i+1];
        sy[i+2] += ssa * sx[i+2];
        sy[i+3] += ssa * sx[i+3];
      }
      for ( ; i < nn; ++i) /* clean-up loop */
        sy[i] += ssa * sx[i];
    }
    else /* code for unequal increments or equal increments not equal to 1 */
    {
      /* With a negative stride start at the far end so indices stay >= 0. */
      ix = iincx >= 0 ? 0 : (1 - nn) * iincx;
      iy = iincy >= 0 ? 0 : (1 - nn) * iincy;
      for (i = 0; i < nn; i++)
      {
        sy[iy] += ssa * sx[ix];
        ix += iincx;
        iy += iincy;
      }
    }
  }

  return 0;
} /* daxpy_ */
16 | modified 12/3/93, array(1) declarations changed to array(*) */ 17 | 18 | /* Dereference inputs */ 19 | nn = *n; 20 | iincx = *incx; 21 | iincy = *incy; 22 | 23 | stemp = 0.0; 24 | if (nn > 0) 25 | { 26 | if (iincx == 1 && iincy == 1) /* code for both increments equal to 1 */ 27 | { 28 | m = nn-4; 29 | for (i = 0; i < m; i += 5) 30 | stemp += sx[i] * sy[i] + sx[i+1] * sy[i+1] + sx[i+2] * sy[i+2] + 31 | sx[i+3] * sy[i+3] + sx[i+4] * sy[i+4]; 32 | 33 | for ( ; i < nn; i++) /* clean-up loop */ 34 | stemp += sx[i] * sy[i]; 35 | } 36 | else /* code for unequal increments or equal increments not equal to 1 */ 37 | { 38 | ix = 0; 39 | iy = 0; 40 | if (iincx < 0) 41 | ix = (1 - nn) * iincx; 42 | if (iincy < 0) 43 | iy = (1 - nn) * iincy; 44 | for (i = 0; i < nn; i++) 45 | { 46 | stemp += sx[ix] * sy[iy]; 47 | ix += iincx; 48 | iy += iincy; 49 | } 50 | } 51 | } 52 | 53 | return stemp; 54 | } /* ddot_ */ 55 | 56 | #ifdef __cplusplus 57 | } 58 | #endif 59 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/blas/dnrm2.c: -------------------------------------------------------------------------------- 1 | #include /* Needed for fabs() and sqrt() */ 2 | #include "blas.h" 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | double dnrm2_(int *n, double *x, int *incx) 9 | { 10 | long int ix, nn, iincx; 11 | double norm, scale, absxi, ssq, temp; 12 | 13 | /* DNRM2 returns the euclidean norm of a vector via the function 14 | name, so that 15 | 16 | DNRM2 := sqrt( x'*x ) 17 | 18 | -- This version written on 25-October-1982. 19 | Modified on 14-October-1993 to inline the call to SLASSQ. 20 | Sven Hammarling, Nag Ltd. 
*/ 21 | 22 | /* Dereference inputs */ 23 | nn = *n; 24 | iincx = *incx; 25 | 26 | if( nn > 0 && iincx > 0 ) 27 | { 28 | if (nn == 1) 29 | { 30 | norm = fabs(x[0]); 31 | } 32 | else 33 | { 34 | scale = 0.0; 35 | ssq = 1.0; 36 | 37 | /* The following loop is equivalent to this call to the LAPACK 38 | auxiliary routine: CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */ 39 | 40 | for (ix=(nn-1)*iincx; ix>=0; ix-=iincx) 41 | { 42 | if (x[ix] != 0.0) 43 | { 44 | absxi = fabs(x[ix]); 45 | if (scale < absxi) 46 | { 47 | temp = scale / absxi; 48 | ssq = ssq * (temp * temp) + 1.0; 49 | scale = absxi; 50 | } 51 | else 52 | { 53 | temp = absxi / scale; 54 | ssq += temp * temp; 55 | } 56 | } 57 | } 58 | norm = scale * sqrt(ssq); 59 | } 60 | } 61 | else 62 | norm = 0.0; 63 | 64 | return norm; 65 | 66 | } /* dnrm2_ */ 67 | 68 | #ifdef __cplusplus 69 | } 70 | #endif 71 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/blas/dscal.c: -------------------------------------------------------------------------------- 1 | #include "blas.h" 2 | 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | int dscal_(int *n, double *sa, double *sx, int *incx) 8 | { 9 | long int i, m, nincx, nn, iincx; 10 | double ssa; 11 | 12 | /* scales a vector by a constant. 13 | uses unrolled loops for increment equal to 1. 14 | jack dongarra, linpack, 3/11/78. 15 | modified 3/93 to return if incx .le. 0. 
16 | modified 12/3/93, array(1) declarations changed to array(*) */ 17 | 18 | /* Dereference inputs */ 19 | nn = *n; 20 | iincx = *incx; 21 | ssa = *sa; 22 | 23 | if (nn > 0 && iincx > 0) 24 | { 25 | if (iincx == 1) /* code for increment equal to 1 */ 26 | { 27 | m = nn-4; 28 | for (i = 0; i < m; i += 5) 29 | { 30 | sx[i] = ssa * sx[i]; 31 | sx[i+1] = ssa * sx[i+1]; 32 | sx[i+2] = ssa * sx[i+2]; 33 | sx[i+3] = ssa * sx[i+3]; 34 | sx[i+4] = ssa * sx[i+4]; 35 | } 36 | for ( ; i < nn; ++i) /* clean-up loop */ 37 | sx[i] = ssa * sx[i]; 38 | } 39 | else /* code for increment not equal to 1 */ 40 | { 41 | nincx = nn * iincx; 42 | for (i = 0; i < nincx; i += iincx) 43 | sx[i] = ssa * sx[i]; 44 | } 45 | } 46 | 47 | return 0; 48 | } /* dscal_ */ 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/linear.def: -------------------------------------------------------------------------------- 1 | LIBRARY liblinear 2 | EXPORTS 3 | train @1 4 | cross_validation @2 5 | save_model @3 6 | load_model @4 7 | get_nr_feature @5 8 | get_nr_class @6 9 | get_labels @7 10 | predict_values @8 11 | predict @9 12 | predict_probability @10 13 | free_and_destroy_model @11 14 | free_model_content @12 15 | destroy_param @13 16 | check_parameter @14 17 | check_probability_model @15 18 | set_print_string_function @16 19 | get_decfun_coef @17 20 | get_decfun_bias @18 21 | check_regression_model @19 22 | find_parameter_C @20 23 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/linear.h: -------------------------------------------------------------------------------- 1 | #ifndef _LIBLINEAR_H 2 | #define _LIBLINEAR_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | struct feature_node 9 | { 10 | int index; 11 | double value; 12 | }; 13 | 14 | struct problem 15 | { 16 | int l, n; 17 | double *y; 18 | struct feature_node 
**x; 19 | double bias; /* < 0 if no bias term */ 20 | }; 21 | 22 | enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR = 11, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL }; /* solver_type */ 23 | 24 | struct parameter 25 | { 26 | int solver_type; 27 | 28 | /* these are for training only */ 29 | double eps; /* stopping criteria */ 30 | double C; 31 | int nr_weight; 32 | int *weight_label; 33 | double* weight; 34 | double p; 35 | double *init_sol; 36 | }; 37 | 38 | struct model 39 | { 40 | struct parameter param; 41 | int nr_class; /* number of classes */ 42 | int nr_feature; 43 | double *w; 44 | int *label; /* label of each class */ 45 | double bias; 46 | }; 47 | 48 | struct model* train(const struct problem *prob, const struct parameter *param); 49 | void cross_validation(const struct problem *prob, const struct parameter *param, int nr_fold, double *target); 50 | void find_parameter_C(const struct problem *prob, const struct parameter *param, int nr_fold, double start_C, double max_C, double *best_C, double *best_rate); 51 | 52 | double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); 53 | double predict(const struct model *model_, const struct feature_node *x); 54 | double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); 55 | 56 | int save_model(const char *model_file_name, const struct model *model_); 57 | struct model *load_model(const char *model_file_name); 58 | 59 | int get_nr_feature(const struct model *model_); 60 | int get_nr_class(const struct model *model_); 61 | void get_labels(const struct model *model_, int* label); 62 | double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx); 63 | double get_decfun_bias(const struct model *model_, int label_idx); 64 | 65 | void free_model_content(struct model *model_ptr); 66 | void free_and_destroy_model(struct model 
**model_ptr_ptr); 67 | void destroy_param(struct parameter *param); 68 | 69 | const char *check_parameter(const struct problem *prob, const struct parameter *param); 70 | int check_probability_model(const struct model *model); 71 | int check_regression_model(const struct model *model); 72 | void set_print_string_function(void (*print_func) (const char*)); 73 | 74 | #ifdef __cplusplus 75 | } 76 | #endif 77 | 78 | #endif /* _LIBLINEAR_H */ 79 | 80 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/Makefile: -------------------------------------------------------------------------------- 1 | # This Makefile is used under Linux 2 | 3 | MATLABDIR ?= /usr/local/matlab 4 | CXX ?= g++ 5 | #CXX = g++-3.3 6 | CC ?= gcc 7 | CFLAGS = -Wall -Wconversion -O3 -fPIC -I$(MATLABDIR)/extern/include -I.. 8 | 9 | MEX = $(MATLABDIR)/bin/mex 10 | MEX_OPTION = CC="$(CXX)" CXX="$(CXX)" CFLAGS="$(CFLAGS)" CXXFLAGS="$(CFLAGS)" 11 | # comment the following line if you use MATLAB on a 32-bit computer 12 | MEX_OPTION += -largeArrayDims 13 | MEX_EXT = $(shell $(MATLABDIR)/bin/mexext) 14 | 15 | all: matlab 16 | 17 | matlab: binary 18 | 19 | octave: 20 | @echo "please type make under Octave" 21 | binary: train.$(MEX_EXT) predict.$(MEX_EXT) libsvmread.$(MEX_EXT) libsvmwrite.$(MEX_EXT) 22 | 23 | train.$(MEX_EXT): train.c ../linear.h ../tron.o ../linear.o linear_model_matlab.o ../blas/blas.a 24 | $(MEX) $(MEX_OPTION) train.c ../tron.o ../linear.o linear_model_matlab.o ../blas/blas.a 25 | 26 | predict.$(MEX_EXT): predict.c ../linear.h ../tron.o ../linear.o linear_model_matlab.o ../blas/blas.a 27 | $(MEX) $(MEX_OPTION) predict.c ../tron.o ../linear.o linear_model_matlab.o ../blas/blas.a 28 | 29 | libsvmread.$(MEX_EXT): libsvmread.c 30 | $(MEX) $(MEX_OPTION) libsvmread.c 31 | 32 | libsvmwrite.$(MEX_EXT): libsvmwrite.c 33 | $(MEX) $(MEX_OPTION) libsvmwrite.c 34 | 35 | linear_model_matlab.o: linear_model_matlab.c ../linear.h 36 | $(CXX) 
$(CFLAGS) -c linear_model_matlab.c 37 | 38 | ../linear.o: ../linear.cpp ../linear.h 39 | make -C .. linear.o 40 | 41 | ../tron.o: ../tron.cpp ../tron.h 42 | make -C .. tron.o 43 | 44 | ../blas/blas.a: ../blas/*.c ../blas/*.h 45 | make -C ../blas OPTFLAGS='$(CFLAGS)' CC='$(CC)'; 46 | 47 | clean: 48 | make -C ../blas clean 49 | rm -f *~ *.o *.mex* *.obj ../linear.o ../tron.o 50 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/README: -------------------------------------------------------------------------------- 1 | -------------------------------------------- 2 | --- MATLAB/OCTAVE interface of LIBLINEAR --- 3 | -------------------------------------------- 4 | 5 | Table of Contents 6 | ================= 7 | 8 | - Introduction 9 | - Installation 10 | - Usage 11 | - Returned Model Structure 12 | - Other Utilities 13 | - Examples 14 | - Additional Information 15 | 16 | 17 | Introduction 18 | ============ 19 | 20 | This tool provides a simple interface to LIBLINEAR, a library for 21 | large-scale regularized linear classification and regression 22 | (http://www.csie.ntu.edu.tw/~cjlin/liblinear). It is very easy to use 23 | as the usage and the way of specifying parameters are the same as that 24 | of LIBLINEAR. 25 | 26 | Installation 27 | ============ 28 | 29 | On Windows systems, pre-built binary files are already in the 30 | directory '..\windows', so no need to conduct installation. Now we 31 | provide binary files only for 64bit MATLAB on Windows. If you would 32 | like to re-build the package, please rely on the following steps. 33 | 34 | We recommend using make.m on both MATLAB and OCTAVE. Just type 'make' 35 | to build 'libsvmread.mex', 'libsvmwrite.mex', 'train.mex', and 36 | 'predict.mex'. 37 | 38 | On MATLAB or Octave: 39 | 40 | >> make 41 | 42 | If make.m does not work on MATLAB (especially for Windows), try 'mex 43 | -setup' to choose a suitable compiler for mex. 
Make sure your compiler 44 | is accessible and working. Then type 'make' to start the 45 | installation. 46 | 47 | Example: 48 | 49 | matlab>> mex -setup 50 | (ps: MATLAB will show the following messages to set up the default compiler.) 51 | Please choose your compiler for building external interface (MEX) files: 52 | Would you like mex to locate installed compilers [y]/n? y 53 | Select a compiler: 54 | [1] Microsoft Visual C/C++ version 7.1 in C:\Program Files\Microsoft Visual Studio 55 | [0] None 56 | Compiler: 1 57 | Please verify your choices: 58 | Compiler: Microsoft Visual C/C++ 7.1 59 | Location: C:\Program Files\Microsoft Visual Studio 60 | Are these correct?([y]/n): y 61 | 62 | matlab>> make 63 | 64 | On Unix systems, if neither make.m nor 'mex -setup' works, please use 65 | the Makefile and type 'make' in a command window. Note that we assume 66 | your MATLAB is installed in '/usr/local/matlab'. If not, please change 67 | MATLABDIR in the Makefile. 68 | 69 | Example: 70 | linux> make 71 | 72 | To use Octave, run 'make' inside Octave (make.m); in a shell, 'make octave' only prints this reminder: 73 | 74 | Example: 75 | linux> make octave 76 | 77 | For a list of supported/compatible compilers for MATLAB, please check 78 | the following page: 79 | 80 | http://www.mathworks.com/support/compilers/current_release/ 81 | 82 | Usage 83 | ===== 84 | 85 | matlab> model = train(training_label_vector, training_instance_matrix [,'liblinear_options', 'col']); 86 | 87 | -training_label_vector: 88 | An m by 1 vector of training labels. (type must be double) 89 | -training_instance_matrix: 90 | An m by n matrix of m training instances with n features. 91 | It must be a sparse matrix. (type must be double) 92 | -liblinear_options: 93 | A string of training options in the same format as that of LIBLINEAR. 94 | -col: 95 | If 'col' is set, each column of training_instance_matrix is a data instance. Otherwise each row is a data instance. 
96 | 97 | matlab> [predicted_label, accuracy, decision_values/prob_estimates] = predict(testing_label_vector, testing_instance_matrix, model [, 'liblinear_options', 'col']); 98 | matlab> [predicted_label] = predict(testing_label_vector, testing_instance_matrix, model [, 'liblinear_options', 'col']); 99 | 100 | -testing_label_vector: 101 | An m by 1 vector of prediction labels. If labels of test 102 | data are unknown, simply use any random values. (type must be double) 103 | -testing_instance_matrix: 104 | An m by n matrix of m testing instances with n features. 105 | It must be a sparse matrix. (type must be double) 106 | -model: 107 | The output of train. 108 | -liblinear_options: 109 | A string of testing options in the same format as that of LIBLINEAR. 110 | -col: 111 | if 'col' is set, each column of testing_instance_matrix is a data instance. Otherwise each row is a data instance. 112 | 113 | Returned Model Structure 114 | ======================== 115 | 116 | The 'train' function returns a model which can be used for future 117 | prediction. It is a structure and is organized as [Parameters, nr_class, 118 | nr_feature, bias, Label, w]: 119 | 120 | -Parameters: Parameters (now only solver type is provided) 121 | -nr_class: number of classes; = 2 for regression 122 | -nr_feature: number of features in training data (without including the bias term) 123 | -bias: If >= 0, we assume one additional feature is added to the end 124 | of each data instance. 125 | -Label: label of each class; empty for regression 126 | -w: a nr_w-by-n matrix for the weights, where n is nr_feature 127 | or nr_feature+1 depending on the existence of the bias term. 128 | nr_w is 1 if nr_class=2 and -s is not 4 (i.e., not 129 | multi-class svm by Crammer and Singer). It is 130 | nr_class otherwise. 
131 | 132 | If the '-v' option is specified, cross validation is conducted and the 133 | returned model is just a scalar: cross-validation accuracy for 134 | classification and mean-squared error for regression. If the '-C' option 135 | is specified, the best parameter C is found by cross validation. The 136 | returned model is a two-element vector, where the first value is 137 | the best C and the second value is the corresponding cross-validation 138 | accuracy. The parameter selection utility is supported only by -s 0 139 | and -s 2. 140 | 141 | Result of Prediction 142 | ==================== 143 | 144 | The function 'predict' has three outputs. The first one, 145 | predicted_label, is a vector of predicted labels. The second output, 146 | accuracy, is a vector including accuracy (for classification), mean 147 | squared error, and squared correlation coefficient (for regression). 148 | The third is a matrix containing decision values or probability 149 | estimates (if '-b 1' is specified). If k is the number of classes 150 | and k' is the number of classifiers (k'=1 if k=2, otherwise k'=k), for decision values, 151 | each row includes results of k' binary linear classifiers. For probabilities, 152 | each row contains k values indicating the probability that the testing instance is in 153 | each class. Note that the order of classes here is the same as 'Label' 154 | field in the model structure. 155 | 156 | Other Utilities 157 | =============== 158 | 159 | A matlab function libsvmread reads files in LIBSVM format: 160 | 161 | [label_vector, instance_matrix] = libsvmread('data.txt'); 162 | 163 | Two outputs are labels and instances, which can then be used as inputs 164 | of train or predict. 165 | 166 | A matlab function libsvmwrite writes a MATLAB matrix to a file in LIBSVM format: 167 | 168 | libsvmwrite('data.txt', label_vector, instance_matrix) 169 | 170 | The instance_matrix must be a sparse matrix. 
(type must be double) 171 | For Windows, `libsvmread.mexw64' and `libsvmwrite.mexw64' are ready in 172 | the directory `..\windows'. 173 | 174 | These codes are prepared by Rong-En Fan and Kai-Wei Chang from National 175 | Taiwan University. 176 | 177 | Examples 178 | ======== 179 | 180 | Train and test on the provided data heart_scale: 181 | 182 | matlab> [heart_scale_label, heart_scale_inst] = libsvmread('../heart_scale'); 183 | matlab> model = train(heart_scale_label, heart_scale_inst, '-c 1'); 184 | matlab> [predict_label, accuracy, dec_values] = predict(heart_scale_label, heart_scale_inst, model); % test the training data 185 | 186 | Note that for testing, you can put anything in the testing_label_vector. 187 | 188 | For probability estimates, you need '-b 1' only in the testing phase: 189 | 190 | matlab> [predict_label, accuracy, prob_estimates] = predict(heart_scale_label, heart_scale_inst, model, '-b 1'); 191 | 192 | Use the best parameter to train (only supported by -s 0 and -s 2): 193 | 194 | matlab> best = train(heart_scale_label, heart_scale_inst, '-C -s 0'); 195 | matlab> model = train(heart_scale_label, heart_scale_inst, sprintf('-c %f -s 0', best(1))); % use the same solver: -s 0 196 | 197 | Additional Information 198 | ====================== 199 | 200 | Please cite LIBLINEAR as follows: 201 | 202 | R.-E. Fan, K.-W. Chang, C.-J. Hsieh, X.-R. Wang, and C.-J. Lin. 203 | LIBLINEAR: A Library for Large Linear Classification, Journal of 204 | Machine Learning Research 9(2008), 1871-1874. Software available at 205 | http://www.csie.ntu.edu.tw/~cjlin/liblinear 206 | 207 | For any question, please contact Chih-Jen Lin. 
208 | 209 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/libsvmread.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "mex.h" 8 | 9 | #ifdef MX_API_VER 10 | #if MX_API_VER < 0x07030000 11 | typedef int mwIndex; 12 | #endif 13 | #endif 14 | #ifndef max 15 | #define max(x,y) (((x)>(y))?(x):(y)) 16 | #endif 17 | #ifndef min 18 | #define min(x,y) (((x)<(y))?(x):(y)) 19 | #endif 20 | 21 | void exit_with_help() 22 | { 23 | mexPrintf( 24 | "Usage: [label_vector, instance_matrix] = libsvmread('filename');\n" 25 | ); 26 | } 27 | 28 | static void fake_answer(int nlhs, mxArray *plhs[]) 29 | { 30 | int i; 31 | for(i=0;i start from 0 86 | strtok(line," \t"); // label 87 | while (1) 88 | { 89 | idx = strtok(NULL,":"); // index:value 90 | val = strtok(NULL," \t"); 91 | if(val == NULL) 92 | break; 93 | 94 | errno = 0; 95 | index = (int) strtol(idx,&endptr,10); 96 | if(endptr == idx || errno != 0 || *endptr != '\0' || index <= inst_max_index) 97 | { 98 | mexPrintf("Wrong input format at line %d\n",l+1); 99 | fake_answer(nlhs, plhs); 100 | return; 101 | } 102 | else 103 | inst_max_index = index; 104 | 105 | min_index = min(min_index, index); 106 | elements++; 107 | } 108 | max_index = max(max_index, inst_max_index); 109 | l++; 110 | } 111 | rewind(fp); 112 | 113 | // y 114 | plhs[0] = mxCreateDoubleMatrix(l, 1, mxREAL); 115 | // x^T 116 | if (min_index <= 0) 117 | plhs[1] = mxCreateSparse(max_index-min_index+1, l, elements, mxREAL); 118 | else 119 | plhs[1] = mxCreateSparse(max_index, l, elements, mxREAL); 120 | 121 | labels = mxGetPr(plhs[0]); 122 | samples = mxGetPr(plhs[1]); 123 | ir = mxGetIr(plhs[1]); 124 | jc = mxGetJc(plhs[1]); 125 | 126 | k=0; 127 | for(i=0;i start from 0 158 | 159 | errno = 0; 160 | samples[k] = strtod(val,&endptr); 161 | if (endptr == val || errno != 0 || (*endptr != '\0' 
&& !isspace(*endptr))) 162 | { 163 | mexPrintf("Wrong input format at line %d\n",i+1); 164 | fake_answer(nlhs, plhs); 165 | return; 166 | } 167 | ++k; 168 | } 169 | } 170 | jc[l] = k; 171 | 172 | fclose(fp); 173 | free(line); 174 | 175 | { 176 | mxArray *rhs[1], *lhs[1]; 177 | rhs[0] = plhs[1]; 178 | if(mexCallMATLAB(1, lhs, 1, rhs, "transpose")) 179 | { 180 | mexPrintf("Error: cannot transpose problem\n"); 181 | fake_answer(nlhs, plhs); 182 | return; 183 | } 184 | plhs[1] = lhs[0]; 185 | } 186 | } 187 | 188 | void mexFunction( int nlhs, mxArray *plhs[], 189 | int nrhs, const mxArray *prhs[] ) 190 | { 191 | char filename[256]; 192 | 193 | if(nrhs != 1 || nlhs != 2) 194 | { 195 | exit_with_help(); 196 | fake_answer(nlhs, plhs); 197 | return; 198 | } 199 | 200 | mxGetString(prhs[0], filename, mxGetN(prhs[0]) + 1); 201 | 202 | if(filename == NULL) 203 | { 204 | mexPrintf("Error: filename is NULL\n"); 205 | return; 206 | } 207 | 208 | read_problem(filename, nlhs, plhs); 209 | 210 | return; 211 | } 212 | 213 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/libsvmwrite.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "mex.h" 5 | 6 | #ifdef MX_API_VER 7 | #if MX_API_VER < 0x07030000 8 | typedef int mwIndex; 9 | #endif 10 | #endif 11 | 12 | void exit_with_help() 13 | { 14 | mexPrintf( 15 | "Usage: libsvmwrite('filename', label_vector, instance_matrix);\n" 16 | ); 17 | } 18 | 19 | static void fake_answer(int nlhs, mxArray *plhs[]) 20 | { 21 | int i; 22 | for(i=0;i 0) 88 | { 89 | exit_with_help(); 90 | fake_answer(nlhs, plhs); 91 | return; 92 | } 93 | 94 | // Transform the input Matrix to libsvm format 95 | if(nrhs == 3) 96 | { 97 | char filename[256]; 98 | if(!mxIsDouble(prhs[1]) || !mxIsDouble(prhs[2])) 99 | { 100 | mexPrintf("Error: label vector and instance matrix must be double\n"); 101 | return; 102 | } 103 | 104 | 
mxGetString(prhs[0], filename, mxGetN(prhs[0])+1); 105 | 106 | if(mxIsSparse(prhs[2])) 107 | libsvmwrite(filename, prhs[1], prhs[2]); 108 | else 109 | { 110 | mexPrintf("Instance_matrix must be sparse\n"); 111 | return; 112 | } 113 | } 114 | else 115 | { 116 | exit_with_help(); 117 | return; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/linear_model_matlab.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "linear.h" 4 | 5 | #include "mex.h" 6 | 7 | #ifdef MX_API_VER 8 | #if MX_API_VER < 0x07030000 9 | typedef int mwIndex; 10 | #endif 11 | #endif 12 | 13 | #define Malloc(type,n) (type *)malloc((n)*sizeof(type)) 14 | 15 | #define NUM_OF_RETURN_FIELD 6 16 | 17 | static const char *field_names[] = { 18 | "Parameters", 19 | "nr_class", 20 | "nr_feature", 21 | "bias", 22 | "Label", 23 | "w", 24 | }; 25 | 26 | const char *model_to_matlab_structure(mxArray *plhs[], struct model *model_) 27 | { 28 | int i; 29 | int nr_w; 30 | double *ptr; 31 | mxArray *return_model, **rhs; 32 | int out_id = 0; 33 | int n, w_size; 34 | 35 | rhs = (mxArray **)mxMalloc(sizeof(mxArray *)*NUM_OF_RETURN_FIELD); 36 | 37 | // Parameters 38 | // for now, only solver_type is needed 39 | rhs[out_id] = mxCreateDoubleMatrix(1, 1, mxREAL); 40 | ptr = mxGetPr(rhs[out_id]); 41 | ptr[0] = model_->param.solver_type; 42 | out_id++; 43 | 44 | // nr_class 45 | rhs[out_id] = mxCreateDoubleMatrix(1, 1, mxREAL); 46 | ptr = mxGetPr(rhs[out_id]); 47 | ptr[0] = model_->nr_class; 48 | out_id++; 49 | 50 | if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS) 51 | nr_w=1; 52 | else 53 | nr_w=model_->nr_class; 54 | 55 | // nr_feature 56 | rhs[out_id] = mxCreateDoubleMatrix(1, 1, mxREAL); 57 | ptr = mxGetPr(rhs[out_id]); 58 | ptr[0] = model_->nr_feature; 59 | out_id++; 60 | 61 | // bias 62 | rhs[out_id] = mxCreateDoubleMatrix(1, 1, mxREAL); 63 | ptr 
= mxGetPr(rhs[out_id]); 64 | ptr[0] = model_->bias; 65 | out_id++; 66 | 67 | if(model_->bias>=0) 68 | n=model_->nr_feature+1; 69 | else 70 | n=model_->nr_feature; 71 | 72 | w_size = n; 73 | // Label 74 | if(model_->label) 75 | { 76 | rhs[out_id] = mxCreateDoubleMatrix(model_->nr_class, 1, mxREAL); 77 | ptr = mxGetPr(rhs[out_id]); 78 | for(i = 0; i < model_->nr_class; i++) 79 | ptr[i] = model_->label[i]; 80 | } 81 | else 82 | rhs[out_id] = mxCreateDoubleMatrix(0, 0, mxREAL); 83 | out_id++; 84 | 85 | // w 86 | rhs[out_id] = mxCreateDoubleMatrix(nr_w, w_size, mxREAL); 87 | ptr = mxGetPr(rhs[out_id]); 88 | for(i = 0; i < w_size*nr_w; i++) 89 | ptr[i]=model_->w[i]; 90 | out_id++; 91 | 92 | /* Create a struct matrix contains NUM_OF_RETURN_FIELD fields */ 93 | return_model = mxCreateStructMatrix(1, 1, NUM_OF_RETURN_FIELD, field_names); 94 | 95 | /* Fill struct matrix with input arguments */ 96 | for(i = 0; i < NUM_OF_RETURN_FIELD; i++) 97 | mxSetField(return_model,0,field_names[i],mxDuplicateArray(rhs[i])); 98 | /* return */ 99 | plhs[0] = return_model; 100 | mxFree(rhs); 101 | 102 | return NULL; 103 | } 104 | 105 | const char *matlab_matrix_to_model(struct model *model_, const mxArray *matlab_struct) 106 | { 107 | int i, num_of_fields; 108 | int nr_w; 109 | double *ptr; 110 | int id = 0; 111 | int n, w_size; 112 | mxArray **rhs; 113 | 114 | num_of_fields = mxGetNumberOfFields(matlab_struct); 115 | rhs = (mxArray **) mxMalloc(sizeof(mxArray *)*num_of_fields); 116 | 117 | for(i=0;inr_class=0; 121 | nr_w=0; 122 | model_->nr_feature=0; 123 | model_->w=NULL; 124 | model_->label=NULL; 125 | 126 | // Parameters 127 | ptr = mxGetPr(rhs[id]); 128 | model_->param.solver_type = (int)ptr[0]; 129 | id++; 130 | 131 | // nr_class 132 | ptr = mxGetPr(rhs[id]); 133 | model_->nr_class = (int)ptr[0]; 134 | id++; 135 | 136 | if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS) 137 | nr_w=1; 138 | else 139 | nr_w=model_->nr_class; 140 | 141 | // nr_feature 142 | ptr = 
mxGetPr(rhs[id]); 143 | model_->nr_feature = (int)ptr[0]; 144 | id++; 145 | 146 | // bias 147 | ptr = mxGetPr(rhs[id]); 148 | model_->bias = ptr[0]; 149 | id++; 150 | 151 | if(model_->bias>=0) 152 | n=model_->nr_feature+1; 153 | else 154 | n=model_->nr_feature; 155 | w_size = n; 156 | 157 | // Label 158 | if(mxIsEmpty(rhs[id]) == 0) 159 | { 160 | model_->label = Malloc(int, model_->nr_class); 161 | ptr = mxGetPr(rhs[id]); 162 | for(i=0;inr_class;i++) 163 | model_->label[i] = (int)ptr[i]; 164 | } 165 | id++; 166 | 167 | ptr = mxGetPr(rhs[id]); 168 | model_->w=Malloc(double, w_size*nr_w); 169 | for(i = 0; i < w_size*nr_w; i++) 170 | model_->w[i]=ptr[i]; 171 | id++; 172 | mxFree(rhs); 173 | 174 | return NULL; 175 | } 176 | 177 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/linear_model_matlab.h: -------------------------------------------------------------------------------- 1 | const char *model_to_matlab_structure(mxArray *plhs[], struct model *model_); 2 | const char *matlab_matrix_to_model(struct model *model_, const mxArray *matlab_struct); 3 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/make.m: -------------------------------------------------------------------------------- 1 | % This make.m is for MATLAB and OCTAVE under Windows, Mac, and Unix 2 | function make() 3 | try 4 | % This part is for OCTAVE 5 | if(exist('OCTAVE_VERSION', 'builtin')) 6 | mex libsvmread.c 7 | mex libsvmwrite.c 8 | mex -I.. train.c linear_model_matlab.c ../linear.cpp ../tron.cpp ../blas/daxpy.c ../blas/ddot.c ../blas/dnrm2.c ../blas/dscal.c 9 | mex -I.. 
predict.c linear_model_matlab.c ../linear.cpp ../tron.cpp ../blas/daxpy.c ../blas/ddot.c ../blas/dnrm2.c ../blas/dscal.c 10 | % This part is for MATLAB 11 | % Add -largeArrayDims on 64-bit machines of MATLAB 12 | else 13 | mex CFLAGS="\$CFLAGS -std=c99" -largeArrayDims libsvmread.c 14 | mex CFLAGS="\$CFLAGS -std=c99" -largeArrayDims libsvmwrite.c 15 | mex CFLAGS="\$CFLAGS -std=c99" -I.. -largeArrayDims train.c linear_model_matlab.c ../linear.cpp ../tron.cpp ../blas/daxpy.c ../blas/ddot.c ../blas/dnrm2.c ../blas/dscal.c 16 | mex CFLAGS="\$CFLAGS -std=c99" -I.. -largeArrayDims predict.c linear_model_matlab.c ../linear.cpp ../tron.cpp ../blas/daxpy.c ../blas/ddot.c ../blas/dnrm2.c ../blas/dscal.c 17 | end 18 | catch err 19 | fprintf('Error: %s failed (line %d)\n', err.stack(1).file, err.stack(1).line); 20 | disp(err.message); 21 | fprintf('=> Please check README for detailed instructions.\n'); 22 | end 23 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/matlab/predict.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "linear.h" 5 | 6 | #include "mex.h" 7 | #include "linear_model_matlab.h" 8 | 9 | #ifdef MX_API_VER 10 | #if MX_API_VER < 0x07030000 11 | typedef int mwIndex; 12 | #endif 13 | #endif 14 | 15 | #define CMD_LEN 2048 16 | 17 | #define Malloc(type,n) (type *)malloc((n)*sizeof(type)) 18 | 19 | int print_null(const char *s,...) 
{} 20 | int (*info)(const char *fmt,...); 21 | 22 | int col_format_flag; 23 | 24 | void read_sparse_instance(const mxArray *prhs, int index, struct feature_node *x, int feature_number, double bias) 25 | { 26 | int j; 27 | mwIndex *ir, *jc, low, high, i; 28 | double *samples; 29 | 30 | ir = mxGetIr(prhs); 31 | jc = mxGetJc(prhs); 32 | samples = mxGetPr(prhs); 33 | 34 | // each column is one instance 35 | j = 0; 36 | low = jc[index], high = jc[index+1]; 37 | for(i=low; i=0) 44 | { 45 | x[j].index = feature_number+1; 46 | x[j].value = bias; 47 | j++; 48 | } 49 | x[j].index = -1; 50 | } 51 | 52 | static void fake_answer(int nlhs, mxArray *plhs[]) 53 | { 54 | int i; 55 | for(i=0;iparam.solver_type!=MCSVM_CS) 80 | nr_w=1; 81 | else 82 | nr_w=nr_class; 83 | 84 | // prhs[1] = testing instance matrix 85 | feature_number = get_nr_feature(model_); 86 | testing_instance_number = (int) mxGetM(prhs[1]); 87 | if(col_format_flag) 88 | { 89 | feature_number = (int) mxGetM(prhs[1]); 90 | testing_instance_number = (int) mxGetN(prhs[1]); 91 | } 92 | 93 | label_vector_row_num = (int) mxGetM(prhs[0]); 94 | label_vector_col_num = (int) mxGetN(prhs[0]); 95 | 96 | if(label_vector_row_num!=testing_instance_number) 97 | { 98 | mexPrintf("Length of label vector does not match # of instances.\n"); 99 | fake_answer(nlhs, plhs); 100 | return; 101 | } 102 | if(label_vector_col_num!=1) 103 | { 104 | mexPrintf("label (1st argument) should be a vector (# of column is 1).\n"); 105 | fake_answer(nlhs, plhs); 106 | return; 107 | } 108 | 109 | ptr_label = mxGetPr(prhs[0]); 110 | 111 | // transpose instance matrix 112 | if(col_format_flag) 113 | pplhs[0] = (mxArray *)prhs[1]; 114 | else 115 | { 116 | mxArray *pprhs[1]; 117 | pprhs[0] = mxDuplicateArray(prhs[1]); 118 | if(mexCallMATLAB(1, pplhs, 1, pprhs, "transpose")) 119 | { 120 | mexPrintf("Error: cannot transpose testing instance matrix\n"); 121 | fake_answer(nlhs, plhs); 122 | return; 123 | } 124 | } 125 | 126 | 127 | prob_estimates = Malloc(double, 
nr_class); 128 | 129 | tplhs[0] = mxCreateDoubleMatrix(testing_instance_number, 1, mxREAL); 130 | if(predict_probability_flag) 131 | tplhs[2] = mxCreateDoubleMatrix(testing_instance_number, nr_class, mxREAL); 132 | else 133 | tplhs[2] = mxCreateDoubleMatrix(testing_instance_number, nr_w, mxREAL); 134 | 135 | ptr_predict_label = mxGetPr(tplhs[0]); 136 | ptr_prob_estimates = mxGetPr(tplhs[2]); 137 | ptr_dec_values = mxGetPr(tplhs[2]); 138 | x = Malloc(struct feature_node, feature_number+2); 139 | for(instance_index=0;instance_indexbias); 148 | 149 | if(predict_probability_flag) 150 | { 151 | predict_label = predict_probability(model_, x, prob_estimates); 152 | ptr_predict_label[instance_index] = predict_label; 153 | for(i=0;i 3 || nrhs > 5 || nrhs < 3) 239 | { 240 | exit_with_help(); 241 | fake_answer(nlhs, plhs); 242 | return; 243 | } 244 | if(nrhs == 5) 245 | { 246 | mxGetString(prhs[4], cmd, mxGetN(prhs[4])+1); 247 | if(strcmp(cmd, "col") == 0) 248 | { 249 | col_format_flag = 1; 250 | } 251 | } 252 | 253 | if(!mxIsDouble(prhs[0]) || !mxIsDouble(prhs[1])) { 254 | mexPrintf("Error: label vector and instance matrix must be double\n"); 255 | fake_answer(nlhs, plhs); 256 | return; 257 | } 258 | 259 | if(mxIsStruct(prhs[2])) 260 | { 261 | const char *error_msg; 262 | 263 | // parse options 264 | if(nrhs>=4) 265 | { 266 | int i, argc = 1; 267 | char *argv[CMD_LEN/2]; 268 | 269 | // put options in argv[] 270 | mxGetString(prhs[3], cmd, mxGetN(prhs[3]) + 1); 271 | if((argv[argc] = strtok(cmd, " ")) != NULL) 272 | while((argv[++argc] = strtok(NULL, " ")) != NULL) 273 | ; 274 | 275 | for(i=1;i=argc && argv[i-1][1] != 'q') 280 | { 281 | exit_with_help(); 282 | fake_answer(nlhs, plhs); 283 | return; 284 | } 285 | switch(argv[i-1][1]) 286 | { 287 | case 'b': 288 | prob_estimate_flag = atoi(argv[i]); 289 | break; 290 | case 'q': 291 | info = &print_null; 292 | i--; 293 | break; 294 | default: 295 | mexPrintf("unknown option\n"); 296 | exit_with_help(); 297 | fake_answer(nlhs, 
plhs); 298 | return; 299 | } 300 | } 301 | } 302 | 303 | model_ = Malloc(struct model, 1); 304 | error_msg = matlab_matrix_to_model(model_, prhs[2]); 305 | if(error_msg) 306 | { 307 | mexPrintf("Error: can't read model: %s\n", error_msg); 308 | free_and_destroy_model(&model_); 309 | fake_answer(nlhs, plhs); 310 | return; 311 | } 312 | 313 | if(prob_estimate_flag) 314 | { 315 | if(!check_probability_model(model_)) 316 | { 317 | mexPrintf("probability output is only supported for logistic regression\n"); 318 | prob_estimate_flag=0; 319 | } 320 | } 321 | 322 | if(mxIsSparse(prhs[1])) 323 | do_predict(nlhs, plhs, prhs, model_, prob_estimate_flag); 324 | else 325 | { 326 | mexPrintf("Testing_instance_matrix must be sparse; " 327 | "use sparse(Testing_instance_matrix) first\n"); 328 | fake_answer(nlhs, plhs); 329 | } 330 | 331 | // destroy model_ 332 | free_and_destroy_model(&model_); 333 | } 334 | else 335 | { 336 | mexPrintf("model file should be a struct array\n"); 337 | fake_answer(nlhs, plhs); 338 | } 339 | 340 | return; 341 | } 342 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/predict.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "linear.h" 7 | 8 | int print_null(const char *s,...) {return 0;} 9 | 10 | static int (*info)(const char *fmt,...) 
= &printf; 11 | 12 | struct feature_node *x; 13 | int max_nr_attr = 64; 14 | 15 | struct model* model_; 16 | int flag_predict_probability=0; 17 | 18 | void exit_input_error(int line_num) 19 | { 20 | fprintf(stderr,"Wrong input format at line %d\n", line_num); 21 | exit(1); 22 | } 23 | 24 | static char *line = NULL; 25 | static int max_line_len; 26 | 27 | static char* readline(FILE *input) 28 | { 29 | int len; 30 | 31 | if(fgets(line,max_line_len,input) == NULL) 32 | return NULL; 33 | 34 | while(strrchr(line,'\n') == NULL) 35 | { 36 | max_line_len *= 2; 37 | line = (char *) realloc(line,max_line_len); 38 | len = (int) strlen(line); 39 | if(fgets(line+len,max_line_len-len,input) == NULL) 40 | break; 41 | } 42 | return line; 43 | } 44 | 45 | void do_predict(FILE *input, FILE *output) 46 | { 47 | int correct = 0; 48 | int total = 0; 49 | double error = 0; 50 | double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; 51 | 52 | int nr_class=get_nr_class(model_); 53 | double *prob_estimates=NULL; 54 | int j, n; 55 | int nr_feature=get_nr_feature(model_); 56 | if(model_->bias>=0) 57 | n=nr_feature+1; 58 | else 59 | n=nr_feature; 60 | 61 | if(flag_predict_probability) 62 | { 63 | int *labels; 64 | 65 | if(!check_probability_model(model_)) 66 | { 67 | fprintf(stderr, "probability output is only supported for logistic regression\n"); 68 | exit(1); 69 | } 70 | 71 | labels=(int *) malloc(nr_class*sizeof(int)); 72 | get_labels(model_,labels); 73 | prob_estimates = (double *) malloc(nr_class*sizeof(double)); 74 | fprintf(output,"labels"); 75 | for(j=0;j=max_nr_attr-2) // need one more for index = -1 101 | { 102 | max_nr_attr *= 2; 103 | x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); 104 | } 105 | 106 | idx = strtok(NULL,":"); 107 | val = strtok(NULL," \t"); 108 | 109 | if(val == NULL) 110 | break; 111 | errno = 0; 112 | x[i].index = (int) strtol(idx,&endptr,10); 113 | if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= 
inst_max_index) 114 | exit_input_error(total+1); 115 | else 116 | inst_max_index = x[i].index; 117 | 118 | errno = 0; 119 | x[i].value = strtod(val,&endptr); 120 | if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) 121 | exit_input_error(total+1); 122 | 123 | // feature indices larger than those in training are not used 124 | if(x[i].index <= nr_feature) 125 | ++i; 126 | } 127 | 128 | if(model_->bias>=0) 129 | { 130 | x[i].index = n; 131 | x[i].value = model_->bias; 132 | i++; 133 | } 134 | x[i].index = -1; 135 | 136 | if(flag_predict_probability) 137 | { 138 | int j; 139 | predict_label = predict_probability(model_,x,prob_estimates); 140 | fprintf(output,"%g",predict_label); 141 | for(j=0;jnr_class;j++) 142 | fprintf(output," %g",prob_estimates[j]); 143 | fprintf(output,"\n"); 144 | } 145 | else 146 | { 147 | predict_label = predict(model_,x); 148 | fprintf(output,"%g\n",predict_label); 149 | } 150 | 151 | if(predict_label == target_label) 152 | ++correct; 153 | error += (predict_label-target_label)*(predict_label-target_label); 154 | sump += predict_label; 155 | sumt += target_label; 156 | sumpp += predict_label*predict_label; 157 | sumtt += target_label*target_label; 158 | sumpt += predict_label*target_label; 159 | ++total; 160 | } 161 | if(check_regression_model(model_)) 162 | { 163 | info("Mean squared error = %g (regression)\n",error/total); 164 | info("Squared correlation coefficient = %g (regression)\n", 165 | ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/ 166 | ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt)) 167 | ); 168 | } 169 | else 170 | info("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total); 171 | if(flag_predict_probability) 172 | free(prob_estimates); 173 | } 174 | 175 | void exit_with_help() 176 | { 177 | printf( 178 | "Usage: predict [options] test_file model_file output_file\n" 179 | "options:\n" 180 | "-b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently 
for logistic regression only\n" 181 | "-q : quiet mode (no outputs)\n" 182 | ); 183 | exit(1); 184 | } 185 | 186 | int main(int argc, char **argv) 187 | { 188 | FILE *input, *output; 189 | int i; 190 | 191 | // parse options 192 | for(i=1;i=argc) 212 | exit_with_help(); 213 | 214 | input = fopen(argv[i],"r"); 215 | if(input == NULL) 216 | { 217 | fprintf(stderr,"can't open input file %s\n",argv[i]); 218 | exit(1); 219 | } 220 | 221 | output = fopen(argv[i+2],"w"); 222 | if(output == NULL) 223 | { 224 | fprintf(stderr,"can't open output file %s\n",argv[i+2]); 225 | exit(1); 226 | } 227 | 228 | if((model_=load_model(argv[i+1]))==0) 229 | { 230 | fprintf(stderr,"can't open model file %s\n",argv[i+1]); 231 | exit(1); 232 | } 233 | 234 | x = (struct feature_node *) malloc(max_nr_attr*sizeof(struct feature_node)); 235 | do_predict(input, output); 236 | free_and_destroy_model(&model_); 237 | free(line); 238 | free(x); 239 | fclose(input); 240 | fclose(output); 241 | return 0; 242 | } 243 | 244 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/python/Makefile: -------------------------------------------------------------------------------- 1 | all = lib 2 | 3 | lib: 4 | make -C .. lib 5 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/python/README: -------------------------------------------------------------------------------- 1 | ------------------------------------- 2 | --- Python interface of LIBLINEAR --- 3 | ------------------------------------- 4 | 5 | Table of Contents 6 | ================= 7 | 8 | - Introduction 9 | - Installation 10 | - Quick Start 11 | - Design Description 12 | - Data Structures 13 | - Utility Functions 14 | - Additional Information 15 | 16 | Introduction 17 | ============ 18 | 19 | Python (http://www.python.org/) is a programming language suitable for rapid 20 | development. 
This tool provides a simple Python interface to LIBLINEAR, a library 21 | for support vector machines (http://www.csie.ntu.edu.tw/~cjlin/liblinear). The 22 | interface is very easy to use as the usage is the same as that of LIBLINEAR. The 23 | interface is developed with the built-in Python library "ctypes." 24 | 25 | Installation 26 | ============ 27 | 28 | On Unix systems, type 29 | 30 | > make 31 | 32 | The interface needs only LIBLINEAR shared library, which is generated by 33 | the above command. We assume that the shared library is on the LIBLINEAR 34 | main directory or in the system path. 35 | 36 | For windows, the shared library liblinear.dll is ready in the directory 37 | `..\windows'. You can also copy it to the system directory (e.g., 38 | `C:\WINDOWS\system32\' for Windows XP). To regenerate the shared library, 39 | please follow the instruction of building windows binaries in LIBLINEAR README. 40 | 41 | Quick Start 42 | =========== 43 | 44 | There are two levels of usage. The high-level one uses utility functions 45 | in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface. 
46 | 47 | >>> from liblinearutil import * 48 | # Read data in LIBSVM format 49 | >>> y, x = svm_read_problem('../heart_scale') 50 | >>> m = train(y[:200], x[:200], '-c 4') 51 | >>> p_label, p_acc, p_val = predict(y[200:], x[200:], m) 52 | 53 | # Construct problem in python format 54 | # Dense data 55 | >>> y, x = [1,-1], [[1,0,1], [-1,0,-1]] 56 | # Sparse data 57 | >>> y, x = [1,-1], [{1:1, 3:1}, {1:-1,3:-1}] 58 | >>> prob = problem(y, x) 59 | >>> param = parameter('-s 0 -c 4 -B 1') 60 | >>> m = train(prob, param) 61 | 62 | # Other utility functions 63 | >>> save_model('heart_scale.model', m) 64 | >>> m = load_model('heart_scale.model') 65 | >>> p_label, p_acc, p_val = predict(y, x, m, '-b 1') 66 | >>> ACC, MSE, SCC = evaluations(y, p_label) 67 | 68 | # Getting online help 69 | >>> help(train) 70 | 71 | The low-level use directly calls C interfaces imported by liblinear.py. Note that 72 | all arguments and return values are in ctypes format. You need to handle them 73 | carefully. 74 | 75 | >>> from liblinear import * 76 | >>> prob = problem([1,-1], [{1:1, 3:1}, {1:-1,3:-1}]) 77 | >>> param = parameter('-c 4') 78 | >>> m = liblinear.train(prob, param) # m is a ctype pointer to a model 79 | # Convert a Python-format instance to feature_nodearray, a ctypes structure 80 | >>> x0, max_idx = gen_feature_nodearray({1:1, 3:1}) 81 | >>> label = liblinear.predict(m, x0) 82 | 83 | Design Description 84 | ================== 85 | 86 | There are two files liblinear.py and liblinearutil.py, which respectively correspond to 87 | low-level and high-level use of the interface. 88 | 89 | In liblinear.py, we adopt the Python built-in library "ctypes," so that 90 | Python can directly access C structures and interface functions defined 91 | in linear.h. 92 | 93 | While advanced users can use structures/functions in liblinear.py, to 94 | avoid handling ctypes structures, in liblinearutil.py we provide some easy-to-use 95 | functions. The usage is similar to LIBLINEAR MATLAB interface. 
96 | 97 | Data Structures 98 | =============== 99 | 100 | Three data structures derived from linear.h are node, problem, and 101 | parameter. They all contain fields with the same names in 102 | linear.h. Access these fields carefully because you directly use a C structure 103 | instead of a Python object. The following description introduces additional 104 | fields and methods. 105 | 106 | Before using the data structures, execute the following command to load the 107 | LIBLINEAR shared library: 108 | 109 | >>> from liblinear import * 110 | 111 | - class feature_node: 112 | 113 | Construct a feature_node. 114 | 115 | >>> node = feature_node(idx, val) 116 | 117 | idx: an integer indicates the feature index. 118 | 119 | val: a float indicates the feature value. 120 | 121 | Show the index and the value of a node. 122 | 123 | >>> print(node) 124 | 125 | - Function: gen_feature_nodearray(xi [,feature_max=None [,issparse=True]]) 126 | 127 | Generate a feature vector from a Python list/tuple or a dictionary: 128 | 129 | >>> xi, max_idx = gen_feature_nodearray({1:1, 3:1, 5:-2}) 130 | 131 | xi: the returned feature_nodearray (a ctypes structure) 132 | 133 | max_idx: the maximal feature index of xi 134 | 135 | issparse: if issparse == True, zero feature values are removed. The default 136 | value is True for the sparsity. 137 | 138 | feature_max: if feature_max is assigned, features with indices larger than 139 | feature_max are removed. 140 | 141 | - class problem: 142 | 143 | Construct a problem instance 144 | 145 | >>> prob = problem(y, x [,bias=-1]) 146 | 147 | y: a Python list/tuple of l labels (type must be int/double). 148 | 149 | x: a Python list/tuple of l data instances. Each element of x must be 150 | an instance of list/tuple/dictionary type. 
151 | 152 | bias: if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term 153 | added (default -1) 154 | 155 | You can also modify the bias value by 156 | 157 | >>> prob.set_bias(1) 158 | 159 | Note that if your x contains sparse data (i.e., dictionary), the internal 160 | ctypes data format is still sparse. 161 | 162 | - class parameter: 163 | 164 | Construct a parameter instance 165 | 166 | >>> param = parameter('training_options') 167 | 168 | If 'training_options' is empty, LIBLINEAR default values are applied. 169 | 170 | Set param to LIBLINEAR default values. 171 | 172 | >>> param.set_to_default_values() 173 | 174 | Parse a string of options. 175 | 176 | >>> param.parse_options('training_options') 177 | 178 | Show values of parameters. 179 | 180 | >>> print(param) 181 | 182 | - class model: 183 | 184 | There are two ways to obtain an instance of model: 185 | 186 | >>> model_ = train(y, x) 187 | >>> model_ = load_model('model_file_name') 188 | 189 | Note that the returned structure of interface functions 190 | liblinear.train and liblinear.load_model is a ctypes pointer of 191 | model, which is different from the model object returned 192 | by train and load_model in liblinearutil.py. We provide a 193 | function toPyModel for the conversion: 194 | 195 | >>> model_ptr = liblinear.train(prob, param) 196 | >>> model_ = toPyModel(model_ptr) 197 | 198 | If you obtain a model in a way other than the above approaches, 199 | handle it carefully to avoid memory leak or segmentation fault. 
200 | 201 | Some interface functions to access LIBLINEAR models are wrapped as 202 | members of the class model: 203 | 204 | >>> nr_feature = model_.get_nr_feature() 205 | >>> nr_class = model_.get_nr_class() 206 | >>> class_labels = model_.get_labels() 207 | >>> is_prob_model = model_.is_probability_model() 208 | >>> is_regression_model = model_.is_regression_model() 209 | 210 | The decision function is W*x + b, where 211 | W is an nr_class-by-nr_feature matrix, and 212 | b is a vector of size nr_class. 213 | To access W_kj (i.e., coefficient for the k-th class and the j-th feature) 214 | and b_k (i.e., bias for the k-th class), use the following functions. 215 | 216 | >>> W_kj = model_.get_decfun_coef(feat_idx=j, label_idx=k) 217 | >>> b_k = model_.get_decfun_bias(label_idx=k) 218 | 219 | We also provide a function to extract w_k (i.e., the k-th row of W) and 220 | b_k directly as follows. 221 | 222 | >>> [w_k, b_k] = model_.get_decfun(label_idx=k) 223 | 224 | Note that w_k is a Python list of length nr_feature, which means that 225 | w_k[0] = W_k1. 226 | For regression models, W is just a vector of length nr_feature. Either 227 | set label_idx=0 or omit the label_idx parameter to access the coefficients. 228 | 229 | >>> W_j = model_.get_decfun_coef(feat_idx=j) 230 | >>> b = model_.get_decfun_bias() 231 | >>> [W, b] = model_.get_decfun() 232 | 233 | Note that in get_decfun_coef, get_decfun_bias, and get_decfun, feat_idx 234 | starts from 1, while label_idx starts from 0. If label_idx is not in the 235 | valid range (0 to nr_class-1), then a NaN will be returned; and if feat_idx 236 | is not in the valid range (1 to nr_feature), then a zero value will be 237 | returned. For regression models, label_idx is ignored. 
238 | 239 | Utility Functions 240 | ================= 241 | 242 | To use utility functions, type 243 | 244 | >>> from liblinearutil import * 245 | 246 | The above command loads 247 | train() : train a linear model 248 | predict() : predict testing data 249 | svm_read_problem() : read the data from a LIBSVM-format file. 250 | load_model() : load a LIBLINEAR model. 251 | save_model() : save model to a file. 252 | evaluations() : evaluate prediction results. 253 | 254 | - Function: train 255 | 256 | There are three ways to call train() 257 | 258 | >>> model = train(y, x [, 'training_options']) 259 | >>> model = train(prob [, 'training_options']) 260 | >>> model = train(prob, param) 261 | 262 | y: a list/tuple of l training labels (type must be int/double). 263 | 264 | x: a list/tuple of l training instances. The feature vector of 265 | each training instance is an instance of list/tuple or dictionary. 266 | 267 | training_options: a string in the same form as that for LIBLINEAR command 268 | mode. 269 | 270 | prob: a problem instance generated by calling 271 | problem(y, x). 272 | 273 | param: a parameter instance generated by calling 274 | parameter('training_options') 275 | 276 | model: the returned model instance. See linear.h for details of this 277 | structure. If '-v' is specified, cross validation is 278 | conducted and the returned model is just a scalar: cross-validation 279 | accuracy for classification and mean-squared error for regression. 280 | If the '-C' option is specified, the best parameter C is found 281 | by cross validation. The returned model is a tuple of the best C 282 | and the corresponding cross-validation accuracy. The parameter 283 | selection utility is supported by only -s 0 and -s 2. 284 | 285 | 286 | To train the same data many times with different 287 | parameters, the second and the third ways should be faster. 
288 | 289 | Examples: 290 | 291 | >>> y, x = svm_read_problem('../heart_scale') 292 | >>> prob = problem(y, x) 293 | >>> param = parameter('-s 3 -c 5 -q') 294 | >>> m = train(y, x, '-c 5') 295 | >>> m = train(prob, '-w1 5 -c 5') 296 | >>> m = train(prob, param) 297 | >>> CV_ACC = train(y, x, '-v 3') 298 | >>> best_C, best_rate = train(y, x, '-C -s 0') 299 | >>> m = train(y, x, '-c {0} -s 0'.format(best_C)) # use the same solver: -s 0 300 | 301 | - Function: predict 302 | 303 | To predict testing data with a model, use 304 | 305 | >>> p_labels, p_acc, p_vals = predict(y, x, model [,'predicting_options']) 306 | 307 | y: a list/tuple of l true labels (type must be int/double). It is used 308 | for calculating the accuracy. Use [] if true labels are 309 | unavailable. 310 | 311 | x: a list/tuple of l predicting instances. The feature vector of 312 | each predicting instance is an instance of list/tuple or dictionary. 313 | 314 | predicting_options: a string of predicting options in the same format as 315 | that of LIBLINEAR. 316 | 317 | model: a model instance. 318 | 319 | p_labels: a list of predicted labels 320 | 321 | p_acc: a tuple including accuracy (for classification), mean 322 | squared error, and squared correlation coefficient (for 323 | regression). 324 | 325 | p_vals: a list of decision values or probability estimates (if '-b 1' 326 | is specified). If k is the number of classes, for decision values, 327 | each element includes results of predicting k binary-class 328 | SVMs. If k = 2 and solver is not MCSVM_CS, only one decision value 329 | is returned. For probabilities, each element contains k values 330 | indicating the probability that the testing instance is in each class. 331 | Note that the order of classes here is the same as 'model.label' 332 | field in the model structure. 
333 | 334 | Example: 335 | 336 | >>> m = train(y, x, '-c 5') 337 | >>> p_labels, p_acc, p_vals = predict(y, x, m) 338 | 339 | - Functions: svm_read_problem/load_model/save_model 340 | 341 | See the usage by examples: 342 | 343 | >>> y, x = svm_read_problem('data.txt') 344 | >>> m = load_model('model_file') 345 | >>> save_model('model_file', m) 346 | 347 | - Function: evaluations 348 | 349 | Calculate some evaluations using the true values (ty) and predicted 350 | values (pv): 351 | 352 | >>> (ACC, MSE, SCC) = evaluations(ty, pv) 353 | 354 | ty: a list of true values. 355 | 356 | pv: a list of predict values. 357 | 358 | ACC: accuracy. 359 | 360 | MSE: mean squared error. 361 | 362 | SCC: squared correlation coefficient. 363 | 364 | 365 | Additional Information 366 | ====================== 367 | 368 | This interface was written by Hsiang-Fu Yu from Department of Computer 369 | Science, National Taiwan University. If you find this tool useful, please 370 | cite LIBLINEAR as follows 371 | 372 | R.-E. Fan, K.-W. Chang, C.-J. Hsieh, X.-R. Wang, and C.-J. Lin. 373 | LIBLINEAR: A Library for Large Linear Classification, Journal of 374 | Machine Learning Research 9(2008), 1871-1874. 
Software available at 375 | http://www.csie.ntu.edu.tw/~cjlin/liblinear 376 | 377 | For any question, please contact Chih-Jen Lin <cjlin@csie.ntu.edu.tw>, 378 | or check the FAQ page: 379 | 380 | http://www.csie.ntu.edu.tw/~cjlin/liblinear/faq.html 381 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/python/liblinear.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from ctypes import * 4 | from ctypes.util import find_library 5 | from os import path 6 | import sys 7 | 8 | __all__ = ['liblinear', 'feature_node', 'gen_feature_nodearray', 'problem', 9 | 'parameter', 'model', 'toPyModel', 'L2R_LR', 'L2R_L2LOSS_SVC_DUAL', 10 | 'L2R_L2LOSS_SVC', 'L2R_L1LOSS_SVC_DUAL', 'MCSVM_CS', 11 | 'L1R_L2LOSS_SVC', 'L1R_LR', 'L2R_LR_DUAL', 'L2R_L2LOSS_SVR', 12 | 'L2R_L2LOSS_SVR_DUAL', 'L2R_L1LOSS_SVR_DUAL', 'print_null'] 13 | 14 | try: 15 | dirname = path.dirname(path.abspath(__file__)) 16 | if sys.platform == 'win32': 17 | liblinear = CDLL(path.join(dirname, r'..\windows\liblinear.dll')) 18 | else: 19 | liblinear = CDLL(path.join(dirname, '../liblinear.so.3')) 20 | except: 21 | # For unix the prefix 'lib' is not considered. 
# Numeric solver codes accepted by the '-s' training option.
L2R_LR = 0
L2R_L2LOSS_SVC_DUAL = 1
L2R_L2LOSS_SVC = 2
L2R_L1LOSS_SVC_DUAL = 3
MCSVM_CS = 4
L1R_L2LOSS_SVC = 5
L1R_LR = 6
L2R_LR_DUAL = 7
L2R_L2LOSS_SVR = 11
L2R_L2LOSS_SVR_DUAL = 12
L2R_L1LOSS_SVR_DUAL = 13

# ctypes signature of the print callback handed to set_print_string_function.
PRINT_STRING_FUN = CFUNCTYPE(None, c_char_p)

def print_null(s):
    """Print callback that discards its argument (used for quiet mode, '-q')."""
    return

def genFields(names, types):
    """Pair field names with ctypes types in the list form `_fields_` expects."""
    return list(zip(names, types))

def fillprototype(f, restype, argtypes):
    """Set the return type and argument types of the foreign function f."""
    f.restype = restype
    f.argtypes = argtypes

class feature_node(Structure):
    """One (index, value) entry of a sparse feature vector."""
    _names = ["index", "value"]
    _types = [c_int, c_double]
    _fields_ = genFields(_names, _types)

    def __str__(self):
        return '%d:%g' % (self.index, self.value)

def gen_feature_nodearray(xi, feature_max=None, issparse=True):
    """
    gen_feature_nodearray(xi [, feature_max [, issparse]]) -> (array, max_idx)

    Convert one instance into a ctypes array of feature_node.

    xi: a dict {index: value}, or a list/tuple of values whose first element
        is feature 1.
    feature_max: if given (and non-zero), features with a larger index are
        dropped.
    issparse: if true, zero-valued features are dropped.

    The returned array holds the kept features sorted by index followed by
    two extra nodes with index -1; the first of them (position -2) is the
    slot callers overwrite with the bias term.  max_idx is the largest kept
    index, or 0 if nothing was kept.

    Raises TypeError if xi is not a dict, list or tuple.
    """
    if isinstance(xi, dict):
        index_range = xi.keys()
    elif isinstance(xi, (list, tuple)):
        # Shift by one so that feature indices start from 1.  list() is
        # required because `[0] + some_tuple` raises TypeError.
        xi = [0] + list(xi)
        index_range = range(1, len(xi))
    else:
        raise TypeError('xi should be a dictionary, list or tuple')

    if feature_max:
        assert(isinstance(feature_max, int))
        index_range = [j for j in index_range if j <= feature_max]
    if issparse:
        index_range = [j for j in index_range if xi[j] != 0]

    index_range = sorted(index_range)
    ret = (feature_node * (len(index_range)+2))()
    ret[-1].index = -1  # terminator
    ret[-2].index = -1  # placeholder for the bias term
    for idx, j in enumerate(index_range):
        ret[idx].index = j
        ret[idx].value = xi[j]
    max_idx = index_range[-1] if index_range else 0
    return ret, max_idx

class problem(Structure):
    """
    Training set in the layout of the C `problem` structure.

    problem(y, x [, bias=-1])
        y: list/tuple of l labels.
        x: list/tuple of l instances (each a dict, list or tuple).
        bias: if >= 0 a bias feature is appended to every instance.

    Raises ValueError if len(y) != len(x).
    """
    _names = ["l", "n", "y", "x", "bias"]
    _types = [c_int, c_int, POINTER(c_double), POINTER(POINTER(feature_node)), c_double]
    _fields_ = genFields(_names, _types)

    def __init__(self, y, x, bias = -1):
        if len(y) != len(x) :
            raise ValueError("len(y) != len(x)")
        self.l = l = len(y)
        # Start without a bias term; set_bias() below applies the request.
        self.bias = -1

        max_idx = 0
        # x_space is a plain Python attribute that keeps the per-instance
        # arrays referenced from self.x reachable from this object.
        x_space = self.x_space = []
        for xi in x:
            tmp_xi, tmp_idx = gen_feature_nodearray(xi)
            x_space += [tmp_xi]
            max_idx = max(max_idx, tmp_idx)
        self.n = max_idx

        self.y = (c_double * l)()
        for i, yi in enumerate(y): self.y[i] = yi

        self.x = (POINTER(feature_node) * l)()
        for i, xi in enumerate(self.x_space): self.x[i] = xi

        self.set_bias(bias)

    def set_bias(self, bias):
        """
        Change the bias term of every instance.

        A bias >= 0 occupies feature index n (n grows by one when a bias is
        first enabled and shrinks by one when it is disabled); a bias < 0
        writes a node with index -1 into the bias slot instead.
        """
        if self.bias == bias:
            return
        if bias >= 0 and self.bias < 0:
            self.n += 1
        elif bias < 0 and self.bias >= 0:
            self.n -= 1
        # Build the node for every transition, including changes that keep
        # the sign (e.g. 1 -> 2), which previously left `node` unbound.
        if bias >= 0:
            node = feature_node(self.n, bias)
        else:
            node = feature_node(-1, bias)

        for xi in self.x_space:
            xi[-2] = node
        self.bias = bias


class parameter(Structure):
    """
    Training parameters in the layout of the C `parameter` structure.

    parameter([options]) parses a LIBLINEAR option string (or list of
    tokens).  Settings that are not fields of the C structure (bias,
    nr_fold, print_func and the flag_* attributes) are kept as ordinary
    Python attributes.
    """
    _names = ["solver_type", "eps", "C", "nr_weight", "weight_label", "weight", "p", "init_sol"]
    _types = [c_int, c_double, c_double, c_int, POINTER(c_int), POINTER(c_double), c_double, POINTER(c_double)]
    _fields_ = genFields(_names, _types)

    def __init__(self, options = None):
        if options is None:
            options = ''
        self.parse_options(options)

    def __str__(self):
        """List the structure fields followed by the Python-side attributes."""
        attrs = parameter._names + list(self.__dict__.keys())
        lines = [' %s: %s\n' % (attr, getattr(self, attr)) for attr in attrs]
        return ''.join(lines).strip()

    def set_to_default_values(self):
        """Reset every field and Python-side attribute to its default."""
        self.solver_type = L2R_L2LOSS_SVC_DUAL
        # inf marks "not set"; parse_options replaces it with the
        # solver-specific default tolerance.
        self.eps = float('inf')
        self.C = 1
        self.p = 0.1
        self.nr_weight = 0
        self.weight_label = None
        self.weight = None
        self.init_sol = None
        self.bias = -1
        self.flag_cross_validation = False
        self.flag_C_specified = False
        self.flag_solver_specified = False
        self.flag_find_C = False
        self.nr_fold = 0
        self.print_func = cast(None, PRINT_STRING_FUN)

    def parse_options(self, options):
        """
        Reset to defaults, then apply `options`.

        options: a string such as '-s 0 -c 4 -B 1' or a list of tokens.

        Raises TypeError if options is neither a list nor a str, and
        ValueError for an unknown option, for '-v n' with n < 2, or for
        '-C' combined with a solver other than -s 0 / -s 2.
        """
        if isinstance(options, list):
            argv = options
        elif isinstance(options, str):
            argv = options.split()
        else:
            raise TypeError("arg 1 should be a list or a str.")
        self.set_to_default_values()
        weight_label = []
        weight = []

        i = 0
        while i < len(argv) :
            if argv[i] == "-s":
                i = i + 1
                self.solver_type = int(argv[i])
                self.flag_solver_specified = True
            elif argv[i] == "-c":
                i = i + 1
                self.C = float(argv[i])
                self.flag_C_specified = True
            elif argv[i] == "-p":
                i = i + 1
                self.p = float(argv[i])
            elif argv[i] == "-e":
                i = i + 1
                self.eps = float(argv[i])
            elif argv[i] == "-B":
                i = i + 1
                self.bias = float(argv[i])
            elif argv[i] == "-v":
                i = i + 1
                self.flag_cross_validation = 1
                self.nr_fold = int(argv[i])
                if self.nr_fold < 2 :
                    raise ValueError("n-fold cross validation: n must >= 2")
            elif argv[i].startswith("-w"):
                # '-w<label> <weight>': the label is the text after '-w'.
                i = i + 1
                self.nr_weight += 1
                weight_label += [int(argv[i-1][2:])]
                weight += [float(argv[i])]
            elif argv[i] == "-q":
                self.print_func = PRINT_STRING_FUN(print_null)
            elif argv[i] == "-C":
                self.flag_find_C = True

            else :
                raise ValueError("Wrong options")
            i += 1

        liblinear.set_print_string_function(self.print_func)
        self.weight_label = (c_int*self.nr_weight)()
        self.weight = (c_double*self.nr_weight)()
        for i in range(self.nr_weight):
            self.weight[i] = weight[i]
            self.weight_label[i] = weight_label[i]

        # default solver for parameter selection is L2R_L2LOSS_SVC
        if self.flag_find_C:
            if not self.flag_cross_validation:
                self.nr_fold = 5
            if not self.flag_solver_specified:
                self.solver_type = L2R_L2LOSS_SVC
                self.flag_solver_specified = True
            elif self.solver_type not in [L2R_LR, L2R_L2LOSS_SVC]:
                raise ValueError("Warm-start parameter search only available for -s 0 and -s 2")

        # '-e' was not given: pick the default tolerance of the solver.
        if self.eps == float('inf'):
            if self.solver_type in [L2R_LR, L2R_L2LOSS_SVC]:
                self.eps = 0.01
            elif self.solver_type in [L2R_L2LOSS_SVR]:
                self.eps = 0.001
            elif self.solver_type in [L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L2R_LR_DUAL]:
                self.eps = 0.1
            elif self.solver_type in [L1R_L2LOSS_SVC, L1R_LR]:
                self.eps = 0.01
            elif self.solver_type in [L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
                self.eps = 0.1

class model(Structure):
    """
    Trained model in the layout of the C `model` structure.

    The accessor methods forward to the corresponding functions of the
    loaded LIBLINEAR shared library.
    """
    _names = ["param", "nr_class", "nr_feature", "w", "label", "bias"]
    _types = [parameter, c_int, c_int, POINTER(c_double), POINTER(c_int), c_double]
    _fields_ = genFields(_names, _types)

    def __init__(self):
        self.__createfrom__ = 'python'

    def __del__(self):
        # free memory created by C to avoid memory leak
        if hasattr(self, '__createfrom__') and self.__createfrom__ == 'C':
            liblinear.free_and_destroy_model(pointer(self))

    def get_nr_feature(self):
        return liblinear.get_nr_feature(self)

    def get_nr_class(self):
        return liblinear.get_nr_class(self)

    def get_labels(self):
        """Return the class labels as a Python list of length nr_class."""
        nr_class = self.get_nr_class()
        labels = (c_int * nr_class)()
        liblinear.get_labels(self, labels)
        return labels[:nr_class]

    def get_decfun_coef(self, feat_idx, label_idx=0):
        return liblinear.get_decfun_coef(self, feat_idx, label_idx)

    def get_decfun_bias(self, label_idx=0):
        return liblinear.get_decfun_bias(self, label_idx)

    def get_decfun(self, label_idx=0):
        """Return (w, b): the coefficients for features 1..nr_feature and the bias."""
        w = [liblinear.get_decfun_coef(self, feat_idx, label_idx) for feat_idx in range(1, self.nr_feature+1)]
        b = liblinear.get_decfun_bias(self, label_idx)
        return (w, b)

    def is_probability_model(self):
        return (liblinear.check_probability_model(self) == 1)

    def is_regression_model(self):
        return (liblinear.check_regression_model(self) == 1)

def toPyModel(model_ptr):
    """
    toPyModel(model_ptr) -> model

    Convert a ctypes POINTER(model) to a Python model

    The result is tagged as created by C so that __del__ releases it
    through the library.  Raises ValueError for a null pointer.
    """
    if not model_ptr:
        raise ValueError("Null pointer")
    m = model_ptr.contents
    m.__createfrom__ = 'C'
    return m
def svm_read_problem(data_file_name):
    """
    svm_read_problem(data_file_name) -> [y, x]

    Read LIBSVM-format data from data_file_name and return labels y
    and data instances x.

    y is a list of floats; x is a list of {index: value} dictionaries.
    Blank lines are skipped.
    """
    prob_y = []
    prob_x = []
    # `with` closes the file even if a malformed line raises.
    with open(data_file_name) as data_file:
        for line in data_file:
            line = line.split(None, 1)
            if not line:
                # blank line: nothing to parse
                continue
            # In case an instance with all zero features
            if len(line) == 1: line += ['']
            label, features = line
            xi = {}
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)
            prob_y += [float(label)]
            prob_x += [xi]
    return (prob_y, prob_x)

def load_model(model_file_name):
    """
    load_model(model_file_name) -> model

    Load a LIBLINEAR model from model_file_name and return.
    Prints a message and returns None if the library cannot load the file.
    """
    model = liblinear.load_model(model_file_name.encode())
    if not model:
        print("can't open model file %s" % model_file_name)
        return None
    model = toPyModel(model)
    return model

def save_model(model_file_name, model):
    """
    save_model(model_file_name, model) -> None

    Save a LIBLINEAR model to the file model_file_name.
    """
    liblinear.save_model(model_file_name.encode(), model)

def evaluations(ty, pv):
    """
    evaluations(ty, pv) -> (ACC, MSE, SCC)

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    ACC is a percentage.  SCC is NaN when its denominator is zero.  For
    empty input all three values are NaN.  Raises ValueError if the two
    sequences differ in length.
    """
    if len(ty) != len(pv):
        raise ValueError("len(ty) must equal to len(pv)")
    l = len(ty)
    if l == 0:
        # No samples: every statistic is undefined.
        nan = float('nan')
        return (nan, nan, nan)
    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v-y)*(v-y)
        sumv += v
        sumy += y
        sumvv += v*v
        sumyy += y*y
        sumvy += v*y
    ACC = 100.0*total_correct/l
    # float() keeps the divisions exact for integer inputs on Python 2.
    MSE = total_error/float(l)
    covariance = l*sumvy - sumv*sumy
    try:
        SCC = float(covariance*covariance)/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
    except ZeroDivisionError:
        SCC = float('nan')
    return (ACC, MSE, SCC)

def train(arg1, arg2=None, arg3=None):
    """
    train(y, x [, options]) -> model | ACC
    train(prob [, options]) -> model | ACC
    train(prob, param) -> model | ACC

    Train a model from data (y, x) or a problem prob using
    'options' or a parameter param.
    If '-v' is specified in 'options' (i.e., cross validation)
    either accuracy (ACC) or mean-squared error (MSE) is returned.
    If '-C' is specified, the tuple (best C, cross-validation rate)
    is returned instead.

    Raises TypeError for unsupported argument types and ValueError if the
    library rejects the parameters.

    options:
        -s type : set type of solver (default 1)
          for multi-class classification
             0 -- L2-regularized logistic regression (primal)
             1 -- L2-regularized L2-loss support vector classification (dual)
             2 -- L2-regularized L2-loss support vector classification (primal)
             3 -- L2-regularized L1-loss support vector classification (dual)
             4 -- support vector classification by Crammer and Singer
             5 -- L1-regularized L2-loss support vector classification
             6 -- L1-regularized logistic regression
             7 -- L2-regularized logistic regression (dual)
          for regression
            11 -- L2-regularized L2-loss support vector regression (primal)
            12 -- L2-regularized L2-loss support vector regression (dual)
            13 -- L2-regularized L1-loss support vector regression (dual)
        -c cost : set the parameter C (default 1)
        -p epsilon : set the epsilon in loss function of SVR (default 0.1)
        -e epsilon : set tolerance of termination criterion
            -s 0 and 2
                |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
                where f is the primal function, (default 0.01)
            -s 11
                |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001)
            -s 1, 3, 4, and 7
                Dual maximal violation <= eps; similar to libsvm (default 0.1)
            -s 5 and 6
                |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
                where f is the primal function (default 0.01)
            -s 12 and 13
                |f'(alpha)|_1 <= eps |f'(alpha0)|,
                where f is the dual function (default 0.1)
        -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
        -wi weight: weights adjust the parameter C of different classes (see README for details)
        -v n: n-fold cross validation mode
        -C : find parameter C (only for -s 0 and 2)
        -q : quiet mode (no outputs)
    """
    prob, param = None, None
    if isinstance(arg1, (list, tuple)):
        assert isinstance(arg2, (list, tuple))
        y, x, options = arg1, arg2, arg3
        prob = problem(y, x)
        param = parameter(options)
    elif isinstance(arg1, problem):
        prob = arg1
        if isinstance(arg2, parameter):
            param = arg2
        else :
            param = parameter(arg2)
    if prob is None or param is None :
        raise TypeError("Wrong types for the arguments")

    prob.set_bias(param.bias)
    liblinear.set_print_string_function(param.print_func)
    err_msg = liblinear.check_parameter(prob, param)
    if err_msg :
        # The C string arrives as bytes on Python 3; decode it so the
        # message is not rendered as b'...'.
        if not isinstance(err_msg, str):
            err_msg = err_msg.decode()
        raise ValueError('Error: %s' % err_msg)

    if param.flag_find_C:
        nr_fold = param.nr_fold
        best_C = c_double()
        best_rate = c_double()
        max_C = 1024
        if param.flag_C_specified:
            start_C = param.C
        else:
            # negative start value when '-c' was not given
            start_C = -1.0
        liblinear.find_parameter_C(prob, param, nr_fold, start_C, max_C, best_C, best_rate)
        print("Best C = %lf  CV accuracy = %g%%\n"% (best_C.value, 100.0*best_rate.value))
        return best_C.value,best_rate.value


    elif param.flag_cross_validation:
        l, nr_fold = prob.l, param.nr_fold
        target = (c_double * l)()
        liblinear.cross_validation(prob, param, nr_fold, target)
        ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
        if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
            print("Cross Validation Mean squared error = %g" % MSE)
            print("Cross Validation Squared correlation coefficient = %g" % SCC)
            return MSE
        else:
            print("Cross Validation Accuracy = %g%%" % ACC)
            return ACC
    else :
        m = liblinear.train(prob, param)
        m = toPyModel(m)

        return m

def predict(y, x, m, options=""):
    """
    predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)

    Predict data (y, x) with the SVM model m.
    options:
        -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
        -q quiet mode (no outputs)

    The return tuple contains
    p_labels: a list of predicted labels
    p_acc: a tuple including accuracy (for classification), mean-squared
           error, and squared correlation coefficient (for regression).
    p_vals: a list of decision values or probability estimates (if '-b 1'
            is specified). If k is the number of classes, for decision values,
            each element includes results of predicting k binary-class
            SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value
            is returned. For probabilities, each element contains k values
            indicating the probability that the testing instance is in each class.
            Note that the order of classes here is the same as 'model.label'
            field in the model structure.

    Pass y = [] when true labels are unavailable; the reported statistics
    are then computed against all-zero labels.
    Raises ValueError for an unknown option and TypeError if '-b 1' is
    requested for a model without probability support.
    """

    def info(s):
        print(s)

    predict_probability = 0
    argv = options.split()
    i = 0
    while i < len(argv):
        if argv[i] == '-b':
            i += 1
            predict_probability = int(argv[i])
        elif argv[i] == '-q':
            info = print_null
        else:
            raise ValueError("Wrong options")
        i+=1

    nr_class = m.get_nr_class()
    nr_feature = m.get_nr_feature()
    is_prob_model = m.is_probability_model()
    bias = m.bias
    # Node written into the bias slot of every instance.
    if bias >= 0:
        biasterm = feature_node(nr_feature+1, bias)
    else:
        biasterm = feature_node(-1, bias)
    pred_labels = []
    pred_values = []

    if predict_probability:
        if not is_prob_model:
            raise TypeError('probability output is only supported for logistic regression')
        prob_estimates = (c_double * nr_class)()
        for xi in x:
            xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
            xi[-2] = biasterm
            label = liblinear.predict_probability(m, xi, prob_estimates)
            values = prob_estimates[:nr_class]
            pred_labels += [label]
            pred_values += [values]
    else:
        if nr_class <= 2:
            nr_classifier = 1
        else:
            nr_classifier = nr_class
        dec_values = (c_double * nr_classifier)()
        for xi in x:
            xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
            xi[-2] = biasterm
            label = liblinear.predict_values(m, xi, dec_values)
            values = dec_values[:nr_classifier]
            pred_labels += [label]
            pred_values += [values]
    if len(y) == 0:
        y = [0] * len(x)
    ACC, MSE, SCC = evaluations(y, pred_labels)
    l = len(y)
    if m.is_regression_model():
        info("Mean squared error = %g (regression)" % MSE)
        info("Squared correlation coefficient = %g (regression)" % SCC)
    else:
        # Recover the hit count by rounding: truncating l*ACC/100 can be
        # one short because of floating-point error, and ACC is NaN when
        # there are no instances.
        if l:
            nr_correct = int(round(l*ACC/100.0))
        else:
            nr_correct = 0
        info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, nr_correct, l))

    return pred_labels, (ACC, MSE, SCC), pred_values
/* Shared, growable line buffer used by readline(); the caller allocates
 * `line` with `max_line_len` bytes before the first call and frees it. */
static char *line = NULL;
static int max_line_len;

/*
 * Read one complete line from `input` into the shared buffer `line`.
 *
 * The buffer is doubled until the whole line (including its trailing '\n',
 * when the file has one) fits. Returns `line`, or NULL when nothing more
 * can be read. Exits with status 1 if the buffer cannot be grown.
 */
static char* readline(FILE *input)
{
	int len;

	if(fgets(line,max_line_len,input) == NULL)
		return NULL;

	/* No '\n' yet: the line was longer than the buffer (or is the last,
	 * unterminated line of the file). Grow and keep reading. */
	while(strrchr(line,'\n') == NULL)
	{
		char *grown;

		max_line_len *= 2;
		/* Keep the old pointer until realloc succeeds, so a failure neither
		 * leaks the buffer nor leads to strlen(NULL) below. */
		grown = (char *) realloc(line,max_line_len);
		if(grown == NULL)
		{
			fprintf(stderr,"can't allocate memory for an input line\n");
			exit(1);
		}
		line = grown;
		len = (int) strlen(line);
		if(fgets(line+len,max_line_len-len,input) == NULL)
			break;
	}
	return line;
}
check_parameter(&prob,¶m); 110 | 111 | if(error_msg) 112 | { 113 | fprintf(stderr,"ERROR: %s\n",error_msg); 114 | exit(1); 115 | } 116 | 117 | if (flag_find_C) 118 | { 119 | do_find_parameter_C(); 120 | } 121 | else if(flag_cross_validation) 122 | { 123 | do_cross_validation(); 124 | } 125 | else 126 | { 127 | model_=train(&prob, ¶m); 128 | if(save_model(model_file_name, model_)) 129 | { 130 | fprintf(stderr,"can't save model to file %s\n",model_file_name); 131 | exit(1); 132 | } 133 | free_and_destroy_model(&model_); 134 | } 135 | destroy_param(¶m); 136 | free(prob.y); 137 | free(prob.x); 138 | free(x_space); 139 | free(line); 140 | 141 | return 0; 142 | } 143 | 144 | void do_find_parameter_C() 145 | { 146 | double start_C, best_C, best_rate; 147 | double max_C = 1024; 148 | if (flag_C_specified) 149 | start_C = param.C; 150 | else 151 | start_C = -1.0; 152 | printf("Doing parameter search with %d-fold cross validation.\n", nr_fold); 153 | find_parameter_C(&prob, ¶m, nr_fold, start_C, max_C, &best_C, &best_rate); 154 | printf("Best C = %g CV accuracy = %g%%\n", best_C, 100.0*best_rate); 155 | } 156 | 157 | void do_cross_validation() 158 | { 159 | int i; 160 | int total_correct = 0; 161 | double total_error = 0; 162 | double sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0; 163 | double *target = Malloc(double, prob.l); 164 | 165 | cross_validation(&prob,¶m,nr_fold,target); 166 | if(param.solver_type == L2R_L2LOSS_SVR || 167 | param.solver_type == L2R_L1LOSS_SVR_DUAL || 168 | param.solver_type == L2R_L2LOSS_SVR_DUAL) 169 | { 170 | for(i=0;i=argc) 223 | exit_with_help(); 224 | switch(argv[i-1][1]) 225 | { 226 | case 's': 227 | param.solver_type = atoi(argv[i]); 228 | flag_solver_specified = 1; 229 | break; 230 | 231 | case 'c': 232 | param.C = atof(argv[i]); 233 | flag_C_specified = 1; 234 | break; 235 | 236 | case 'p': 237 | param.p = atof(argv[i]); 238 | break; 239 | 240 | case 'e': 241 | param.eps = atof(argv[i]); 242 | break; 243 | 244 | case 'B': 245 | bias 
// Generic two-argument min/max helpers. Each one is only defined when no
// macro of the same name is already in effect (some platform headers
// provide `min`/`max` macros).
#ifndef min
template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
#endif

#ifndef max
template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
#endif
double ddot_(int *, double *, int *, double *, int *); 21 | extern int daxpy_(int *, double *, double *, int *, double *, int *); 22 | extern int dscal_(int *, double *, double *, int *); 23 | 24 | #ifdef __cplusplus 25 | } 26 | #endif 27 | 28 | static void default_print(const char *buf) 29 | { 30 | fputs(buf,stdout); 31 | fflush(stdout); 32 | } 33 | 34 | void TRON::info(const char *fmt,...) 35 | { 36 | char buf[BUFSIZ]; 37 | va_list ap; 38 | va_start(ap,fmt); 39 | vsprintf(buf,fmt,ap); 40 | va_end(ap); 41 | (*tron_print_string)(buf); 42 | } 43 | 44 | TRON::TRON(const function *fun_obj, double eps, double eps_cg, int max_iter) 45 | { 46 | this->fun_obj=const_cast(fun_obj); 47 | this->eps=eps; 48 | this->eps_cg=eps_cg; 49 | this->max_iter=max_iter; 50 | tron_print_string = default_print; 51 | } 52 | 53 | TRON::~TRON() 54 | { 55 | } 56 | 57 | void TRON::tron(double *w) 58 | { 59 | // Parameters for updating the iterates. 60 | double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75; 61 | 62 | // Parameters for updating the trust region size delta. 63 | double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4; 64 | 65 | int n = fun_obj->get_nr_variable(); 66 | int i, cg_iter; 67 | double delta, snorm, one=1.0; 68 | double alpha, f, fnew, prered, actred, gs; 69 | int search = 1, iter = 1, inc = 1; 70 | double *s = new double[n]; 71 | double *r = new double[n]; 72 | double *g = new double[n]; 73 | 74 | // calculate gradient norm at w=0 for stopping condition. 
75 | double *w0 = new double[n]; 76 | for (i=0; ifun(w0); 79 | fun_obj->grad(w0, g); 80 | double gnorm0 = dnrm2_(&n, g, &inc); 81 | delete [] w0; 82 | 83 | f = fun_obj->fun(w); 84 | fun_obj->grad(w, g); 85 | delta = dnrm2_(&n, g, &inc); 86 | double gnorm = delta; 87 | 88 | if (gnorm <= eps*gnorm0) 89 | search = 0; 90 | 91 | iter = 1; 92 | 93 | double *w_new = new double[n]; 94 | while (iter <= max_iter && search) 95 | { 96 | cg_iter = trcg(delta, g, s, r); 97 | 98 | memcpy(w_new, w, sizeof(double)*n); 99 | daxpy_(&n, &one, s, &inc, w_new, &inc); 100 | 101 | gs = ddot_(&n, g, &inc, s, &inc); 102 | prered = -0.5*(gs-ddot_(&n, s, &inc, r, &inc)); 103 | fnew = fun_obj->fun(w_new); 104 | 105 | // Compute the actual reduction. 106 | actred = f - fnew; 107 | 108 | // On the first iteration, adjust the initial step bound. 109 | snorm = dnrm2_(&n, s, &inc); 110 | if (iter == 1) 111 | delta = min(delta, snorm); 112 | 113 | // Compute prediction alpha*snorm of the step. 114 | if (fnew - f - gs <= 0) 115 | alpha = sigma3; 116 | else 117 | alpha = max(sigma1, -0.5*(gs/(fnew - f - gs))); 118 | 119 | // Update the trust region bound according to the ratio of actual to predicted reduction. 
120 | if (actred < eta0*prered) 121 | delta = min(max(alpha, sigma1)*snorm, sigma2*delta); 122 | else if (actred < eta1*prered) 123 | delta = max(sigma1*delta, min(alpha*snorm, sigma2*delta)); 124 | else if (actred < eta2*prered) 125 | delta = max(sigma1*delta, min(alpha*snorm, sigma3*delta)); 126 | else 127 | delta = max(delta, min(alpha*snorm, sigma3*delta)); 128 | 129 | info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter); 130 | 131 | if (actred > eta0*prered) 132 | { 133 | iter++; 134 | memcpy(w, w_new, sizeof(double)*n); 135 | f = fnew; 136 | fun_obj->grad(w, g); 137 | 138 | gnorm = dnrm2_(&n, g, &inc); 139 | if (gnorm <= eps*gnorm0) 140 | break; 141 | } 142 | if (f < -1.0e+32) 143 | { 144 | info("WARNING: f < -1.0e+32\n"); 145 | break; 146 | } 147 | if (fabs(actred) <= 0 && prered <= 0) 148 | { 149 | info("WARNING: actred and prered <= 0\n"); 150 | break; 151 | } 152 | if (fabs(actred) <= 1.0e-12*fabs(f) && 153 | fabs(prered) <= 1.0e-12*fabs(f)) 154 | { 155 | info("WARNING: actred and prered too small\n"); 156 | break; 157 | } 158 | } 159 | 160 | delete[] g; 161 | delete[] r; 162 | delete[] w_new; 163 | delete[] s; 164 | } 165 | 166 | int TRON::trcg(double delta, double *g, double *s, double *r) 167 | { 168 | int i, inc = 1; 169 | int n = fun_obj->get_nr_variable(); 170 | double one = 1; 171 | double *d = new double[n]; 172 | double *Hd = new double[n]; 173 | double rTr, rnewTrnew, alpha, beta, cgtol; 174 | 175 | for (i=0; iHv(d, Hd); 191 | 192 | alpha = rTr/ddot_(&n, d, &inc, Hd, &inc); 193 | daxpy_(&n, &alpha, d, &inc, s, &inc); 194 | if (dnrm2_(&n, s, &inc) > delta) 195 | { 196 | info("cg reaches trust region boundary\n"); 197 | alpha = -alpha; 198 | daxpy_(&n, &alpha, d, &inc, s, &inc); 199 | 200 | double std = ddot_(&n, s, &inc, d, &inc); 201 | double sts = ddot_(&n, s, &inc, s, &inc); 202 | double dtd = ddot_(&n, d, &inc, d, &inc); 203 | double dsq = delta*delta; 204 | double rad = 
sqrt(std*std + dtd*(dsq-sts)); 205 | if (std >= 0) 206 | alpha = (dsq - sts)/(std + rad); 207 | else 208 | alpha = (rad - std)/dtd; 209 | daxpy_(&n, &alpha, d, &inc, s, &inc); 210 | alpha = -alpha; 211 | daxpy_(&n, &alpha, Hd, &inc, r, &inc); 212 | break; 213 | } 214 | alpha = -alpha; 215 | daxpy_(&n, &alpha, Hd, &inc, r, &inc); 216 | rnewTrnew = ddot_(&n, r, &inc, r, &inc); 217 | beta = rnewTrnew/rTr; 218 | dscal_(&n, &beta, d, &inc); 219 | daxpy_(&n, &one, r, &inc, d, &inc); 220 | rTr = rnewTrnew; 221 | } 222 | 223 | delete[] d; 224 | delete[] Hd; 225 | 226 | return(cg_iter); 227 | } 228 | 229 | double TRON::norm_inf(int n, double *x) 230 | { 231 | double dmax = fabs(x[0]); 232 | for (int i=1; i= dmax) 234 | dmax = fabs(x[i]); 235 | return(dmax); 236 | } 237 | 238 | void TRON::set_print_string(void (*print_string) (const char *buf)) 239 | { 240 | tron_print_string = print_string; 241 | } 242 | -------------------------------------------------------------------------------- /codes/evaluation/liblinear/tron.h: -------------------------------------------------------------------------------- 1 | #ifndef _TRON_H 2 | #define _TRON_H 3 | 4 | class function 5 | { 6 | public: 7 | virtual double fun(double *w) = 0 ; 8 | virtual void grad(double *w, double *g) = 0 ; 9 | virtual void Hv(double *s, double *Hs) = 0 ; 10 | 11 | virtual int get_nr_variable(void) = 0 ; 12 | virtual ~function(void){} 13 | }; 14 | 15 | class TRON 16 | { 17 | public: 18 | TRON(const function *fun_obj, double eps = 0.1, double eps_cg = 0.1, int max_iter = 1000); 19 | ~TRON(); 20 | 21 | void tron(double *w); 22 | void set_print_string(void (*i_print) (const char *buf)); 23 | 24 | private: 25 | int trcg(double delta, double *g, double *s, double *r); 26 | double norm_inf(int n, double *x); 27 | 28 | double eps; 29 | double eps_cg; 30 | int max_iter; 31 | function *fun_obj; 32 | void info(const char *fmt,...); 33 | void (*tron_print_string)(const char *buf); 34 | }; 35 | #endif 36 | 
-------------------------------------------------------------------------------- /codes/evaluation/liblinear/windows/liblinear.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mnqu/DPE/23795d9e1246d699a3e61cc21098813227a82407/codes/evaluation/liblinear/windows/liblinear.dll -------------------------------------------------------------------------------- /codes/evaluation/liblinear/windows/libsvmread.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mnqu/DPE/23795d9e1246d699a3e61cc21098813227a82407/codes/evaluation/liblinear/windows/libsvmread.mexw64 -------------------------------------------------------------------------------- /codes/evaluation/liblinear/windows/libsvmwrite.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mnqu/DPE/23795d9e1246d699a3e61cc21098813227a82407/codes/evaluation/liblinear/windows/libsvmwrite.mexw64 -------------------------------------------------------------------------------- /codes/evaluation/liblinear/windows/predict.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mnqu/DPE/23795d9e1246d699a3e61cc21098813227a82407/codes/evaluation/liblinear/windows/predict.exe -------------------------------------------------------------------------------- /codes/evaluation/liblinear/windows/predict.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mnqu/DPE/23795d9e1246d699a3e61cc21098813227a82407/codes/evaluation/liblinear/windows/predict.mexw64 -------------------------------------------------------------------------------- /codes/evaluation/liblinear/windows/train.exe: -------------------------------------------------------------------------------- 
#!/bin/sh
# Build the evaluation tools and the bundled liblinear.

# Stop at the first failing command so a broken compile is not silently
# followed by the remaining builds.
set -e

g++ -O2 vocab.cpp -o vocab
g++ -O2 -I../../eigen-3.2.5 gen_cand_eval.cpp -o gen_cand_eval
g++ -O2 -I../../eigen-3.2.5 infer.cpp -o infer
g++ -O2 concat.cpp -o concat
g++ -O2 pair2bow.cpp -o pair2bow

# Run make inside a subshell: `make` only runs if the cd succeeded, and the
# caller's working directory is left untouched either way.
(cd liblinear/ && make)
ReadWord(char *word, FILE *fin) { 34 | int a = 0, ch; 35 | while (!feof(fin)) { 36 | ch = fgetc(fin); 37 | if (ch == 13) continue; 38 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 39 | if (a > 0) { 40 | if (ch == '\n') ungetc(ch, fin); 41 | break; 42 | } 43 | if (ch == '\n') { 44 | strcpy(word, (char *)""); 45 | return; 46 | } else continue; 47 | } 48 | word[a] = ch; 49 | a++; 50 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 51 | } 52 | word[a] = 0; 53 | } 54 | int GetWordHash(char *word) { 55 | unsigned long long a, hash = 0; 56 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 57 | hash = hash % vocab_hash_size; 58 | return hash; 59 | } 60 | 61 | int SearchVocab(char *word) { 62 | unsigned int hash = GetWordHash(word); 63 | while (1) { 64 | if (vocab_hash[hash] == -1) return -1; 65 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 66 | hash = (hash + 1) % vocab_hash_size; 67 | } 68 | return -1; 69 | } 70 | 71 | // Adds a word to the vocabulary 72 | int AddWordToVocab(char *word, double idf) { 73 | unsigned int hash, length = strlen(word) + 1; 74 | if (length > MAX_STRING) length = MAX_STRING; 75 | strcpy(vocab[vocab_size].word, word); 76 | vocab[vocab_size].idf = idf; 77 | vocab_size++; 78 | // Reallocate memory if needed 79 | if (vocab_size + 2 >= vocab_max_size) { 80 | vocab_max_size += 1000; 81 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 82 | } 83 | hash = GetWordHash(word); 84 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 85 | vocab_hash[hash] = vocab_size - 1; 86 | return vocab_size - 1; 87 | } 88 | 89 | void ReadVocab() 90 | { 91 | char word[MAX_STRING]; 92 | FILE *fin; 93 | int wid; 94 | double idf; 95 | for (long long a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 96 | fin = fopen(vocab_file, "rb"); 97 | if (fin == NULL) { 98 | printf("ERROR: training data file not found!\n"); 99 | exit(1); 100 | } 101 | vocab_size = 
0; 102 | while (1) 103 | { 104 | if (fscanf(fin, "%s %d %lf", word, &wid, &idf) != 3) break; 105 | if (SearchVocab(word) != -1) continue; 106 | AddWordToVocab(word, idf); 107 | } 108 | fclose(fin); 109 | printf("Vocab size: %lld\n", vocab_size); 110 | wid2did = new std::set [vocab_size]; 111 | wid2did2pst = new std::map [vocab_size]; 112 | } 113 | 114 | void ReadIndex() 115 | { 116 | FILE *fi = fopen(index_file, "rb"); 117 | char word[MAX_STRING * 10]; 118 | int wid, did, pst; 119 | while (1) 120 | { 121 | if (fscanf(fi, "%s", word) != 1) break; 122 | 123 | wid = SearchVocab(word); 124 | 125 | while (1) 126 | { 127 | ReadWord(word, fi); 128 | if (strcmp(word, "") == 0) break; 129 | sscanf(word, "%d:%d", &did, &pst); 130 | if (wid == -1) continue; 131 | wid2did[wid].insert(did); 132 | wid2did2pst[wid][did] = pst; 133 | } 134 | } 135 | fclose(fi); 136 | } 137 | 138 | void ReadText() 139 | { 140 | FILE *fi = fopen(text_file, "rb"); 141 | char word[MAX_STRING]; 142 | int wid; 143 | std::vector curdoc; 144 | doc_size = 0; 145 | while (1) 146 | { 147 | ReadWord(word, fi); 148 | if (feof(fi)) break; 149 | 150 | wid = SearchVocab(word); 151 | curdoc.push_back(wid); 152 | if (strcmp(word, "") == 0) 153 | { 154 | doc.push_back(curdoc); 155 | curdoc.clear(); 156 | doc_size++; 157 | } 158 | } 159 | fclose(fi); 160 | printf("Text size: %lld\n", doc_size); 161 | } 162 | 163 | void ReadVector() 164 | { 165 | FILE *fi; 166 | long long T, S = 0; 167 | int wid; 168 | char ch, word[MAX_STRING]; 169 | real f_num; 170 | 171 | fi = fopen(vector_file, "rb"); 172 | fscanf(fi, "%lld %lld", &T, &vector_size); 173 | 174 | vec = (real *)calloc(vocab_size * vector_size, sizeof(real)); 175 | 176 | for (long long k = 0; k != T; k++) 177 | { 178 | fscanf(fi, "%s", word); 179 | ch = fgetc(fi); 180 | wid = SearchVocab(word); 181 | if (wid != -1) S++; 182 | for (int c = 0; c != vector_size; c++) 183 | { 184 | fread(&f_num, sizeof(real), 1, fi); 185 | if (wid == -1) continue; 186 | vec[wid * 
vector_size + c] = f_num; 187 | } 188 | } 189 | for (int k = 0; k != vocab_size; k++) 190 | { 191 | real len = 0; 192 | for (int c = 0; c != vector_size; c++) len += vec[k * vector_size + c] * vec[k * vector_size + c]; 193 | len = sqrtf(len); 194 | if (len == 0) continue; 195 | for (int c = 0; c != vector_size; c++) vec[k * vector_size + c] /= len; 196 | } 197 | fclose(fi); 198 | printf("Raw-init size: %lld\n", T); 199 | printf("Inited size: %lld\n", S); 200 | printf("Vector size: %lld\n", vector_size); 201 | } 202 | 203 | void Output() 204 | { 205 | FILE *fi, *fop, *fov; 206 | char wordu[MAX_STRING], wordv[MAX_STRING], label[MAX_STRING], word[MAX_STRING]; 207 | int u, v, wid, did, i, j, pstu, pstv, cnt; 208 | int *buf = (int *)calloc(doc_size, sizeof(int)); 209 | int *bg, *ed; 210 | double sum = 0; 211 | real *neu = (real *)malloc(vector_size * sizeof(real)); 212 | std::map mp; 213 | std::map::iterator iter; 214 | std::vector inbuf; 215 | fi = fopen(query_file, "rb"); 216 | fop = fopen(output_pair_file, "wb"); 217 | fov = fopen(output_vector_file, "wb"); 218 | while (1) 219 | { 220 | if (fscanf(fi, "%s %s %s", label, wordu, wordv) != 3) break; 221 | 222 | inbuf.clear(); 223 | while (1) 224 | { 225 | ReadWord(word, fi); 226 | if (strcmp(word, "") == 0) break; 227 | inbuf.push_back(word); 228 | } 229 | 230 | u = SearchVocab(wordu); 231 | v = SearchVocab(wordv); 232 | if (u == -1 || v == -1) continue; 233 | 234 | ed = set_intersection(wid2did[u].begin(), wid2did[u].end(), wid2did[v].begin(), wid2did[v].end(), buf); 235 | 236 | mp.clear(); 237 | for (bg = buf; bg != ed; bg++) 238 | { 239 | did = *bg; 240 | 241 | pstu = wid2did2pst[u][did]; 242 | pstv = wid2did2pst[v][did]; 243 | 244 | if (pstu > pstv) {i = pstv; j = pstu;} 245 | else {i = pstu; j = pstv;} 246 | 247 | if (j - i > 6) continue; 248 | for (int k = i - 1; k <= j + 1; k++) 249 | { 250 | if (k == i || k == j) continue; 251 | 252 | wid = doc[did][k]; 253 | mp[wid] += 1; 254 | } 255 | } 256 | 257 | if 
// Locate the command-line flag `str` in argv[1 .. argc-1].
//
// Returns the index of the first match, or -1 when the flag is absent.
// A flag found in the last position has no value after it: a message is
// printed and the program exits with status 1.
int ArgPos(char *str, int argc, char **argv) {
  for (int pos = 1; pos < argc; ++pos) {
    if (strcmp(str, argv[pos]) != 0) continue;
    if (pos + 1 == argc) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return pos;
  }
  return -1;
}
= ArgPos((char *)"-vector", argc, argv)) > 0) strcpy(vector_file, argv[i + 1]); 332 | if ((i = ArgPos((char *)"-text", argc, argv)) > 0) strcpy(text_file, argv[i + 1]); 333 | if ((i = ArgPos((char *)"-query", argc, argv)) > 0) strcpy(query_file, argv[i + 1]); 334 | if ((i = ArgPos((char *)"-output-pair", argc, argv)) > 0) strcpy(output_pair_file, argv[i + 1]); 335 | if ((i = ArgPos((char *)"-output-vector", argc, argv)) > 0) strcpy(output_vector_file, argv[i + 1]); 336 | TrainModel(); 337 | return 0; 338 | } 339 | -------------------------------------------------------------------------------- /codes/evaluation/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | text_file="../../data/text.txt" 4 | index_file="../../data/index.txt" 5 | eval_file="../../data/eval.txt" 6 | bipartite_file="../../data/bipartite.txt" 7 | 8 | vector_file="../dpe/word.emb" 9 | pat_file="../dpe/pat.txt" 10 | dis_file="../dpe/dis.txt" 11 | 12 | score_file="result.txt" 13 | 14 | k_nns=100 15 | k_max=10 16 | min_count=10 17 | vocab_size=0 18 | 19 | weight=0.1 20 | 21 | ./infer -train ${bipartite_file} -vector ${vector_file} -output cont.emb -debug 2 -binary 1 22 | ./concat -input1 ${vector_file} -input2 cont.emb -output concat.emb -debug 2 -binary 1 23 | python gen_model.py ${pat_file} ${dis_file} model.txt 24 | ./vocab -train ${text_file} -output vocab.txt -min-count ${min_count} -size ${vocab_size} 25 | ./gen_cand_eval -data ${eval_file} -vector concat.emb -output-cand cand.txt -output-pair pair.txt -k-max ${k_nns} -filter 1 26 | ./pair2bow -vector ${vector_file} -index ${index_file} -text ${text_file} -vocab vocab.txt -query pair.txt -output-vector data-eval.txt -output-pair pair-eval.txt 27 | ./liblinear/predict -b 1 -q data-eval.txt model.txt predict.txt 28 | 29 | python score_syn.py predict.txt pair-eval.txt score.txt 30 | 31 | python final_score_wt.py cand.txt score.txt ${score_file} ${weight} 32 | 33 | python cal_result.py 
"""Collapse per-pair classifier outputs into one score per (entity, candidate).

Usage: python score_syn.py <pred_file> <pair_file> <out_file>

pred_file -- prediction file whose first line is a header (e.g. "labels 1 0")
             and whose following lines hold one row of values per pair.
pair_file -- one whitespace-separated line per pair, in the same order as the
             prediction rows; field 2 is the name, field 3 the entity id and
             field 5 the (integer) candidate id.
out_file  -- receives "eid<TAB>cid<TAB>score<TAB>name" lines, candidates of
             each entity sorted by decreasing score.
"""
import sys
import os

pred_file = sys.argv[1]
pair_file = sys.argv[2]
out_file = sys.argv[3]

# --- read the per-pair scores -------------------------------------------------
score = []
with open(pred_file, 'r') as fi:
    # The first line is always consumed as the header.
    header = fi.readline().strip().split()
    # Column 0 of every row is the predicted label; column k (k >= 1) belongs
    # to the k-th label named in the header. Use the column of label '1',
    # wherever it appears. If the header does not name label '1', fall back
    # to column 0.
    pst = 0
    if header[:1] == ['labels'] and '1' in header[1:]:
        pst = header.index('1', 1)
    for line in fi:
        score.append(float(line.strip().split()[pst]))

# --- group the scores by entity id and candidate id --------------------------
eid2cid2slst = {}
eidcid2name = {}
with open(pair_file, 'r') as fi:
    # The n-th pair line corresponds to the n-th score.
    for cnt, line in enumerate(fi):
        fields = line.strip().split()
        eid = fields[3]
        cid = int(fields[5])
        name = fields[2]

        eid2cid2slst.setdefault(eid, {}).setdefault(cid, []).append(score[cnt])
        eidcid2name[(eid, cid)] = name

# --- write one line per (entity, candidate) ----------------------------------
with open(out_file, 'w') as fo:
    for eid, dic in eid2cid2slst.items():
        # A candidate's score is the maximum over all of its pairs (floored
        # at 0.0), not the mean.
        cid2sc = {}
        for cid, slst in dic.items():
            cid2sc[cid] = max([0.0] + slst)

        ranked = sorted(cid2sc.items(), key=lambda x: x[1], reverse=True)

        for cid, sc in ranked:
            name = eidcid2name.get((eid, cid), 'None')
            fo.write(eid + '\t' + str(cid) + '\t' + str(sc) + '\t' + name + '\n')
-------------------------------------------------------------------------------- /codes/evaluation/vocab.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #define MAX_STRING 100 8 | using namespace std; 9 | 10 | const int vocab_hash_size = 30000000; 11 | 12 | struct vocab_word { 13 | long long cn, ndoc; 14 | char *word; 15 | }; 16 | 17 | long long train_words = 0; 18 | int min_count = 0, min_reduce = 1; 19 | char train_file[MAX_STRING], output_file[MAX_STRING]; 20 | 21 | struct vocab_word *vocab; 22 | int *vocab_hash; 23 | long long vocab_max_size = 1000, vocab_size = 0, output_size = 0, doc_size = 0; 24 | std::set doc; 25 | 26 | void ReadWord(char *word, FILE *fin) { 27 | int a = 0, ch; 28 | while (!feof(fin)) { 29 | ch = fgetc(fin); 30 | if (ch == 13) continue; 31 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 32 | if (a > 0) { 33 | if (ch == '\n') ungetc(ch, fin); 34 | break; 35 | } 36 | if (ch == '\n') { 37 | strcpy(word, (char *)""); 38 | return; 39 | } else continue; 40 | } 41 | word[a] = ch; 42 | a++; 43 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 44 | } 45 | word[a] = 0; 46 | } 47 | int GetWordHash(char *word) { 48 | unsigned long long a, hash = 0; 49 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 50 | hash = hash % vocab_hash_size; 51 | return hash; 52 | } 53 | 54 | int SearchVocab(char *word) { 55 | unsigned int hash = GetWordHash(word); 56 | while (1) { 57 | if (vocab_hash[hash] == -1) return -1; 58 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 59 | hash = (hash + 1) % vocab_hash_size; 60 | } 61 | return -1; 62 | } 63 | 64 | int ReadWordIndex(FILE *fin) { 65 | char word[MAX_STRING]; 66 | ReadWord(word, fin); 67 | if (feof(fin)) return -1; 68 | return SearchVocab(word); 69 | } 70 | 71 | int AddWordToVocab(char *word) { 72 | unsigned int hash, length = strlen(word) + 1; 
73 | if (length > MAX_STRING) length = MAX_STRING; 74 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 75 | strcpy(vocab[vocab_size].word, word); 76 | vocab[vocab_size].cn = 0; 77 | vocab[vocab_size].ndoc = 0; 78 | vocab_size++; 79 | // Reallocate memory if needed 80 | if (vocab_size + 2 >= vocab_max_size) { 81 | vocab_max_size += 1000; 82 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 83 | } 84 | hash = GetWordHash(word); 85 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 86 | vocab_hash[hash] = vocab_size - 1; 87 | return vocab_size - 1; 88 | } 89 | 90 | int VocabCompare(const void *a, const void *b) { 91 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 92 | } 93 | 94 | void SortVocab() { 95 | int a, size; 96 | unsigned int hash; 97 | // Sort the vocabulary and keep at the first position 98 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 99 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 100 | size = vocab_size; 101 | train_words = 0; 102 | for (a = 0; a < size; a++) { 103 | // Words occuring less than min_count times will be discarded from the vocab 104 | if (vocab[a].cn < min_count) { 105 | vocab_size--; 106 | free(vocab[vocab_size].word); 107 | } else { 108 | // Hash will be re-computed, as after the sorting it is not actual 109 | hash=GetWordHash(vocab[a].word); 110 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 111 | vocab_hash[hash] = a; 112 | train_words += vocab[a].cn; 113 | } 114 | } 115 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 116 | } 117 | 118 | void ReduceVocab() { 119 | int a, b = 0; 120 | unsigned int hash; 121 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 122 | vocab[b].cn = vocab[a].cn; 123 | vocab[b].word = vocab[a].word; 124 | b++; 125 | } else free(vocab[a].word); 126 | vocab_size = b; 127 | for (a = 0; a < 
vocab_hash_size; a++) vocab_hash[a] = -1; 128 | for (a = 0; a < vocab_size; a++) { 129 | // Hash will be re-computed, as it is not actual 130 | hash = GetWordHash(vocab[a].word); 131 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 132 | vocab_hash[hash] = a; 133 | } 134 | fflush(stdout); 135 | min_reduce++; 136 | } 137 | 138 | void LearnVocabFromTrainFile() { 139 | char word[MAX_STRING]; 140 | FILE *fin; 141 | long long a, i; 142 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 143 | fin = fopen(train_file, "rb"); 144 | if (fin == NULL) { 145 | printf("ERROR: training data file not found!\n"); 146 | exit(1); 147 | } 148 | vocab_size = 0; 149 | AddWordToVocab((char *)""); 150 | while (1) { 151 | ReadWord(word, fin); 152 | if (feof(fin)) break; 153 | train_words++; 154 | if (train_words % 100000 == 0) { 155 | printf("%lldK%c", train_words / 1000, 13); 156 | fflush(stdout); 157 | } 158 | i = SearchVocab(word); 159 | if (i == -1) { 160 | a = AddWordToVocab(word); 161 | vocab[a].cn = 1; 162 | } else vocab[i].cn++; 163 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 164 | } 165 | SortVocab(); 166 | printf("Vocab size: %lld\n", vocab_size); 167 | printf("Words in train file: %lld\n", train_words); 168 | fclose(fin); 169 | } 170 | 171 | 172 | void IDF() 173 | { 174 | FILE *fi = fopen(train_file, "rb"); 175 | char word[MAX_STRING]; 176 | int wid; 177 | std::set::iterator iter; 178 | doc.clear(); 179 | while (1) 180 | { 181 | ReadWord(word, fi); 182 | if (feof(fi)) break; 183 | 184 | wid = SearchVocab(word); 185 | if (wid == -1) continue; 186 | doc.insert(wid); 187 | if (wid == 0) 188 | { 189 | for (iter = doc.begin(); iter != doc.end(); iter++) 190 | { 191 | wid = (*iter); 192 | vocab[wid].ndoc += 1; 193 | } 194 | doc.clear(); 195 | doc_size += 1; 196 | } 197 | } 198 | fclose(fi); 199 | printf("Doc size: %lld\n", doc_size); 200 | } 201 | 202 | void TrainModel() 203 | { 204 | vocab = (struct vocab_word *)calloc(vocab_max_size, 
sizeof(struct vocab_word)); 205 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 206 | LearnVocabFromTrainFile(); 207 | IDF(); 208 | if (output_size == 0) output_size = vocab_size; 209 | printf("Output size: %lld\n", output_size); 210 | 211 | FILE *fo = fopen(output_file, "wb"); 212 | for (int k = 0; k != output_size; k++) 213 | fprintf(fo, "%s\t%d\t%lf\n", vocab[k].word, k, log(doc_size * 1.0 / (vocab[k].ndoc + 1.0))); 214 | fclose(fo); 215 | } 216 | 217 | int ArgPos(char *str, int argc, char **argv) { 218 | int a; 219 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 220 | if (a == argc - 1) { 221 | printf("Argument missing for %s\n", str); 222 | exit(1); 223 | } 224 | return a; 225 | } 226 | return -1; 227 | } 228 | 229 | int main(int argc, char **argv) { 230 | int i; 231 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 232 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 233 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) output_size = atoi(argv[i + 1]); 234 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 235 | TrainModel(); 236 | return 0; 237 | } 238 | -------------------------------------------------------------------------------- /codes/preprocess/data2net.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #define MAX_STRING 200 8 | using namespace std; 9 | 10 | const int vocab_hash_size = 30000000; 11 | 12 | struct vocab_word { 13 | long long cn; 14 | char *word; 15 | }; 16 | 17 | struct biterm 18 | { 19 | int u; 20 | int v; 21 | friend bool operator<(biterm b1,biterm b2) 22 | { 23 | if(b1.u==b2.u) 24 | return b1.v btm2cnt; 30 | //map word2cnt; 31 | long long totalword=0,totalbtm=0; 32 | long long last_processed=0,processed=0,file_size=0,train_words=0; 33 | int 
min_count=0,min_reduce=1,debug_mode=2,window; 34 | char input_file[MAX_STRING],output_file[MAX_STRING]; 35 | 36 | struct vocab_word *vocab; 37 | int *vocab_hash; 38 | long long vocab_max_size = 1000, vocab_size = 0; 39 | 40 | void ReadWord(char *word, FILE *fin) { 41 | int a = 0, ch; 42 | while (!feof(fin)) { 43 | ch = fgetc(fin); 44 | if (ch == 13) continue; 45 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 46 | if (a > 0) { 47 | if (ch == '\n') ungetc(ch, fin); 48 | break; 49 | } 50 | if (ch == '\n') { 51 | strcpy(word, (char *)""); 52 | return; 53 | } else continue; 54 | } 55 | word[a] = ch; 56 | a++; 57 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 58 | } 59 | word[a] = 0; 60 | } 61 | int GetWordHash(char *word) { 62 | unsigned long long a, hash = 0; 63 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 64 | hash = hash % vocab_hash_size; 65 | return hash; 66 | } 67 | 68 | int SearchVocab(char *word) { 69 | unsigned int hash = GetWordHash(word); 70 | while (1) { 71 | if (vocab_hash[hash] == -1) return -1; 72 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 73 | hash = (hash + 1) % vocab_hash_size; 74 | } 75 | return -1; 76 | } 77 | 78 | int ReadWordIndex(FILE *fin) { 79 | char word[MAX_STRING]; 80 | ReadWord(word, fin); 81 | if (feof(fin)) return -1; 82 | return SearchVocab(word); 83 | } 84 | 85 | int AddWordToVocab(char *word) { 86 | unsigned int hash, length = strlen(word) + 1; 87 | if (length > MAX_STRING) length = MAX_STRING; 88 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 89 | strcpy(vocab[vocab_size].word, word); 90 | vocab[vocab_size].cn = 0; 91 | vocab_size++; 92 | // Reallocate memory if needed 93 | if (vocab_size + 2 >= vocab_max_size) { 94 | vocab_max_size += 1000; 95 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 96 | } 97 | hash = GetWordHash(word); 98 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 99 | 
vocab_hash[hash] = vocab_size - 1; 100 | return vocab_size - 1; 101 | } 102 | 103 | int VocabCompare(const void *a, const void *b) { 104 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 105 | } 106 | 107 | void SortVocab() { 108 | int a, size; 109 | unsigned int hash; 110 | // Sort the vocabulary and keep at the first position 111 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 112 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 113 | size = vocab_size; 114 | train_words = 0; 115 | for (a = 0; a < size; a++) { 116 | // Words occuring less than min_count times will be discarded from the vocab 117 | if (vocab[a].cn < min_count) { 118 | vocab_size--; 119 | free(vocab[vocab_size].word); 120 | } else { 121 | // Hash will be re-computed, as after the sorting it is not actual 122 | hash=GetWordHash(vocab[a].word); 123 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 124 | vocab_hash[hash] = a; 125 | train_words += vocab[a].cn; 126 | } 127 | } 128 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 129 | } 130 | 131 | void ReduceVocab() { 132 | int a, b = 0; 133 | unsigned int hash; 134 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 135 | vocab[b].cn = vocab[a].cn; 136 | vocab[b].word = vocab[a].word; 137 | b++; 138 | } else free(vocab[a].word); 139 | vocab_size = b; 140 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 141 | for (a = 0; a < vocab_size; a++) { 142 | // Hash will be re-computed, as it is not actual 143 | hash = GetWordHash(vocab[a].word); 144 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 145 | vocab_hash[hash] = a; 146 | } 147 | fflush(stdout); 148 | min_reduce++; 149 | } 150 | 151 | void LearnVocabFromTrainFile() { 152 | char word[MAX_STRING]; 153 | FILE *fin; 154 | long long a, i; 155 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 156 | fin = fopen(input_file, "rb"); 
157 | if (fin == NULL) { 158 | printf("ERROR: training data file not found!\n"); 159 | exit(1); 160 | } 161 | vocab_size = 0; 162 | AddWordToVocab((char *)""); 163 | while (1) { 164 | ReadWord(word, fin); 165 | if (feof(fin)) break; 166 | train_words++; 167 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 168 | printf("%lldK%c", train_words / 1000, 13); 169 | fflush(stdout); 170 | } 171 | i = SearchVocab(word); 172 | if (i == -1) { 173 | a = AddWordToVocab(word); 174 | vocab[a].cn = 1; 175 | } else vocab[i].cn++; 176 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 177 | } 178 | SortVocab(); 179 | if (debug_mode > 0) { 180 | printf("Vocab size: %lld\n", vocab_size); 181 | printf("Words in train file: %lld\n", train_words); 182 | } 183 | file_size = ftell(fin); 184 | fclose(fin); 185 | } 186 | 187 | void TrainModel() 188 | { 189 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 190 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 191 | LearnVocabFromTrainFile(); 192 | 193 | FILE *fi=fopen(input_file,"r"); 194 | fseek(fi, 0, SEEK_END); 195 | file_size=ftell(fi); 196 | fclose(fi); 197 | 198 | fi=fopen(input_file,"r"); 199 | int word; 200 | biterm btm; 201 | int *buf=new int [window+2]; 202 | int pst=0,exch=0; 203 | while(!feof(fi)) 204 | { 205 | word=ReadWordIndex(fi); 206 | if(word==-1) continue; 207 | 208 | if(processed-last_processed>10000) 209 | { 210 | printf("%cRead file: %.3lf%%", 13, double(processed)/train_words*100); 211 | fflush(stdout); 212 | last_processed=processed; 213 | } 214 | 215 | processed++; 216 | if(word==0) { pst=0; exch=0; continue; } 217 | 218 | 219 | //word2cnt[word]++; 220 | //totalword++; 221 | //btm.u=word;btm.v=word;btm2cnt[btm]++; 222 | //totalbtm++; 223 | for(int k=0;k!=pst;k++) 224 | { 225 | btm.u=word;btm.v=buf[k];btm2cnt[btm]++; 226 | btm.u=buf[k];btm.v=word;btm2cnt[btm]++; 227 | 228 | totalbtm+=2; 229 | } 230 | 231 | if(pst=window) exch=0; 237 | } 238 | } 239 | printf("\n"); 
240 | fclose(fi); 241 | 242 | FILE *fo=fopen(output_file,"w"); 243 | long long btmsize=btm2cnt.size(); 244 | printf("Number of edges: %lld\n", btmsize); 245 | long long written=0; 246 | map::iterator iter=btm2cnt.begin(); 247 | while(iter!=btm2cnt.end()) 248 | { 249 | if(written%10000==0) 250 | { 251 | printf("%cWrite file: %.3lf%%", 13, double(written)/btmsize*100); 252 | fflush(stdout); 253 | } 254 | fprintf(fo,"%s\t%s\t%lld\n",vocab[(iter->first).u].word,vocab[(iter->first).v].word,(iter->second)); 255 | 256 | written++; 257 | iter++; 258 | } 259 | printf("\n"); 260 | fclose(fo); 261 | } 262 | 263 | int ArgPos(char *str, int argc, char **argv) { 264 | int a; 265 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 266 | if (a == argc - 1) { 267 | printf("Argument missing for %s\n", str); 268 | exit(1); 269 | } 270 | return a; 271 | } 272 | return -1; 273 | } 274 | 275 | int main(int argc, char **argv) { 276 | int i; 277 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(input_file, argv[i + 1]); 278 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 279 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 280 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 281 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 282 | TrainModel(); 283 | return 0; 284 | } 285 | -------------------------------------------------------------------------------- /codes/preprocess/gen_index.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | 5 | ent2dic = {} 6 | fi = open(sys.argv[1], 'r') 7 | fo = open(sys.argv[2], 'w') 8 | lineid = 0 9 | for line in fi: 10 | if lineid % 100000 == 0: 11 | print lineid 12 | 13 | lst = line.strip().split() 14 | for k in range(len(lst)): 15 | #if lst[k].find('||') == -1: 16 | # continue 17 | ent = lst[k] 18 | if 
ent2dic.get(ent, None) == None: 19 | ent2dic[ent] = {} 20 | ent2dic[ent][lineid] = k 21 | 22 | lineid += 1 23 | 24 | for ent, dic in ent2dic.items(): 25 | fo.write(ent) 26 | for lid, pst in dic.items(): 27 | fo.write(' ' + str(lid) + ':' + str(pst)) 28 | fo.write('\n') 29 | 30 | fi.close() 31 | fo.close() 32 | -------------------------------------------------------------------------------- /codes/preprocess/gen_pattern.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #define MAX_STRING 500 12 | using namespace std; 13 | 14 | const int vocab_hash_size = 30000000; 15 | 16 | typedef float real; // Precision of float numbers 17 | 18 | struct vocab_word { 19 | char word[MAX_STRING]; 20 | }; 21 | 22 | char vocab_file[MAX_STRING], index_file[MAX_STRING], text_file[MAX_STRING], query_file[MAX_STRING], output_pattern_file[MAX_STRING]; 23 | 24 | struct vocab_word *vocab; 25 | int *vocab_hash; 26 | long long vocab_max_size = 1000, vocab_size = 0, doc_size = 0, vector_size = 0, query_size = 0; 27 | real *vec; 28 | std::set *wid2did; 29 | std::map *wid2did2pst; 30 | std::vector< std::vector > doc; 31 | 32 | void ReadWord(char *word, FILE *fin) { 33 | int a = 0, ch; 34 | while (!feof(fin)) { 35 | ch = fgetc(fin); 36 | if (ch == 13) continue; 37 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 38 | if (a > 0) { 39 | if (ch == '\n') ungetc(ch, fin); 40 | break; 41 | } 42 | if (ch == '\n') { 43 | strcpy(word, (char *)""); 44 | return; 45 | } else continue; 46 | } 47 | word[a] = ch; 48 | a++; 49 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 50 | } 51 | word[a] = 0; 52 | } 53 | int GetWordHash(char *word) { 54 | unsigned long long a, hash = 0; 55 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 56 | hash = hash % vocab_hash_size; 57 | return hash; 58 | } 59 | 60 | int 
SearchVocab(char *word) { 61 | unsigned int hash = GetWordHash(word); 62 | while (1) { 63 | if (vocab_hash[hash] == -1) return -1; 64 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 65 | hash = (hash + 1) % vocab_hash_size; 66 | } 67 | return -1; 68 | } 69 | 70 | // Adds a word to the vocabulary 71 | int AddWordToVocab(char *word) { 72 | unsigned int hash, length = strlen(word) + 1; 73 | if (length > MAX_STRING) length = MAX_STRING; 74 | strcpy(vocab[vocab_size].word, word); 75 | vocab_size++; 76 | // Reallocate memory if needed 77 | if (vocab_size + 2 >= vocab_max_size) { 78 | vocab_max_size += 1000; 79 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 80 | } 81 | hash = GetWordHash(word); 82 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 83 | vocab_hash[hash] = vocab_size - 1; 84 | return vocab_size - 1; 85 | } 86 | 87 | void ReadVocab() 88 | { 89 | char word[MAX_STRING]; 90 | FILE *fin; 91 | for (long long a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 92 | fin = fopen(vocab_file, "rb"); 93 | if (fin == NULL) { 94 | printf("ERROR: training data file not found!\n"); 95 | exit(1); 96 | } 97 | vocab_size = 0; 98 | while (1) 99 | { 100 | if (fscanf(fin, "%s", word) != 1) break; 101 | if (SearchVocab(word) != -1) continue; 102 | AddWordToVocab(word); 103 | } 104 | fclose(fin); 105 | printf("Vocab size: %lld\n", vocab_size); 106 | wid2did = new std::set [vocab_size]; 107 | wid2did2pst = new std::map [vocab_size]; 108 | } 109 | 110 | void ReadIndex() 111 | { 112 | FILE *fi = fopen(index_file, "rb"); 113 | char word[MAX_STRING * 10]; 114 | int wid, did, pst; 115 | while (1) 116 | { 117 | if (fscanf(fi, "%s", word) != 1) break; 118 | 119 | wid = SearchVocab(word); 120 | 121 | while (1) 122 | { 123 | ReadWord(word, fi); 124 | if (strcmp(word, "") == 0) break; 125 | sscanf(word, "%d:%d", &did, &pst); 126 | if (wid == -1) continue; 127 | wid2did[wid].insert(did); 128 | 
wid2did2pst[wid][did] = pst; 129 | } 130 | } 131 | fclose(fi); 132 | } 133 | 134 | void ReadText() 135 | { 136 | FILE *fi = fopen(text_file, "rb"); 137 | char word[MAX_STRING]; 138 | int wid; 139 | std::vector curdoc; 140 | doc_size = 0; 141 | while (1) 142 | { 143 | ReadWord(word, fi); 144 | if (feof(fi)) break; 145 | 146 | wid = SearchVocab(word); 147 | curdoc.push_back(wid); 148 | if (strcmp(word, "") == 0) 149 | { 150 | doc.push_back(curdoc); 151 | curdoc.clear(); 152 | doc_size++; 153 | } 154 | } 155 | fclose(fi); 156 | printf("Text size: %lld\n", doc_size); 157 | } 158 | 159 | void Output() 160 | { 161 | FILE *fi, *fo; 162 | char wordu[MAX_STRING], wordv[MAX_STRING], label[MAX_STRING], word[MAX_STRING]; 163 | int u, v, wid, did, i, j, pstu, pstv, cnt; 164 | int *buf = (int *)calloc(doc_size, sizeof(int)); 165 | int *bg, *ed; 166 | double sum = 0; 167 | std::map::iterator iter; 168 | std::vector inbuf; 169 | fi = fopen(query_file, "rb"); 170 | fo = fopen(output_pattern_file, "wb"); 171 | while (1) 172 | { 173 | if (fscanf(fi, "%s %s %s", label, wordu, wordv) != 3) break; 174 | 175 | inbuf.clear(); 176 | while (1) 177 | { 178 | ReadWord(word, fi); 179 | if (strcmp(word, "") == 0) break; 180 | inbuf.push_back(word); 181 | } 182 | 183 | u = SearchVocab(wordu); 184 | v = SearchVocab(wordv); 185 | if (u == -1 || v == -1) continue; 186 | 187 | ed = set_intersection(wid2did[u].begin(), wid2did[u].end(), wid2did[v].begin(), wid2did[v].end(), buf); 188 | 189 | for (bg = buf; bg != ed; bg++) 190 | { 191 | did = *bg; 192 | 193 | pstu = wid2did2pst[u][did]; 194 | pstv = wid2did2pst[v][did]; 195 | 196 | if (pstu > pstv) {i = pstv; j = pstu;} 197 | else {i = pstu; j = pstv;} 198 | 199 | if (j - i > 6) continue; 200 | 201 | fprintf(fo, "%s", label); 202 | for (int k = i - 1; k <= j + 1; k++) 203 | { 204 | if (k == i || k == j) continue; 205 | 206 | wid = doc[did][k]; 207 | if (wid == -1) continue; 208 | fprintf(fo, " %s", vocab[wid].word); 209 | } 210 | fprintf(fo, "\n"); 
211 | } 212 | 213 | query_size += 1; 214 | } 215 | fclose(fi); 216 | fclose(fo); 217 | free(buf); 218 | printf("Query size: %lld\n", query_size); 219 | } 220 | 221 | void TrainModel() 222 | { 223 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 224 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 225 | 226 | ReadVocab(); 227 | ReadIndex(); 228 | ReadText(); 229 | 230 | Output(); 231 | } 232 | 233 | int ArgPos(char *str, int argc, char **argv) { 234 | int a; 235 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 236 | if (a == argc - 1) { 237 | printf("Argument missing for %s\n", str); 238 | exit(1); 239 | } 240 | return a; 241 | } 242 | return -1; 243 | } 244 | 245 | int main(int argc, char **argv) { 246 | int i; 247 | if ((i = ArgPos((char *)"-vocab", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]); 248 | if ((i = ArgPos((char *)"-index", argc, argv)) > 0) strcpy(index_file, argv[i + 1]); 249 | if ((i = ArgPos((char *)"-text", argc, argv)) > 0) strcpy(text_file, argv[i + 1]); 250 | if ((i = ArgPos((char *)"-query", argc, argv)) > 0) strcpy(query_file, argv[i + 1]); 251 | if ((i = ArgPos((char *)"-output-pattern", argc, argv)) > 0) strcpy(output_pattern_file, argv[i + 1]); 252 | TrainModel(); 253 | return 0; 254 | } 255 | -------------------------------------------------------------------------------- /codes/preprocess/label.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | syn_file = sys.argv[1] 5 | pair_file = sys.argv[2] 6 | out_file = sys.argv[3] 7 | 8 | pair2lb = {} 9 | 10 | fi = open(syn_file, 'r') 11 | for line in fi: 12 | lst = line.strip().split() 13 | u = lst[0] 14 | v = lst[1] 15 | pair2lb[(u,v)] = 1 16 | fi.close() 17 | 18 | fi = open(pair_file, 'r') 19 | fo = open(out_file, 'w') 20 | for line in fi: 21 | lst = line.strip().split() 22 | u = lst[0] 23 | v = lst[1] 24 | lb = pair2lb.get((u,v), 0) 25 | fo.write(str(lb) + '\t' + u + 
'\t' + v + '\n') 26 | fi.close() 27 | fo.close() 28 | -------------------------------------------------------------------------------- /codes/preprocess/make.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | g++ -O2 data2net.cpp -o data2net 4 | g++ -O2 gen_pattern.cpp -o gen_pattern 5 | g++ -O2 vocab.cpp -o vocab 6 | -------------------------------------------------------------------------------- /codes/preprocess/netww2netwe.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | fi = open(sys.argv[1], 'r') 5 | fo = open(sys.argv[2], 'w') 6 | cnt = 0 7 | for line in fi: 8 | if cnt % 1000000 == 0: 9 | print cnt 10 | cnt += 1 11 | v = line.strip().split()[1] 12 | if v.find('||') == -1: 13 | continue 14 | fo.write(line) 15 | print cnt 16 | fi.close() 17 | fo.close() 18 | 19 | -------------------------------------------------------------------------------- /codes/preprocess/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | text_file="../../data/text.txt" 4 | pair_file="../../data/pairs.txt" 5 | parsing_file="../../data/parsing.txt" 6 | eval_file="../../data/eval.txt" 7 | knn_file="../../data/knn.txt" 8 | 9 | window=5 10 | min_count=10 11 | vocab_size=0 12 | 13 | net_file="../../data/net.txt" 14 | index_file="../../data/index.txt" 15 | string_set="../../data/string.set" 16 | pattern_file="../../data/pattern.txt" 17 | bipartite_file="../../data/bipartite.txt" 18 | 19 | ./data2net -train ${text_file} -output ${net_file} -debug 2 -window ${window} -min-count ${min_count} 20 | python vocab.py ${net_file} ${string_set} 21 | python gen_index.py ${text_file} ${index_file} 22 | python label.py ${pair_file} ${knn_file} pairs.txt 23 | ./vocab -train ${text_file} -output vocab.txt -min-count ${min_count} -size ${vocab_size} 24 | ./gen_pattern -index ${index_file} -text ${text_file} -vocab vocab.txt 
-query pairs.txt -output-pattern ${pattern_file} -parsing ${parsing_file} 25 | python netww2netwe.py ${net_file} ${bipartite_file} 26 | 27 | rm -rf pairs.txt vocab.txt 28 | -------------------------------------------------------------------------------- /codes/preprocess/vocab.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #define MAX_STRING 100 8 | using namespace std; 9 | 10 | const int vocab_hash_size = 30000000; 11 | 12 | struct vocab_word { 13 | long long cn; 14 | char *word; 15 | }; 16 | 17 | long long train_words = 0; 18 | int min_count = 0, min_reduce = 1; 19 | char train_file[MAX_STRING], output_file[MAX_STRING]; 20 | 21 | struct vocab_word *vocab; 22 | int *vocab_hash; 23 | long long vocab_max_size = 1000, vocab_size = 0, output_size = 0; 24 | 25 | void ReadWord(char *word, FILE *fin) { 26 | int a = 0, ch; 27 | while (!feof(fin)) { 28 | ch = fgetc(fin); 29 | if (ch == 13) continue; 30 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 31 | if (a > 0) { 32 | if (ch == '\n') ungetc(ch, fin); 33 | break; 34 | } 35 | if (ch == '\n') { 36 | strcpy(word, (char *)""); 37 | return; 38 | } else continue; 39 | } 40 | word[a] = ch; 41 | a++; 42 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 43 | } 44 | word[a] = 0; 45 | } 46 | int GetWordHash(char *word) { 47 | unsigned long long a, hash = 0; 48 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 49 | hash = hash % vocab_hash_size; 50 | return hash; 51 | } 52 | 53 | int SearchVocab(char *word) { 54 | unsigned int hash = GetWordHash(word); 55 | while (1) { 56 | if (vocab_hash[hash] == -1) return -1; 57 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 58 | hash = (hash + 1) % vocab_hash_size; 59 | } 60 | return -1; 61 | } 62 | 63 | int ReadWordIndex(FILE *fin) { 64 | char word[MAX_STRING]; 65 | ReadWord(word, fin); 66 | if (feof(fin)) return -1; 
67 | return SearchVocab(word); 68 | } 69 | 70 | int AddWordToVocab(char *word) { 71 | unsigned int hash, length = strlen(word) + 1; 72 | if (length > MAX_STRING) length = MAX_STRING; 73 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 74 | strcpy(vocab[vocab_size].word, word); 75 | vocab[vocab_size].cn = 0; 76 | vocab_size++; 77 | // Reallocate memory if needed 78 | if (vocab_size + 2 >= vocab_max_size) { 79 | vocab_max_size += 1000; 80 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 81 | } 82 | hash = GetWordHash(word); 83 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 84 | vocab_hash[hash] = vocab_size - 1; 85 | return vocab_size - 1; 86 | } 87 | 88 | int VocabCompare(const void *a, const void *b) { 89 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 90 | } 91 | 92 | void SortVocab() { 93 | int a, size; 94 | unsigned int hash; 95 | // Sort the vocabulary and keep at the first position 96 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 97 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 98 | size = vocab_size; 99 | train_words = 0; 100 | for (a = 0; a < size; a++) { 101 | // Words occuring less than min_count times will be discarded from the vocab 102 | if (vocab[a].cn < min_count) { 103 | vocab_size--; 104 | free(vocab[vocab_size].word); 105 | } else { 106 | // Hash will be re-computed, as after the sorting it is not actual 107 | hash=GetWordHash(vocab[a].word); 108 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 109 | vocab_hash[hash] = a; 110 | train_words += vocab[a].cn; 111 | } 112 | } 113 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 114 | } 115 | 116 | void ReduceVocab() { 117 | int a, b = 0; 118 | unsigned int hash; 119 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 120 | vocab[b].cn = vocab[a].cn; 121 | vocab[b].word = 
vocab[a].word; 122 | b++; 123 | } else free(vocab[a].word); 124 | vocab_size = b; 125 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 126 | for (a = 0; a < vocab_size; a++) { 127 | // Hash will be re-computed, as it is not actual 128 | hash = GetWordHash(vocab[a].word); 129 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 130 | vocab_hash[hash] = a; 131 | } 132 | fflush(stdout); 133 | min_reduce++; 134 | } 135 | 136 | void LearnVocabFromTrainFile() { 137 | char word[MAX_STRING]; 138 | FILE *fin; 139 | long long a, i; 140 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 141 | fin = fopen(train_file, "rb"); 142 | if (fin == NULL) { 143 | printf("ERROR: training data file not found!\n"); 144 | exit(1); 145 | } 146 | vocab_size = 0; 147 | AddWordToVocab((char *)""); 148 | while (1) { 149 | ReadWord(word, fin); 150 | if (feof(fin)) break; 151 | train_words++; 152 | if (train_words % 100000 == 0) { 153 | printf("%lldK%c", train_words / 1000, 13); 154 | fflush(stdout); 155 | } 156 | i = SearchVocab(word); 157 | if (i == -1) { 158 | a = AddWordToVocab(word); 159 | vocab[a].cn = 1; 160 | } else vocab[i].cn++; 161 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 162 | } 163 | SortVocab(); 164 | printf("Vocab size: %lld\n", vocab_size); 165 | printf("Words in train file: %lld\n", train_words); 166 | fclose(fin); 167 | } 168 | 169 | void TrainModel() 170 | { 171 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 172 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 173 | LearnVocabFromTrainFile(); 174 | if (output_size == 0) output_size = vocab_size; 175 | printf("Output size: %lld\n", output_size); 176 | 177 | FILE *fo = fopen(output_file, "wb"); 178 | for (int k = 0; k != output_size; k++) 179 | fprintf(fo, "%s\n", vocab[k].word); 180 | fclose(fo); 181 | } 182 | 183 | int ArgPos(char *str, int argc, char **argv) { 184 | int a; 185 | for (a = 1; a < argc; a++) if (!strcmp(str, 
argv[a])) { 186 | if (a == argc - 1) { 187 | printf("Argument missing for %s\n", str); 188 | exit(1); 189 | } 190 | return a; 191 | } 192 | return -1; 193 | } 194 | 195 | int main(int argc, char **argv) { 196 | int i; 197 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 198 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 199 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) output_size = atoi(argv[i + 1]); 200 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 201 | TrainModel(); 202 | return 0; 203 | } 204 | -------------------------------------------------------------------------------- /codes/preprocess/vocab.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | fi = open(sys.argv[1], 'r') 5 | fo = open(sys.argv[2], 'w') 6 | word2flag = {} 7 | for line in fi: 8 | lst = line.strip().split() 9 | u = lst[0] 10 | v = lst[1] 11 | word2flag[u] = 1 12 | word2flag[v] = 1 13 | for word in word2flag.keys(): 14 | fo.write(word + '\n') 15 | fi.close() 16 | fo.close() 17 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Wiki-Freebase 2 | The data can be downloaded from the following link: 3 | 4 | https://drive.google.com/file/d/1jOkKxAtsMIRufZmG2zrhC_AuI-Fg4S5y/view?usp=sharing 5 | 6 | Users may first put all files in the data folder, and then run the script we provide. 
#!/bin/sh
# File: /run.sh
#
# End-to-end driver: compiles the three code folders, then runs data
# preprocessing, model training and evaluation, and finally removes the
# intermediate files generated in the data folder.
# Must be started from the repository root (all paths below are relative).
#
# NOTE: in the flattened repository dump this chunk also carried
# /data/label.set, whose content is the two labels, one per line:
#     0
#     1

# Abort on the first failing command so that a failed `cd` or build step can
# never let a later step (in particular the `rm -rf` below) run in the wrong
# directory or against stale binaries.
set -e

### Compile the codes
cd codes

cd preprocess
./make.sh
cd ..

cd dpe
make
cd ..

cd evaluation
./make.sh
cd ..

### Data preprocessing
cd preprocess
./run.sh
cd ..

### Model training
cd dpe
./run.sh
cd ..

### Evaluation
cd evaluation
./run.sh
cd ..

### Clean temporary files
cd ../data
rm -rf net.txt index.txt string.set pattern.txt bipartite.txt
cd ..