├── README.md
├── RecsysChallenge2017.pdf
├── auto-pipeline
│   ├── App.config
│   ├── Data
│   │   ├── CleanOnlineData.cs
│   │   └── LocalDataGenrator.cs
│   ├── Program.cs
│   ├── Properties
│   │   └── AssemblyInfo.cs
│   ├── RecSys17.csproj
│   ├── SmallJobs.cs
│   ├── Utils.cs
│   ├── model
│   │   ├── DocumentClustering.cs
│   │   ├── DocumentRelated.cs
│   │   ├── Evaluation.cs
│   │   ├── FMProcessor.cs
│   │   ├── FeatureFactory.cs
│   │   ├── Item.cs
│   │   ├── ItemProfile.cs
│   │   ├── KNN.cs
│   │   ├── KeywordMgr.cs
│   │   ├── SubmissionHelper.cs
│   │   ├── User.cs
│   │   └── WordHashing.cs
│   └── py-pull_and_submit
│       ├── daily-pull-data.py
│       ├── model.py
│       ├── online_submit_auto.py
│       ├── online_submit_auto_-1.py
│       ├── online_submit_auto_2.py
│       ├── parser.py
│       └── recsys-submit-file.py
└── models
    ├── StudyNDCG.script
    ├── StudyNDCG.script.cs
    ├── TEST_Localmodel_tlc3_pipeline.script
    ├── TEST_Localmodel_tlc3_pipeline.script.cs
    ├── TEST_TrainModel_Pipeline_cls_tlc3_sparse.script
    ├── TEST_TrainModel_Pipeline_cls_tlc3_sparse.script.cs
    ├── TEST_tmp_Location_ExtractFeatures.script
    ├── TEST_tmp_Location_ExtractFeatures.script.cs
    ├── ensemble-2stage.script
    ├── ensemble-2stage.script.cs
    ├── ensemble.script
    └── ensemble.script.cs

/README.md:
--------------------------------------------------------------------------------
This is our source code for the RecSys Challenge 2017 (http://2017.recsyschallenge.com/).
The official rank of our team is 5th, and our final model ranked 2nd in each of the last two weeks. It is a great pity that we did not use our best model in the first two weeks: in most other competitions it is enough to switch to your best model just before the deadline, but that is not true in this competition, which differs from what we expected.

The code is written for Microsoft's internal big data platform, COSMOS, and the language is SCOPE. If you are interested in running it, you can try the public version on Azure, called Data Lake and U-SQL: https://docs.microsoft.com/en-us/azure/data-lake-analytics/data-lake-analytics-data-lake-tools-get-started .

Scripts under the folder 'models' extract features, train models, make predictions, and perform various post-processing steps.
The final features are in sparse SVMLight format.

Programs under the folder 'auto-pipeline' are the C# source code of our automatic pipeline.

RecsysChallenge2017.pdf is our workshop paper, "Practical Lessons for Job Recommendations in the Cold-Start Scenario".
https://dl.acm.org/citation.cfm?id=3124794

Jianxun Lian, Fuzheng Zhang, Min Hou, Hongwei Wang, Xing Xie, and Guangzhong Sun. 2017. Practical Lessons for Job Recommendations in the Cold-Start Scenario. In Proceedings of the Recommender Systems Challenge 2017 (RecSys Challenge '17). ACM, New York, NY, USA, Article 4, 6 pages.
DOI: https://doi.org/10.1145/3124791.3124794
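
For readers unfamiliar with the SVMLight sparse format: each line is a label followed by the non-zero features as `index:value` pairs, optionally ending with a `#` comment, which this pipeline uses to carry the record id (PrepareFMFile in auto-pipeline/Data/CleanOnlineData.cs splits each line at `#` for exactly this reason). A hypothetical example line:

0 12:1 305:0.5 4093:1 #uid_123|iid_456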
--------------------------------------------------------------------------------
/RecsysChallenge2017.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leavingseason/RecsysChallenge2017/a05489995ef42805c88ef0984fcb93df8f6ac276/RecsysChallenge2017.pdf
--------------------------------------------------------------------------------
/auto-pipeline/App.config:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
    <startup>
        <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
    </startup>
</configuration>
--------------------------------------------------------------------------------
/auto-pipeline/Data/CleanOnlineData.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace RecSys17.Data
{
    class CleanOnlineData
    {
        // Reorders the columns of a raw item dump to the canonical header order below,
        // whatever order the source file used.
        public static void AdjustItemColumns(string infile, string outfile)
        {
            string[] headers = "id\ttitle\tcareer_level\tdiscipline_id\tindustry_id\tcountry\tis_payed\tregion\tlatitude\tlongitude\temployment\ttags\tcreated_at".Split('\t');
            Dictionary<string, int> newheader2idx = new Dictionary<string, int>();
            Dictionary<int, string> idx2header = new Dictionary<int, string>();
            for (int i = 0; i < headers.Length; i++)
            {
                idx2header.Add(i, headers[i]);
            }

            using (StreamReader rd = new StreamReader(infile))
            using (StreamWriter wt = new StreamWriter(outfile))
            {
                string content = rd.ReadLine().Replace("recsyschallenge_vlive_2017_items.", "").Replace("recsyschallenge_vlive_2017_train_items_final.", "");
                string[] words = content.Split('\t');
                for (int i = 0; i < words.Length; i++)
                {
                    newheader2idx.Add(words[i], i);
                }

                string res = "";
                for (int i = 0; i < idx2header.Count; i++)
                {
                    res += "\t" + words[newheader2idx[idx2header[i]]];
                }
                wt.Write(res.Substring(1) + "\n");

                while ((content = rd.ReadLine()) != null)
                {
                    words = content.Split('\t');
                    res = "";
                    for (int i = 0; i < idx2header.Count; i++)
                    {
                        res += "\t" + words[newheader2idx[idx2header[i]]];
                    }
                    wt.Write(res.Substring(1) + "\n");
                }
            }
        }
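
        // A minimal usage sketch (file paths here are hypothetical, not from the repo):
        //
        //   CleanOnlineData.AdjustItemColumns(
        //       @"C:\data\items_raw.tsv",        // tab-separated item dump with a header row
        //       @"C:\data\items_adjusted.tsv");  // same rows, columns reordered to the canonical schema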
Path.Combine(path,"recsys17-pred-highdim-submit_v7.csv"), 72 | Path.Combine(path,"recsys17-pred-highdim-submit.csv") 73 | }; 74 | 75 | 76 | Dictionary item2cnt = new Dictionary(); 77 | Dictionary existing_usrs = new Dictionary(); 78 | 79 | Dictionary> item2newusers = new Dictionary>(); 80 | 81 | List> user_item_scores = new List>(); 82 | using (StreamReader rd = new StreamReader(user_side_file)) 83 | { 84 | string content = null; 85 | while ((content = rd.ReadLine()) != null) 86 | { 87 | string[] words = content.Split('\t'); 88 | user_item_scores.Add(new Tuple(words[0], words[1], float.Parse(words[2]))); 89 | } 90 | } 91 | 92 | 93 | using (StreamReader rd = new StreamReader(accept_pair_file)) 94 | { 95 | string content = null; 96 | while ((content = rd.ReadLine()) != null) 97 | { 98 | string[] words = content.Split('\t'); 99 | string[] tokens = words[1].Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries); 100 | 101 | item2cnt.Add(words[0].Trim(), tokens.Length); 102 | 103 | foreach (var token in tokens) 104 | { 105 | if (!existing_usrs.ContainsKey(token)) 106 | { 107 | existing_usrs.Add(token, 1); 108 | } 109 | } 110 | } 111 | } 112 | 113 | HashSet tried_pairs = new HashSet(); 114 | foreach (var file in tried_files) 115 | { 116 | using (StreamReader rd = new StreamReader(file)) 117 | { 118 | string content = null; 119 | while ((content = rd.ReadLine()) != null) 120 | { 121 | string[] words = content.Split('\t'); 122 | string[] tokens = words[1].Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries); 123 | 124 | foreach (var token in tokens) 125 | { 126 | tried_pairs.Add(token + ":" + words[0]); 127 | } 128 | } 129 | } 130 | } 131 | 132 | user_item_scores.Sort((a, b) => b.Item3.CompareTo(a.Item3)); 133 | 134 | foreach (var tuple in user_item_scores) 135 | { 136 | if (!tried_pairs.Contains(tuple.Item1 + ":" + tuple.Item2) && (!existing_usrs.ContainsKey(tuple.Item1) || existing_usrs[tuple.Item1]()); 146 | } 147 | item2newusers[tuple.Item2].Add(tuple.Item1); 148 | 149 | if(!existing_usrs.ContainsKey(tuple.Item1)) 150 | existing_usrs.Add(tuple.Item1,0); 151 | existing_usrs[tuple.Item1]++; 152 | } 153 | } 154 | 155 | using (StreamWriter wt = new StreamWriter(outfile)) 156 | { 157 | foreach (var pair in item2newusers) 158 | { 159 | wt.Write("{0}\t{1}\n", pair.Key, string.Join(",", pair.Value.ToArray())); 160 | } 161 | } 162 | } 163 | 164 | public static void PrepareFMFile(string infile, string outfile01, string outfile02) 165 | { 166 | using(StreamReader rd = new StreamReader(infile)) 167 | using(StreamWriter wt01 = new StreamWriter(outfile01)) 168 | using (StreamWriter wt02 = new StreamWriter(outfile02)) 169 | { 170 | string content = null; 171 | int cnt = 0; 172 | while ((content = rd.ReadLine()) != null) 173 | { 174 | if (cnt++ % 100000 == 0) 175 | { 176 | Console.Write("{0}\r",cnt); 177 | } 178 | int idx = content.IndexOf("#"); 179 | wt01.Write(content.Substring(0,idx)+"\n"); 180 | wt02.Write(content.Substring(idx+1)+"\n"); 181 | } 182 | } 183 | } 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /auto-pipeline/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. 
Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("RecSys17")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("RecSys17")] 13 | [assembly: AssemblyCopyright("Copyright © 2017")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("6666f690-4527-4d93-a733-3fc95f3fa7e4")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /auto-pipeline/RecSys17.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {6C3A3F7B-B868-4ECB-9BE0-E80A227551C1} 8 | Exe 9 | Properties 10 | RecSys17 11 | RecSys17 12 | v4.5 13 | 512 14 | 15 | 16 | x64 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | x64 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | D:\My Projects\LyncOnlineAnalyse\Tools\bin\Release\Tools.dll 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 79 | -------------------------------------------------------------------------------- /auto-pipeline/Utils.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17 9 | { 10 | class Utils 11 | { 12 | public static void OutputDict(Dictionary dict, string outfile) 13 | { 14 | using (StreamWriter wt = new StreamWriter(outfile)) 15 | { 16 | foreach (var pair in dict) 17 | { 18 | wt.WriteLine("{0},{1}", pair.Key, pair.Value); 19 | } 20 | } 21 | } 22 | 23 | public static Dictionary LoadDict(string infile, int keyIdx, int valueIdx) 24 | { 25 | Dictionary result = new Dictionary(); 26 | using (StreamReader rd = new StreamReader(infile)) 27 | { 28 | string content = null; 29 | while ((content = rd.ReadLine()) != null) 30 | { 31 | string[] words = content.Split(','); 32 | if(!string.IsNullOrEmpty(words[valueIdx])) 33 | result.Add(words[keyIdx], float.Parse(words[valueIdx])); 34 | } 35 | } 36 | return result; 37 | } 38 | 39 | 40 | public static void OverlapStat(string file01, string file02, int colidx) 41 | { 42 | HashSet values01 = LoadValue2Hashset(file01, colidx); 43 | HashSet values02 = LoadValue2Hashset(file02,colidx); 44 | 45 | int hit = values01.Intersect(values02).Count(); 46 | 47 | Console.WriteLine("{0}\t{1}\t{2}", 
hit, values01.Count, values02.Count); 48 | } 49 | 50 | private static HashSet LoadValue2Hashset(string file, int colidx) 51 | { 52 | HashSet res = new HashSet(); 53 | using (StreamReader rd = new StreamReader(file)) 54 | { 55 | string content = null; 56 | int cnt = 0; 57 | while ((content = rd.ReadLine()) != null) 58 | { 59 | if (cnt++ % 100000 == 0) 60 | { 61 | Console.WriteLine(cnt); 62 | } 63 | string[] words = content.Split(','); 64 | if (!res.Contains(words[colidx])) 65 | { 66 | res.Add(words[colidx]); 67 | } 68 | } 69 | } 70 | return res; 71 | } 72 | 73 | public static void SelectSubSet(string infile, string outfile, string[] col_names) 74 | { 75 | HashSet selectedFeatures = new HashSet(col_names); 76 | 77 | 78 | int cnt = 0; 79 | using (StreamReader rd = new StreamReader(infile)) 80 | using (StreamWriter wt = new StreamWriter(outfile)) 81 | { 82 | string content = rd.ReadLine(); 83 | string[] headers = content.Split(','); 84 | HashSet selectedFeatureIdx = new HashSet(); 85 | int dim = headers.Length; 86 | wt.Write(headers[0] + "," + headers[1]); 87 | for (int i = 2; i < dim; i++) 88 | { 89 | if (selectedFeatures.Contains(headers[i])) 90 | { 91 | selectedFeatureIdx.Add(i); 92 | wt.Write("," + headers[i]); 93 | } 94 | } 95 | wt.WriteLine(); 96 | 97 | while ((content = rd.ReadLine()) != null) 98 | { 99 | if (cnt++ % 10000 == 0) 100 | { 101 | Console.WriteLine(cnt); 102 | } 103 | string[] words = content.Split(','); 104 | wt.Write(words[0] + "," + words[1]); 105 | for (int i = 2; i < dim; i++) 106 | { 107 | if (selectedFeatureIdx.Contains(i)) 108 | { 109 | wt.Write("," + words[i]); 110 | } 111 | } 112 | wt.WriteLine(); 113 | } 114 | } 115 | } 116 | 117 | public static void ShuffleFile(string infile, string outfile) 118 | { 119 | Console.WriteLine("ShuffleFile..."); 120 | List lines = new List(); 121 | using (StreamReader rd = new StreamReader(infile)) 122 | { 123 | string content = null; 124 | int cnt = 0; 125 | while ((content = rd.ReadLine()) != null) 126 | { 127 | if (cnt++ % 1000000 == 0) 128 | { 129 | Console.Write(cnt + "\r"); 130 | } 131 | lines.Add(content); 132 | } 133 | } 134 | 135 | var arr = lines.ToArray(); 136 | Tools.Common.Shuffle(new Random(), arr); 137 | 138 | using (StreamWriter wt = new StreamWriter(outfile)) 139 | { 140 | foreach (var line in arr) 141 | { 142 | wt.WriteLine(line); 143 | } 144 | } 145 | } 146 | 147 | public static List RandomPickup(List list, int k) 148 | { 149 | if (list.Count <= k) 150 | { 151 | return new List(list); 152 | } 153 | 154 | int cnt = list.Count; 155 | Random rng = new Random(); 156 | for (int i = 0; i < k; i++) 157 | { 158 | int idx = rng.Next(cnt - i); 159 | string tmp = list[idx]; 160 | list[idx] = list[cnt - 1 - i]; 161 | list[cnt - 1 - i] = tmp; 162 | } 163 | 164 | return list.GetRange(cnt - k, k); 165 | } 166 | 167 | public static void SelectSubSet(string infile, string outfile, List selectedFeatureIdx, int topk = 100000) 168 | { 169 | int cnt = 0; 170 | using (StreamReader rd = new StreamReader(infile)) 171 | using (StreamWriter wt = new StreamWriter(outfile)) 172 | { 173 | string content = null; 174 | while ((content = rd.ReadLine()) != null) 175 | { 176 | if (cnt++ % 10000 == 0) 177 | { 178 | Console.WriteLine(cnt); 179 | } 180 | if (cnt > topk) 181 | { 182 | break; 183 | } 184 | string[] words = content.Split(','); 185 | wt.Write(words[0] + "," + words[1]); 186 | foreach(var idx in selectedFeatureIdx) 187 | { 188 | wt.Write("," + words[idx]); 189 | } 190 | wt.WriteLine(); 191 | } 192 | } 193 | } 194 | 195 | 196 | public 
static void SelectFeatureSubset(string infile, string outfile, string featureRankFile, int k, double r) 197 | { 198 | Random rng = new Random((int)DateTime.Now.Ticks); 199 | 200 | /// load features ranks 201 | List> feature2importance = LoadFeature2Importance(featureRankFile); 202 | 203 | /// select features 204 | HashSet selectedFeatures = new HashSet(); 205 | for (int i = 0; i < k; i++) 206 | { 207 | selectedFeatures.Add(feature2importance[i].Item1); 208 | } 209 | 210 | int cnt = 0; 211 | using (StreamReader rd = new StreamReader(infile)) 212 | using (StreamWriter wt = new StreamWriter(outfile)) 213 | { 214 | string content = rd.ReadLine(); 215 | string[] headers = content.Split(','); 216 | HashSet selectedFeatureIdx = new HashSet(); 217 | int dim = headers.Length; 218 | wt.Write(headers[0] + "," + headers[1]); 219 | for (int i = 2; i < dim; i++) 220 | { 221 | if (selectedFeatures.Contains(headers[i])) 222 | { 223 | selectedFeatureIdx.Add(i); 224 | wt.Write("," + headers[i]); 225 | } 226 | else 227 | { 228 | if (rng.NextDouble() < r) 229 | { 230 | selectedFeatureIdx.Add(i); 231 | wt.Write("," + headers[i]); 232 | } 233 | } 234 | } 235 | wt.WriteLine(); 236 | 237 | while ((content = rd.ReadLine()) != null) 238 | { 239 | if (cnt++ % 10000 == 0) 240 | { 241 | Console.WriteLine(cnt); 242 | } 243 | string[] words = content.Split(','); 244 | wt.Write(words[0] + "," + words[1]); 245 | for (int i = 2; i < dim; i++) 246 | { 247 | if (selectedFeatureIdx.Contains(i)) 248 | { 249 | wt.Write("," + words[i]); 250 | } 251 | } 252 | wt.WriteLine(); 253 | } 254 | } 255 | } 256 | 257 | public static List> LoadFeature2Importance(string featureRankFile) 258 | { 259 | List> feature2importance = new List>(); 260 | double t; 261 | using (StreamReader rd = new StreamReader(featureRankFile)) 262 | { 263 | string content = null; 264 | while ((content = rd.ReadLine()) != null) 265 | { 266 | string[] words = content.Replace("\"", "").Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries); 267 | if (words.Length < 2 || !double.TryParse(words[1], out t)) 268 | { 269 | continue; 270 | } 271 | feature2importance.Add(new Tuple(words[0], double.Parse(words[1]))); 272 | } 273 | } 274 | return feature2importance; 275 | } 276 | 277 | 278 | internal static void StatColLabelCorre() 279 | { 280 | throw new NotImplementedException(); 281 | } 282 | 283 | internal static void StatColLabelCorre(string infile, string outfile, int label_idx, int col_idx) 284 | { 285 | Dictionary value2cnt = new Dictionary(); 286 | Dictionary value2poscnt = new Dictionary(); 287 | using (StreamReader rd = new StreamReader(infile)) 288 | { 289 | string content = rd.ReadLine(); 290 | while ((content = rd.ReadLine()) != null) 291 | { 292 | string[] words = content.Split(','); 293 | if (!value2cnt.ContainsKey(words[col_idx])) 294 | { 295 | value2cnt.Add(words[col_idx],0); 296 | value2poscnt.Add(words[col_idx],0); 297 | } 298 | value2cnt[words[col_idx]]++; 299 | if (words[label_idx].Equals("1") || words[label_idx].Equals("True")) 300 | { 301 | value2poscnt[words[col_idx]]++; 302 | } 303 | } 304 | } 305 | 306 | using (StreamWriter wt = new StreamWriter(outfile)) 307 | { 308 | foreach (var pair in value2cnt) 309 | { 310 | wt.WriteLine("{0},{1},{2},{3}", pair.Key, pair.Value, value2poscnt[pair.Key], value2poscnt[pair.Key] * 1.0 / pair.Value); 311 | } 312 | } 313 | } 314 | 315 | internal static void OutputDict02(Dictionary word_cnt, Dictionary word_hit, string outfile) 316 | { 317 | using (StreamWriter wt = new StreamWriter(outfile)) 318 | { 319 | 
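// Each output row is: word, total occurrence count, positive-interaction count,
// and the empirical hit ratio (hits / occurrences). KeywordMgr.BuildIndex later
// consumes files in exactly this four-column format.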
foreach (var pair in word_cnt) 320 | { 321 | if (pair.Value > 0) 322 | { 323 | wt.WriteLine("{0},{1},{2},{3}", pair.Key, pair.Value, word_hit[pair.Key], word_hit[pair.Key] * 1.0 / pair.Value); 324 | } 325 | } 326 | } 327 | } 328 | } 329 | } 330 | -------------------------------------------------------------------------------- /auto-pipeline/model/DocumentClustering.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class DocumentClustering 11 | { 12 | 13 | public static void TestGenClusterIdFeature() 14 | { 15 | string candi_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\train-test\train02_candidates_localgen.csv"; 16 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\train_feature_as_clusterid.csv"; 17 | 18 | string TLC_cluster_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\TLC\-1.inst.txt"; 19 | string TLC_training_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\training.txt"; 20 | string cluster_out_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\cluster_id_mapping.tsv"; 21 | 22 | Dictionary id2cluster = new Dictionary(); 23 | using(StreamReader rd01= new StreamReader(TLC_cluster_file)) 24 | using (StreamReader rd02 = new StreamReader(TLC_training_file)) 25 | { 26 | string content = rd01.ReadLine(); 27 | while ((content = rd02.ReadLine()) != null) 28 | { 29 | string id = content.Substring(content.IndexOf("#") + 1); 30 | string[] words = rd01.ReadLine().Split('\t'); 31 | id2cluster.Add(id, int.Parse(words[2])); 32 | } 33 | } 34 | 35 | //using(StreamReader rd= new StreamReader(candi_file)) 36 | //using (StreamWriter wt = new StreamWriter(outfile)) 37 | //{ 38 | // string content = null; 39 | // while ((content = rd.ReadLine()) != null) 40 | // { 41 | // string[] words = content.Split('\t'); 42 | // string uid = "uid_" + words[0]; 43 | // string iid = "iid_" + words[1]; 44 | // if (id2cluster.ContainsKey(uid) && id2cluster.ContainsKey(iid)) 45 | // { 46 | // wt.WriteLine("{0},{1},{2},{3},{4},{5}", words[2]=="0" || words[2]=="4"?"0":"1", words[0], words[1], id2cluster[uid], id2cluster[iid], id2cluster[uid] == id2cluster[iid] ? 
1 : 0); 47 | // } 48 | // } 49 | //} 50 | 51 | using (StreamWriter wt = new StreamWriter(cluster_out_file)) 52 | { 53 | foreach (var pair in id2cluster) 54 | { 55 | wt.WriteLine("{0}\t{1}",pair.Key,pair.Value); 56 | } 57 | } 58 | 59 | } 60 | 61 | /// 62 | /// preapre svmlight feature for TLC kmeans clustering 63 | /// 64 | public static void PrepareFeatureFile() 65 | { 66 | bool reset_keymap = false; 67 | Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1); 68 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1); 69 | 70 | string keymapfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\word_id_mapping.csv"; 71 | 72 | if (reset_keymap) 73 | { 74 | BuildKeyMapping(keymapfile,user_titlefreq,item_titlefreq); 75 | } 76 | 77 | Dictionary keymapper = LoadKeymapfile(keymapfile); 78 | 79 | 80 | Dictionary userdict = FeatureFactory.BuildUserDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\data_online\online\users_adj_schema.csv"); 81 | Dictionary itemdict = FeatureFactory.BuildItemDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\data_online\online\items_noheader.csv"); 82 | 83 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\training.txt"; 84 | using (StreamWriter wt = new StreamWriter(outfile)) 85 | { 86 | foreach (var pair in userdict) 87 | { 88 | List> words = new List>(); 89 | if (pair.Value.title.Count > 0) 90 | { 91 | foreach (var word in pair.Value.title) 92 | { 93 | if (keymapper.ContainsKey(word)) 94 | { 95 | words.Add(new Tuple(word, keymapper[word])); 96 | } 97 | } 98 | if (words.Count > 4) 99 | { 100 | words.Sort((a, b) => a.Item2.CompareTo(b.Item2)); 101 | string res = ""; 102 | foreach (var tuple in words) 103 | { 104 | res += " " + tuple.Item2+":1"; 105 | } 106 | wt.WriteLine("0" + res + "#uid_" + pair.Key); 107 | } 108 | } 109 | } 110 | 111 | foreach (var pair in itemdict) 112 | { 113 | List> words = new List>(); 114 | if (pair.Value.title.Count > 0) 115 | { 116 | foreach (var word in pair.Value.title) 117 | { 118 | if (keymapper.ContainsKey(word)) 119 | { 120 | words.Add(new Tuple(word, keymapper[word])); 121 | } 122 | } 123 | if (words.Count > 4) 124 | { 125 | words.Sort((a, b) => a.Item2.CompareTo(b.Item2)); 126 | string res = ""; 127 | foreach (var tuple in words) 128 | { 129 | res += " " + tuple.Item2 + ":1"; 130 | } 131 | wt.WriteLine("0" + res + "#iid_" + pair.Key); 132 | } 133 | } 134 | } 135 | 136 | } 137 | 138 | } 139 | 140 | public static Dictionary LoadKeymapfile(string keymapfile) 141 | { 142 | Dictionary keymapper = new Dictionary(); 143 | using (StreamReader rd = new StreamReader(keymapfile)) 144 | { 145 | string content = null; 146 | while ((content = rd.ReadLine()) != null) 147 | { 148 | string[] words = content.Split(','); 149 | keymapper.Add(words[0], int.Parse(words[1])); 150 | } 151 | } 152 | return keymapper; 153 | } 154 | 155 | private static void BuildKeyMapping(string keymapfile, Dictionary user_titlefreq, Dictionary item_titlefreq) 156 | { 157 | Dictionary word2idx = new Dictionary(); 158 | foreach (var pair in user_titlefreq) 159 | { 160 | if (pair.Value >= 20) 161 | { 162 | if (!word2idx.ContainsKey(pair.Key)) 163 | { 164 | word2idx.Add(pair.Key, word2idx.Count + 1); 165 | } 166 | } 167 | } 168 | 169 | foreach (var pair in item_titlefreq) 170 | { 171 | if (pair.Value >= 20) 172 | { 173 | if (!word2idx.ContainsKey(pair.Key)) 
174 | { 175 | word2idx.Add(pair.Key, word2idx.Count + 1); 176 | } 177 | } 178 | } 179 | 180 | using (StreamWriter wt = new StreamWriter(keymapfile)) 181 | { 182 | foreach (var pair in word2idx) 183 | { 184 | wt.WriteLine("{0},{1}",pair.Key,pair.Value); 185 | } 186 | } 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /auto-pipeline/model/DocumentRelated.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class DocumentRelated 11 | { 12 | public static void GenKeyWords() 13 | { 14 | string outfile_useritem = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_useritem.csv"; 15 | string outfile_itemtitle = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtitle.csv"; 16 | string outfile_itemtag = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtag.csv"; 17 | 18 | string interation_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\train-test\train_interactions_sample1_0.3_moreneginst_shuffled.csv"; 19 | 20 | Dictionary userdict =FeatureFactory. BuildUserDict(); 21 | Dictionary itemdict = FeatureFactory.BuildItemDict(); 22 | Dictionary> user2interest_items = KNN.BuildUserInterestedItems(interation_file); 23 | 24 | Dictionary useritem_word_cnt = new Dictionary(); 25 | Dictionary useritem_word_hit = new Dictionary(); 26 | 27 | Dictionary itemtitle_word_cnt = new Dictionary(); 28 | Dictionary itemtitle_word_hit = new Dictionary(); 29 | 30 | Dictionary itemtag_word_cnt = new Dictionary(); 31 | Dictionary itemtag_word_hit = new Dictionary(); 32 | 33 | using (StreamReader rd = new StreamReader(interation_file)) 34 | { 35 | string content = null; 36 | int cnt = 0; 37 | while ((content = rd.ReadLine()) != null) 38 | { 39 | if (cnt++ % 100000 == 0) 40 | { 41 | Console.Write((cnt / 10000) + "w\r"); 42 | } 43 | string[] words = content.Split('\t'); 44 | if (itemdict.ContainsKey(words[1]) && userdict.ContainsKey(words[0])) 45 | { 46 | HashSet overlap01 = new HashSet(userdict[words[0]].title.Intersect(itemdict[words[1]].title)); 47 | HashSet overlap02 = new HashSet(); 48 | HashSet overlap03 = new HashSet(); 49 | 50 | if (user2interest_items.ContainsKey(words[0])) 51 | { 52 | foreach (var tid in user2interest_items[words[0]]) 53 | { 54 | if (tid!=words[1] && itemdict.ContainsKey(tid)) 55 | { 56 | foreach (var ttitle in itemdict[tid].title) 57 | { 58 | if (itemdict[words[1]].title.Contains(ttitle)) 59 | { 60 | if (!overlap02.Contains(ttitle)) 61 | { 62 | overlap02.Add(ttitle); 63 | } 64 | } 65 | } 66 | foreach (var ttag in itemdict[tid].tags) 67 | { 68 | if (itemdict[words[1]].tags.Contains(ttag)) 69 | { 70 | if (!overlap03.Contains(ttag)) 71 | { 72 | overlap03.Add(ttag); 73 | } 74 | } 75 | } 76 | } 77 | } 78 | } 79 | 80 | UpdateWordStatus(overlap01, useritem_word_cnt, useritem_word_hit, words[2]); 81 | UpdateWordStatus(overlap02, itemtitle_word_cnt, itemtitle_word_hit, words[2]); 82 | UpdateWordStatus(overlap03, itemtag_word_cnt, itemtag_word_hit, words[2]); 83 | } 84 | } 85 | } 86 | 87 | Utils.OutputDict02(useritem_word_cnt, useritem_word_hit, outfile_useritem); 88 | Utils.OutputDict02(itemtitle_word_cnt, itemtitle_word_hit, outfile_itemtitle); 89 | Utils.OutputDict02(itemtag_word_cnt, itemtag_word_hit, outfile_itemtag); 90 | 91 | } 92 | 93 | 
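// Shared tallying helper for GenKeyWords above: every overlapping word gets its
// occurrence count bumped, and its hit count bumped only when the interaction is
// positive (status "0" = impression and "4" = delete count as negatives). The
// resulting per-word hit ratios, written via Utils.OutputDict02, are what
// KeywordMgr.BuildIndex later ranks to select discriminative keywords.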
private static void UpdateWordStatus(HashSet overlap, Dictionary word_cnt, Dictionary word_hit, string status) 94 | { 95 | foreach (var word in overlap) 96 | { 97 | if (!word_cnt.ContainsKey(word)) 98 | { 99 | word_cnt.Add(word, 0); 100 | word_hit.Add(word, 0); 101 | } 102 | word_cnt[word]++; 103 | if (status != "0" && status != "4") 104 | { 105 | word_hit[word]++; 106 | } 107 | } 108 | } 109 | 110 | public static void PrepareTitleDocuments() 111 | { 112 | 113 | Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1); 114 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1); 115 | 116 | 117 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\word2vec\user_item_title_lines.txt"; 118 | 119 | List lines = new List(); 120 | 121 | Add2Lines(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\users.csv", user_titlefreq, lines); 122 | Add2Lines(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\items.csv", item_titlefreq, lines); 123 | 124 | var lines_arr = lines.ToArray(); 125 | 126 | Tools.Common.Shuffle(new Random(), lines_arr); 127 | 128 | using (StreamWriter wt = new StreamWriter(outfile)) 129 | { 130 | foreach (var line in lines_arr) 131 | { 132 | wt.Write(line + "\n"); 133 | } 134 | } 135 | 136 | } 137 | 138 | private static void Add2Lines(string file, Dictionary titlefreq, List lines) 139 | { 140 | using (StreamReader rd = new StreamReader(file)) 141 | { 142 | string content = rd.ReadLine(); 143 | while ((content = rd.ReadLine()) != null) 144 | { 145 | string line = ""; 146 | string[] words = content.Split('\t'); 147 | if (!string.IsNullOrEmpty(words[1])) 148 | { 149 | string[] tokens = words[1].Split(','); 150 | foreach (var token in tokens) 151 | { 152 | if (titlefreq.ContainsKey(token) && titlefreq[token] > 10) 153 | { 154 | line += "," + token; 155 | } 156 | } 157 | } 158 | if (line.Length > 1) 159 | { 160 | lines.Add(line.Substring(1)); 161 | } 162 | } 163 | } 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /auto-pipeline/model/Evaluation.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class Evaluation 11 | { 12 | 13 | public static void StatRecall(string predfile, string outfile) 14 | { 15 | Dictionary>> item2predictions = new Dictionary>>(); 16 | HashSet posset = new HashSet(); 17 | int cnt = 0; 18 | using (StreamReader rd = new StreamReader(predfile)) 19 | { 20 | string content = null; 21 | 22 | while ((content = rd.ReadLine()) != null) 23 | { 24 | if (cnt++ % 100000 == 0) 25 | { 26 | Console.Write(cnt + "\r"); 27 | } 28 | string[] words = content.Split('\t'); 29 | 30 | string[] tokens =words[0].Split('|'); 31 | 32 | double score = double.Parse(words[3]); 33 | 34 | if (!item2predictions.ContainsKey(tokens[1])) 35 | { 36 | item2predictions.Add(tokens[1], new List>()); 37 | } 38 | item2predictions[tokens[1]].Add(new Tuple(tokens[0], score, int.Parse(words[1]))); 39 | 40 | if (words[1] == "1") 41 | { 42 | posset.Add(words[0]); 43 | } 44 | } 45 | } 46 | 47 | Console.WriteLine("Poscnt : {0}", posset.Count); 48 | 49 | foreach (var iid in item2predictions.Keys) 50 | { 51 | item2predictions[iid].Sort((a, b) => b.Item2.CompareTo(a.Item2)); 52 | } 
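// With each item's candidates now sorted by descending score, the sweep below
// counts, for every depth k, how many items have a true positive at rank k;
// because `hit` accumulates across k, hit / |positives| traces out a
// recall@k curve, one CSV row per cutoff.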
53 | 54 | using (StreamWriter wt = new StreamWriter(outfile)) 55 | { 56 | int hit = 0; 57 | for (int k = 1; k < 20000; k++) 58 | { 59 | foreach (var pair in item2predictions) 60 | { 61 | if (pair.Value.Count >= k) 62 | { 63 | if (pair.Value[k - 1].Item3 == 1) 64 | { 65 | hit++; 66 | } 67 | } 68 | } 69 | wt.WriteLine("{0},{1}", k, hit * 1.0 / posset.Count); 70 | } 71 | } 72 | 73 | } 74 | 75 | public static double RandomScore(string infile, string gtfile, Dictionary userdict, Dictionary itemdict) 76 | { 77 | int topk = 100; 78 | 79 | Dictionary> item2userset = new Dictionary>(); 80 | int cnt = 0; 81 | using (StreamReader rd = new StreamReader(infile)) 82 | { 83 | string content = rd.ReadLine(); 84 | while ((content = rd.ReadLine()) != null) 85 | { 86 | if (cnt++ % 100000 == 0) 87 | { 88 | Console.WriteLine(cnt); 89 | } 90 | string[] words = content.Split('\t')[0].Split('|'); 91 | if (!item2userset.ContainsKey(words[1])) 92 | { 93 | item2userset.Add(words[1], new List()); 94 | } 95 | item2userset[words[1]].Add(words[0]); 96 | } 97 | } 98 | 99 | 100 | Random rng = new Random(); 101 | 102 | Dictionary> item2user2status = LoadGTFile(gtfile, false); 103 | 104 | double res = 0; 105 | int line_cnt = 0; 106 | foreach (var pair in item2user2status) 107 | { 108 | line_cnt++; 109 | if (line_cnt % 100 == 0) 110 | { 111 | Console.WriteLine(line_cnt); 112 | } 113 | int success_user_cnt = 0; 114 | for (int i = 0; i < topk; i++) 115 | { 116 | string uid = item2userset[pair.Key][rng.Next(item2userset[pair.Key].Count)]; 117 | int cur_user_sucess = UserSucess(pair.Key, uid, item2user2status); 118 | if (cur_user_sucess > 0) 119 | { 120 | success_user_cnt++; 121 | } 122 | res += cur_user_sucess * (IsPremiumUser(uid, userdict)); 123 | } 124 | res += ItemSucess(success_user_cnt, pair.Key, itemdict); 125 | } 126 | 127 | 128 | Console.WriteLine("{0}\t{1}\t{2}", line_cnt, res, res / line_cnt); 129 | return res; 130 | } 131 | 132 | public static double Score(string subfile, string gtfile, Dictionary userdict = null, Dictionary itemdict = null, bool isFeatureMode = false) 133 | { 134 | 135 | //if (userdict == null) 136 | // userdict = FeatureFactory.BuildUserDict(); 137 | //if (itemdict == null) 138 | // itemdict = FeatureFactory.BuildItemDict(); 139 | 140 | Dictionary> item2user2status = LoadGTFile(gtfile, isFeatureMode); 141 | 142 | double res = 0; 143 | int line_cnt = 0; 144 | using (StreamReader rd = new StreamReader(subfile)) 145 | { 146 | string content = null; 147 | while ((content = rd.ReadLine()) != null) 148 | { 149 | string[] words = content.Split(new char[] { ' ', '\t' }); 150 | if (words.Length < 2) 151 | { 152 | continue; 153 | } 154 | line_cnt++; 155 | string[] tokens = words[1].Split(','); 156 | int success_user_cnt = 0; 157 | for (int j = 0; j < tokens.Length; j++) 158 | { 159 | var token = tokens[j]; 160 | int cur_user_sucess = UserSucess(words[0], token, item2user2status); 161 | if (cur_user_sucess > 0) 162 | { 163 | success_user_cnt++; 164 | } 165 | res += cur_user_sucess > 0 ? 
1 : 0; //cur_user_sucess * (IsPremiumUser(token, userdict)); 166 | 167 | } 168 | 169 | // res += ItemSucess(success_user_cnt, words[0], itemdict); 170 | } 171 | } 172 | Console.WriteLine("{0}\t{1}\t{2}", line_cnt, res, res / line_cnt); 173 | return res; 174 | } 175 | 176 | public static int[] Score02(string subfile, string gtfile, Dictionary userdict = null, Dictionary itemdict = null) 177 | { 178 | int[] hit_cnt = new int[100 + 1]; 179 | Array.Clear(hit_cnt, 0, hit_cnt.Length); 180 | 181 | if (userdict == null) 182 | userdict = FeatureFactory.BuildUserDict(); 183 | if (itemdict == null) 184 | itemdict = FeatureFactory.BuildItemDict(); 185 | 186 | Dictionary> item2user2status = LoadGTFile(gtfile, false); 187 | 188 | int res = 0; 189 | int line_cnt = 0; 190 | using (StreamReader rd = new StreamReader(subfile)) 191 | { 192 | string content = null; 193 | while ((content = rd.ReadLine()) != null) 194 | { 195 | string[] words = content.Split(new char[] { ' ', '\t' }); 196 | if (words.Length < 2) 197 | { 198 | continue; 199 | } 200 | line_cnt++; 201 | string[] tokens = words[1].Split(','); 202 | 203 | for (int j = 0; j < tokens.Length && j<100; j++) 204 | { 205 | var token = tokens[j]; 206 | int cur_user_sucess = UserSucess(words[0], token, item2user2status) > 0 ? 1 : 0; 207 | 208 | hit_cnt[j] += cur_user_sucess; 209 | res += cur_user_sucess; 210 | } 211 | } 212 | } 213 | Console.WriteLine("{0}\t{1}\t{2}", line_cnt, res, res*1.0 / line_cnt); 214 | 215 | hit_cnt[hit_cnt.Length - 1] = res; 216 | return hit_cnt; 217 | } 218 | 219 | 220 | private static double ItemSucess(int success_user_cnt, string iid, Dictionary itemdict) 221 | { 222 | if (success_user_cnt <= 0) 223 | { 224 | return 0; 225 | } 226 | 227 | if (itemdict.ContainsKey(iid) && itemdict[iid].is_paid == "1") 228 | { 229 | return 50; 230 | } 231 | 232 | return 25; 233 | } 234 | 235 | private static int IsPremiumUser(string token, Dictionary userdict) 236 | { 237 | if (userdict.ContainsKey(token) && userdict[token].premium == "1") 238 | { 239 | return 2; 240 | } 241 | else 242 | { 243 | return 1; 244 | } 245 | } 246 | 247 | private static int UserSucess(string iid, string uid, Dictionary> item2user2status) 248 | { 249 | int score = 0; 250 | if (item2user2status.ContainsKey(iid)) 251 | { 252 | if (item2user2status[iid].ContainsKey(uid)) 253 | { 254 | if (item2user2status[iid][uid] == 1) 255 | { 256 | score = 1; 257 | } 258 | else if (item2user2status[iid][uid] == 2 || item2user2status[iid][uid] == 3) 259 | { 260 | score = 5; 261 | } 262 | else if (item2user2status[iid][uid] == 5) 263 | { 264 | score = 20; 265 | } 266 | else if (item2user2status[iid][uid] == 4) 267 | { 268 | score = -10; 269 | } 270 | } 271 | } 272 | return score; 273 | } 274 | 275 | private static Dictionary> LoadGTFile(string gtfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\train-test\test.tsv", bool isFeatureMode =false) 276 | { 277 | Dictionary> res = new Dictionary>(); 278 | char spliter = isFeatureMode ? 
',' : '\t'; 279 | using (StreamReader rd = new StreamReader(gtfile)) 280 | { 281 | string content = null; 282 | while ((content = rd.ReadLine()) != null) 283 | { 284 | string[] words = content.Split(spliter); 285 | string uid = null, iid = null; 286 | int status = 0; 287 | if (isFeatureMode) 288 | { 289 | status = int.Parse(words[0]); 290 | string[] tokens = words[1].Split('|'); 291 | uid = tokens[0]; 292 | iid = tokens[1]; 293 | } 294 | else 295 | { 296 | status = int.Parse(words[2]); 297 | uid = words[0]; 298 | iid = words[1]; 299 | } 300 | if (status > 0) 301 | { 302 | if (!res.ContainsKey(iid)) 303 | { 304 | res.Add(iid, new Dictionary()); 305 | } 306 | res[iid].Add(uid, status); 307 | } 308 | } 309 | } 310 | return res; 311 | } 312 | } 313 | } 314 | -------------------------------------------------------------------------------- /auto-pipeline/model/FMProcessor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class FMProcessor 11 | { 12 | public static void AppendPredFile(string idfile, string predfile, string outfile) 13 | { 14 | using(StreamReader rd01 = new StreamReader(idfile)) 15 | using(StreamReader rd02 = new StreamReader(predfile)) 16 | using (StreamWriter wt = new StreamWriter(outfile)) 17 | { 18 | string content = null; 19 | while ((content = rd01.ReadLine() )!= null) 20 | { 21 | wt.Write("{0},{1}\n",content,rd02.ReadLine().Split(' ')[1]); 22 | } 23 | } 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /auto-pipeline/model/Item.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace RecSys17.model 8 | { 9 | class Item 10 | { 11 | public string id; 12 | public HashSet title; 13 | public int title_cnt; 14 | public Dictionary title2cnt; 15 | public string clevel; 16 | public string indus; 17 | public string disc; 18 | public string country; 19 | public string region; 20 | public string is_paid; 21 | public string employment; 22 | public HashSet tags; 23 | public DateTime create_at; 24 | 25 | public Item(){} 26 | public Item(string line) 27 | { 28 | string[] words = line.Split('\t'); 29 | 30 | id = words[0]; 31 | title = new HashSet(); 32 | title2cnt = new Dictionary(); 33 | title_cnt = 0; 34 | var tokens = words[1].Split(','); 35 | title_cnt = tokens.Length; 36 | foreach (var token in tokens) 37 | { 38 | title.Add(token); 39 | if (!title2cnt.ContainsKey(token)) 40 | { 41 | title2cnt.Add(token, 1.0f/title_cnt); 42 | } 43 | else 44 | { 45 | title2cnt[token] += 1.0f / title_cnt; 46 | } 47 | } 48 | clevel = words[2]; 49 | disc = words[3]; 50 | indus = words[4]; 51 | country = words[5]; 52 | is_paid = words[6]; 53 | region = words[7]; 54 | employment = words[10]; 55 | tags = new HashSet(); 56 | foreach (var token in words[11].Split(',')) 57 | { 58 | // if (token != "000") 59 | { 60 | tags.Add(token); 61 | } 62 | } 63 | if (!string.IsNullOrEmpty(words[12]) && words[12]!="null") 64 | { 65 | if (words[12].Contains("-")) 66 | { 67 | create_at = DateTime.Parse(words[12]); 68 | } 69 | else 70 | { 71 | create_at = Tools.Common.ParseTime(double.Parse(words[12])); 72 | } 73 | } 74 | else 75 | { 76 | create_at = 
DateTime.Parse("2017-01-01"); 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /auto-pipeline/model/ItemProfile.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class ItemProfile 11 | { 12 | public static void BuildFeatureFile() 13 | { 14 | string label_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\itemprofile\offline_item_popularity.csv"; 15 | string outfile_like = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\itemprofile\features\offline_training_like.csv"; 16 | string outfile_hate = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\itemprofile\features\offline_training_hate.csv"; 17 | 18 | Dictionary itemdict = FeatureFactory. BuildItemDict(); 19 | 20 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1); 21 | 22 | string keymapfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\word_id_mapping.csv"; 23 | 24 | Dictionary keymapper = DocumentClustering.LoadKeymapfile(keymapfile); 25 | 26 | using (StreamReader rd = new StreamReader(label_file)) 27 | using(StreamWriter wt_like = new StreamWriter(outfile_like)) 28 | using (StreamWriter wt_hate = new StreamWriter(outfile_hate)) 29 | { 30 | string content = null; 31 | while ((content = rd.ReadLine()) != null) 32 | { 33 | string[] words = content.Split(','); 34 | if (itemdict.ContainsKey(words[0]) && float.Parse(words[1])>0) 35 | { 36 | float book_ratio = float.Parse(words[3]); 37 | float reply_ratio = float.Parse(words[4]); 38 | float delete_ratio = float.Parse(words[5]); 39 | 40 | string featureline = ""; 41 | List titles = new List(); 42 | foreach (var title in itemdict[words[0]].title) 43 | { 44 | if (keymapper.ContainsKey(title)) 45 | { 46 | titles.Add(keymapper[title]); 47 | } 48 | } 49 | titles.Sort(); 50 | foreach (var idx in titles) 51 | { 52 | featureline += " " + idx + ":1"; 53 | } 54 | 55 | if (itemdict[words[0]].clevel == "0") 56 | { 57 | featureline += " " + (1+keymapper.Count) + ":1"; 58 | } 59 | else if (itemdict[words[0]].clevel == "1") 60 | { 61 | featureline += " " + (2 + keymapper.Count) + ":1"; 62 | } 63 | else if (itemdict[words[0]].clevel == "2") 64 | { 65 | featureline += " " + (3 + keymapper.Count) + ":1"; 66 | } 67 | else if (itemdict[words[0]].clevel == "3") 68 | { 69 | featureline += " " + (4 + keymapper.Count) + ":1"; 70 | } 71 | else if (itemdict[words[0]].clevel == "4") 72 | { 73 | featureline += " " + (5 + keymapper.Count) + ":1"; 74 | } 75 | else if (itemdict[words[0]].clevel == "5") 76 | { 77 | featureline += " " + (6 + keymapper.Count) + ":1"; 78 | } 79 | else if (itemdict[words[0]].clevel == "6") 80 | { 81 | featureline += " " + (7 + keymapper.Count) + ":1"; 82 | } 83 | else if (itemdict[words[0]].clevel == "7") 84 | { 85 | featureline += " " + (8 + keymapper.Count) + ":1"; 86 | } 87 | 88 | if (itemdict[words[0]].employment == "0") 89 | { 90 | featureline += " " + (9 + keymapper.Count) + ":1"; 91 | } 92 | else if (itemdict[words[0]].employment == "1") 93 | { 94 | featureline += " " + (10 + keymapper.Count) + ":1"; 95 | } 96 | else if (itemdict[words[0]].employment == "2") 97 | { 98 | featureline += " " + (11 + keymapper.Count) + ":1"; 99 | } 100 | else if (itemdict[words[0]].employment == "3") 101 | { 
102 | featureline += " " + (12 + keymapper.Count) + ":1"; 103 | } 104 | else if (itemdict[words[0]].employment == "4") 105 | { 106 | featureline += " " + (13 + keymapper.Count) + ":1"; 107 | } 108 | else if (itemdict[words[0]].employment == "5") 109 | { 110 | featureline += " " + (14 + keymapper.Count) + ":1"; 111 | } 112 | 113 | featureline += " " + (15 + keymapper.Count) + ":" + itemdict[words[0]].tags.Count; 114 | 115 | 116 | if (delete_ratio > 0.1 || delete_ratio < 0.04) 117 | { 118 | int label = delete_ratio > 0.1 ? 1 : 0; 119 | wt_hate.Write(label); 120 | wt_hate.WriteLine(featureline); 121 | } 122 | 123 | if (book_ratio > 0.03 || reply_ratio > 0.03 || (book_ratio<0.02 && reply_ratio<0.02)) 124 | { 125 | int label = book_ratio > 0.03 || reply_ratio > 0.03?1:0; 126 | wt_like.Write(label); 127 | wt_like.WriteLine(featureline); 128 | } 129 | } 130 | } 131 | } 132 | 133 | 134 | 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /auto-pipeline/model/KNN.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class KNN 11 | { 12 | public static void PredictByUserDocsim() 13 | { 14 | int topk = 100; 15 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_user_item_docsim.csv"; 16 | 17 | 18 | Dictionary itemdict = FeatureFactory.BuildItemDict(); 19 | Dictionary userdict = FeatureFactory.BuildUserDict(); 20 | 21 | Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1); 22 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1); 23 | 24 | 25 | List target_users = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetUsers.csv"); 26 | List target_items = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetItems.csv"); 27 | 28 | using (StreamWriter wt = new StreamWriter(outfile)) 29 | { 30 | int cnt = 0; 31 | foreach (var iid in target_items) 32 | { 33 | List> user2score = new List>(); 34 | foreach (var uid in target_users) 35 | { 36 | if (userdict.ContainsKey(uid)) 37 | { 38 | double score = GetUserScore(userdict[uid], itemdict[iid], user_titlefreq, item_titlefreq); 39 | if (score > 0) 40 | { 41 | user2score.Add(new Tuple(uid, score)); 42 | } 43 | } 44 | } 45 | Console.WriteLine("{0}\tnum of candi:\t{1}", cnt++, user2score.Count); 46 | if (user2score.Count > 0) 47 | { 48 | user2score.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 49 | int k = Math.Min(topk, user2score.Count); 50 | wt.Write("{0}\t", iid); 51 | for (int i = 0; i < k - 1; i++) 52 | { 53 | wt.Write("{0},", user2score[i].Item1); 54 | } 55 | wt.Write("{0}\n", user2score[k - 1].Item1); 56 | } 57 | } 58 | } 59 | } 60 | 61 | private static double GetUserScore(User user, Item item, Dictionary user_titlefreq, Dictionary item_titlefreq) 62 | { 63 | double doc_sim = 0; 64 | foreach (var word in user.title) 65 | { 66 | if (!string.IsNullOrEmpty(word) && user_titlefreq.ContainsKey(word) && user_titlefreq[word] >= 20 && item_titlefreq.ContainsKey(word) && item_titlefreq[word] >= 20) 67 | { 68 | if (item.title2cnt.ContainsKey(word)) 69 | { 70 | doc_sim += Math.Sqrt(user.title2cnt[word] * item.title2cnt[word]) * Math.Log10(1000000.0 / 
user_titlefreq[word]); 71 | } 72 | } 73 | } 74 | return doc_sim; 75 | } 76 | 77 | public static void PredictByViewDocsim() 78 | { 79 | int topk = 100; 80 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_tag.csv"; 81 | string trainfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\interactions_grouped.csv"; 82 | 83 | Dictionary> user2interest_items = BuildUserInterestedItems(trainfile); 84 | 85 | Dictionary itemdict = FeatureFactory.BuildItemDict(); 86 | // Dictionary userdict = FeatureFactory.BuildUserDict(); 87 | 88 | Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1); 89 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1); 90 | 91 | 92 | List target_users = FeatureFactory. LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetUsers.csv"); 93 | List target_items = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetItems.csv"); 94 | 95 | using (StreamWriter wt = new StreamWriter(outfile)) 96 | { 97 | int cnt = 0; 98 | foreach (var iid in target_items) 99 | { 100 | List> user2score = new List>(); 101 | foreach (var uid in target_users) 102 | { 103 | if (user2interest_items.ContainsKey(uid)) 104 | { 105 | //double score = GetUserScore(iid, user2interest_items[uid], itemdict, item_titlefreq); 106 | double score = GetUserScore_Tag(iid, user2interest_items[uid], itemdict, item_titlefreq); 107 | 108 | if (score > 0) 109 | { 110 | user2score.Add(new Tuple(uid, score)); 111 | } 112 | } 113 | } 114 | Console.WriteLine("{0}\tnum of candi:\t{1}", cnt++, user2score.Count); 115 | if (user2score.Count > 0) 116 | { 117 | user2score.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 118 | int k = Math.Min(topk, user2score.Count); 119 | wt.Write("{0}\t", iid); 120 | for (int i = 0; i < k - 1; i++) 121 | { 122 | wt.Write("{0},", user2score[i].Item1); 123 | } 124 | wt.Write("{0}\n", user2score[k - 1].Item1); 125 | } 126 | } 127 | } 128 | } 129 | 130 | public static double GetUserScore(string iid, List history, Dictionary itemdict, Dictionary item_titlefreq) 131 | { 132 | double score = 0; 133 | foreach (var tid in history) 134 | { 135 | if (itemdict.ContainsKey(tid) && tid != iid) 136 | { 137 | foreach (var word in itemdict[iid].title) 138 | { 139 | if (!string.IsNullOrEmpty(word) && item_titlefreq.ContainsKey(word) && item_titlefreq[word] >= 20) 140 | { 141 | if (itemdict[tid].title2cnt.ContainsKey(word)) 142 | { 143 | score += Math.Sqrt(itemdict[tid].title2cnt[word] * itemdict[iid].title2cnt[word]) * Math.Log10(1000000.0 / item_titlefreq[word]); 144 | } 145 | } 146 | } 147 | } 148 | } 149 | return score; 150 | } 151 | 152 | public static double GetUserScore_Tag(string iid, List history, Dictionary itemdict, Dictionary item_titlefreq) 153 | { 154 | double score = 0; 155 | foreach (var tid in history) 156 | { 157 | if (itemdict.ContainsKey(tid)) 158 | { 159 | foreach (var word in itemdict[iid].tags) 160 | { 161 | if (!string.IsNullOrEmpty(word)) 162 | { 163 | if (itemdict[tid].tags.Contains(word)) 164 | { 165 | score += 1; 166 | } 167 | } 168 | } 169 | } 170 | } 171 | if (history.Count > 0) 172 | { 173 | score /= history.Count; 174 | } 175 | return score; 176 | } 177 | 178 | public static Dictionary> BuildUserInterestedItems(string file) 179 | { 180 | Console.WriteLine("BuildUserInterestedItems..."); 181 | Dictionary> res = new Dictionary>(); 182 | using (StreamReader rd = new 
StreamReader(file)) 183 | { 184 | string content = null; 185 | while ((content = rd.ReadLine()) != null) 186 | { 187 | string[] words = content.Split('\t'); 188 | if (words[2] != "0" && words[2] != "4") 189 | { 190 | if (!res.ContainsKey(words[0])) 191 | { 192 | res.Add(words[0], new List()); 193 | } 194 | res[words[0]].Add(words[1]); 195 | } 196 | } 197 | } 198 | Console.WriteLine("BuildUserInterestedItems finished."); 199 | return res; 200 | } 201 | 202 | public static void PredictFromClosestJobs() 203 | { 204 | int topk = 5; 205 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_closest_jobs.csv"; 206 | 207 | string logfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_closest_jobs_logs.csv"; 208 | string logfile_bestscore = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_closest_jobs_logs_bestscore.csv"; 209 | 210 | string interaction_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\interactions_grouped.csv"; 211 | 212 | Dictionary itemdict = FeatureFactory.BuildItemDict(); 213 | //Dictionary userdict = FeatureFactory.BuildUserDict(); 214 | 215 | // Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1); 216 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1); 217 | 218 | 219 | HashSet target_users = new HashSet(FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetUsers.csv")); 220 | List target_items = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetItems.csv"); 221 | 222 | Dictionary> item2clicked_users = LoadItem2PosUsers(interaction_file, target_users); 223 | 224 | using (StreamWriter wt = new StreamWriter(outfile)) 225 | using(StreamWriter wt_log = new StreamWriter(logfile)) 226 | using (StreamWriter wt_log02 = new StreamWriter(logfile_bestscore)) 227 | { 228 | int cnt = 0; 229 | foreach (var iid in target_items) 230 | { 231 | if (cnt++ % 100 == 0) 232 | { 233 | Console.Write("writing {0}\r", cnt); 234 | } 235 | HashSet candidates = new HashSet(); 236 | 237 | List> item2sim = new List>(); 238 | foreach (var ciid in item2clicked_users.Keys) 239 | { 240 | item2sim.Add(new Tuple(ciid, GetItemSim(iid, ciid, item_titlefreq, itemdict))); 241 | } 242 | 243 | item2sim.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 244 | 245 | //foreach (var citem in item2sim) 246 | //{ 247 | // wt_log.WriteLine("{0},{1},{2}", iid, citem.Item1, citem.Item2); 248 | //} 249 | wt_log02.WriteLine("{0},{1},{2}", iid, item2sim[0].Item1, item2sim[0].Item2); 250 | 251 | foreach (var tuple in item2sim) 252 | { 253 | foreach (var user in item2clicked_users[tuple.Item1]) 254 | { 255 | if (!candidates.Contains(user) && tuple.Item2>0) 256 | { 257 | candidates.Add(user); 258 | } 259 | } 260 | 261 | if (candidates.Count >= topk) 262 | { 263 | break; 264 | } 265 | } 266 | 267 | 268 | if (candidates.Count > 0) 269 | { 270 | var candi_list = candidates.ToList(); 271 | string out_line = iid + "\t"; 272 | 273 | for (int i = 0; i < candi_list.Count && i< topk; i++) 274 | { 275 | out_line+=candi_list[i]+","; 276 | } 277 | wt.WriteLine(out_line.Substring(0, out_line.Length - 1)); 278 | } 279 | } 280 | } 281 | } 282 | 283 | private static double GetItemSim(string iid, string ciid, Dictionary item_titlefreq, Dictionary itemdict) 284 | { 285 | if (!itemdict.ContainsKey(iid) || !itemdict.ContainsKey(ciid)) 286 | { 287 | return 0; 288 | } 289 | 290 | Item info_iid = 
itemdict[iid]; 291 | Item info_ciid = itemdict[ciid]; 292 | 293 | if (info_ciid.indus != info_iid.indus || info_ciid.disc != info_iid.disc || info_ciid.country != info_iid.country) 294 | { 295 | return 0; 296 | } 297 | 298 | double res = 0; 299 | foreach (var word in info_iid.title2cnt.Keys) 300 | { 301 | if (item_titlefreq.ContainsKey(word) && info_ciid.title2cnt.ContainsKey(word)) 302 | { 303 | res += Math.Log10(1000000.0 / item_titlefreq[word]) * info_iid.title2cnt[word] * info_ciid.title2cnt[word]; 304 | } 305 | } 306 | 307 | return res; 308 | } 309 | 310 | 311 | 312 | private static Dictionary> LoadItem2PosUsers(string interaction_file, HashSet target_users) 313 | { 314 | Dictionary> res = new Dictionary>(); 315 | using (StreamReader rd = new StreamReader(interaction_file)) 316 | { 317 | string content = null; 318 | while ((content = rd.ReadLine()) != null) 319 | { 320 | string[] words = content.Split('\t'); 321 | if (words[2] != "0" && words[2] != "4" && target_users.Contains(words[0])) 322 | { 323 | if (!res.ContainsKey(words[1])) 324 | { 325 | res.Add(words[1], new HashSet()); 326 | } 327 | if (!res[words[1]].Contains(words[0])) 328 | { 329 | res[words[1]].Add(words[0]); 330 | } 331 | } 332 | } 333 | } 334 | return res; 335 | } 336 | 337 | } 338 | } 339 | -------------------------------------------------------------------------------- /auto-pipeline/model/KeywordMgr.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class KeywordMgr 11 | { 12 | public Dictionary useritem_index; 13 | public Dictionary itemitem_title_index; 14 | public Dictionary itemitem_tag_index; 15 | 16 | public KeywordMgr() 17 | { 18 | useritem_index = BuildIndex(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_useritem.csv"); 19 | itemitem_title_index = BuildIndex(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtitle.csv"); 20 | itemitem_tag_index = BuildIndex(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtag.csv"); 21 | } 22 | 23 | 24 | private Dictionary BuildIndex(string file) 25 | { 26 | int thre = 100; 27 | List> index = new List>(); 28 | using (StreamReader rd = new StreamReader(file)) 29 | { 30 | string content = null; 31 | while ((content = rd.ReadLine()) != null) 32 | { 33 | string[] words = content.Split(','); 34 | if (int.Parse(words[1]) >= thre) 35 | { 36 | index.Add(new Tuple(words[0], double.Parse(words[3]))); 37 | } 38 | } 39 | } 40 | index.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 41 | 42 | Dictionary res = new Dictionary(); 43 | for (int i = 0; i < index.Count; i++) 44 | { 45 | res.Add(index[i].Item1, i); 46 | } 47 | return res; 48 | } 49 | 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /auto-pipeline/model/SubmissionHelper.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class SubmissionHelper 11 | { 12 | public static void Ensemble(string infile01, string infile02, string outfile, int gap, int k , int start ) 13 | { 14 | Dictionary> iid2rec01 = LoadSubFile(infile01); 15 | Dictionary> iid2rec02 = 
LoadSubFile(infile02); 16 | 17 | using (StreamWriter wt = new StreamWriter(outfile)) 18 | { 19 | foreach (var pair in iid2rec01) 20 | { 21 | if (!iid2rec02.ContainsKey(pair.Key)) 22 | { 23 | wt.Write("{0}\t",pair.Key); 24 | wt.Write("{0}\n", string.Join(",", pair.Value.ToArray())); 25 | } 26 | else 27 | { 28 | List merge_list = MergeTwoList(pair.Value, iid2rec02[pair.Key], gap, k, start); 29 | wt.Write("{0}\t", pair.Key); 30 | wt.Write("{0}\n", string.Join(",", merge_list.ToArray())); 31 | } 32 | } 33 | 34 | foreach (var pair in iid2rec02) 35 | { 36 | if (!iid2rec01.ContainsKey(pair.Key)) 37 | { 38 | wt.Write("{0}\t", pair.Key); 39 | wt.Write("{0}\n", string.Join(",", pair.Value.ToArray())); 40 | } 41 | } 42 | } 43 | } 44 | 45 | private static List MergeTwoList(List list1, List list2, int gap, int max_k, int start) 46 | { 47 | HashSet visited = new HashSet(); 48 | List res = new List(); 49 | int t01 = 0, t02 = 0; 50 | int cnt01 = list1.Count, cnt02 = list2.Count; 51 | 52 | while (t01 < cnt01 && t01 < start) 53 | { 54 | res.Add(list1[t01]); 55 | visited.Add(list1[t01]); 56 | t01++; 57 | } 58 | 59 | while (t01 < cnt01 && t02 < cnt02) 60 | { 61 | for (int i = 0; i < gap && i+t01 max_k) 98 | { 99 | res = res.GetRange(0, max_k); 100 | } 101 | 102 | return res; 103 | } 104 | 105 | private static Dictionary> LoadSubFile(string infile) 106 | { 107 | Dictionary> res = new Dictionary>(); 108 | 109 | using (StreamReader rd = new StreamReader(infile)) 110 | { 111 | string content = null; 112 | while ((content = rd.ReadLine()) != null) 113 | { 114 | string[] words = content.Split(new char[] { ' ', '\t' }); 115 | if (words.Length < 2) 116 | { 117 | continue; 118 | } 119 | 120 | string[] tokens = words[1].Split(','); 121 | 122 | res.Add(words[0], new List()); 123 | for (int i = 0; i < tokens.Length; i++) 124 | { 125 | res[words[0]].Add(tokens[i]); 126 | } 127 | } 128 | } 129 | 130 | return res; 131 | } 132 | 133 | public static void GenSubFileFromTLCWithAlignment(string infile, string reffile, string outfile) 134 | { 135 | int topk = 100; 136 | var userdict = FeatureFactory.BuildUserDict(); 137 | var itemdict = FeatureFactory.BuildItemDict(); 138 | 139 | Dictionary>> item2userscore = new Dictionary>>(); 140 | int cnt = 0; 141 | using (StreamReader rd01 = new StreamReader(infile)) 142 | using (StreamReader rd02 = new StreamReader(reffile)) 143 | { 144 | string content = rd01.ReadLine(); 145 | while ((content = rd01.ReadLine()) != null) 146 | { 147 | if (cnt++ % 100000 == 0) 148 | { 149 | Console.Write(cnt + "\r"); 150 | } 151 | string[] words = content.Split('\t'); 152 | double score = double.Parse(words[3]); 153 | 154 | string[] tokens = rd02.ReadLine().Split(','); 155 | string uid = tokens[2]; 156 | string iid = tokens[3]; 157 | 158 | 159 | 160 | if (!item2userscore.ContainsKey(iid)) 161 | { 162 | item2userscore.Add(iid, new List>()); 163 | } 164 | 165 | item2userscore[iid].Add(new Tuple(uid, score)); 166 | } 167 | } 168 | 169 | cnt = 0; 170 | using (StreamWriter wt = new StreamWriter(outfile)) 171 | foreach (var iid in item2userscore.Keys) 172 | { 173 | if (cnt++ % 1000 == 0) 174 | { 175 | Console.WriteLine("Item {0}", cnt); 176 | } 177 | var list = item2userscore[iid]; 178 | //if (list.Count < 500) 179 | //{ 180 | // continue; 181 | //} 182 | list.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 183 | int k = Math.Min(topk, list.Count); 184 | wt.Write("{0}\t", iid); 185 | for (int i = 0; i < k - 1; i++) 186 | { 187 | wt.Write("{0},", list[i].Item1); 188 | } 189 | if (k > 0) 190 | wt.Write("{0}\n", list[k - 
1].Item1); 191 | else 192 | wt.Write("\n"); 193 | } 194 | 195 | } 196 | 197 | public static void GenSubFileFromTLC(string infile, string outfile, int[] name_idx, int value_idx, double thre = 0.1, char spliter = '\t', bool hasHeader = true, 198 | Dictionary userdict = null, Dictionary itemdict = null, Func get_score = null) 199 | { 200 | int topk = 100; 201 | if(userdict==null) 202 | userdict = FeatureFactory.BuildUserDict(); 203 | if (itemdict==null) 204 | itemdict = FeatureFactory.BuildItemDict(); 205 | 206 | Dictionary>> item2userscore = new Dictionary>>(); 207 | int cnt = 0; 208 | using (StreamReader rd = new StreamReader(infile)) 209 | { 210 | string content = null; 211 | if(hasHeader) 212 | rd.ReadLine(); 213 | while ((content = rd.ReadLine()) != null) 214 | { 215 | if (cnt++ % 100000 == 0) 216 | { 217 | Console.Write(cnt+"\r"); 218 | } 219 | string[] words = content.Split(spliter); 220 | 221 | string[] tokens = null; 222 | if (name_idx.Length == 1) 223 | { 224 | tokens = words[name_idx[0]].Split('|'); 225 | } 226 | else //if (name_idx.Length == 2) 227 | { 228 | tokens = new string[name_idx.Length]; 229 | for (int t = 0; t < name_idx.Length; t++) 230 | { 231 | tokens[t] = words[name_idx[t]]; 232 | } 233 | } 234 | double score = double.Parse(words[value_idx]); 235 | if (get_score != null) 236 | { 237 | score = get_score(words); 238 | } 239 | 240 | if (!item2userscore.ContainsKey(tokens[1])) 241 | { 242 | item2userscore.Add(tokens[1], new List>()); 243 | } 244 | 245 | if (score > thre ) // && userdict[tokens[0]].title.Intersect(itemdict[tokens[1]].title).Count()>0) 246 | { 247 | item2userscore[tokens[1]].Add(new Tuple(tokens[0], score)); 248 | } 249 | } 250 | } 251 | 252 | cnt = 0; 253 | using(StreamWriter wt = new StreamWriter(outfile)) 254 | foreach (var iid in item2userscore.Keys) 255 | { 256 | if (cnt++ % 1000 == 0) 257 | { 258 | Console.WriteLine("Item {0}", cnt); 259 | } 260 | var list = item2userscore[iid]; 261 | //if (list.Count < 500) 262 | //{ 263 | // continue; 264 | //} 265 | list.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 266 | int k = Math.Min(topk, list.Count); 267 | wt.Write("{0}\t",iid); 268 | for (int i = 0; i < k - 1; i++) 269 | { 270 | wt.Write("{0},", list[i].Item1); 271 | } 272 | if (k > 0) 273 | wt.Write("{0}\n", list[k - 1].Item1); 274 | else 275 | wt.Write("\n"); 276 | } 277 | 278 | } 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /auto-pipeline/model/User.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace RecSys17.model 8 | { 9 | class User 10 | { 11 | public string id; 12 | public HashSet title; 13 | public Dictionary title2cnt; 14 | public int title_cnt; 15 | public string clevel; 16 | public string indus; 17 | public string disc; 18 | public string country; 19 | public string region; 20 | public string experience_n_entries_class; 21 | public string experience_years_experience; 22 | public string experience_years_in_current; 23 | public string edu_degree; 24 | public HashSet edu_fieldofstudies; 25 | public string wtcj; 26 | public string premium ; 27 | 28 | public List> interactions; 29 | public Dictionary viewed_item_title_words; 30 | public double viewed_titem_title_cnt; 31 | 32 | 33 | 34 | public User(){} 35 | public User(string line) 36 | { 37 | string[] words = line.Split('\t'); 38 | 39 | id = words[0]; 40 | title 
= new HashSet(); 41 | title2cnt = new Dictionary(); 42 | title_cnt = 0; 43 | var tokens = words[1].Split(','); 44 | title_cnt = tokens.Length; 45 | foreach (var token in tokens) 46 | { 47 | title.Add(token); 48 | if (!title2cnt.ContainsKey(token)) 49 | { 50 | title2cnt.Add(token, 1.0f / title_cnt); 51 | } 52 | else 53 | { 54 | title2cnt[token] += 1.0f / title_cnt; 55 | } 56 | } 57 | clevel = words[2]; 58 | disc = words[3]; 59 | indus = words[4]; 60 | country = words[5]; 61 | region = words[6]; 62 | experience_n_entries_class = words[7]; 63 | experience_years_experience = words[8]; 64 | experience_years_in_current = words[9]; 65 | edu_degree = words[10]; 66 | edu_fieldofstudies = new HashSet(); 67 | foreach (var token in words[11].Split(',')) 68 | { 69 | //if (token != "000") 70 | { 71 | edu_fieldofstudies.Add(token); 72 | } 73 | } 74 | wtcj = words[12]; 75 | premium = words[13]; 76 | 77 | viewed_titem_title_cnt = 0; 78 | interactions = null; 79 | viewed_item_title_words = null; 80 | } 81 | 82 | public void AddViewItem(Item it, int action){ 83 | if (interactions == null) 84 | { 85 | interactions = new List>(); 86 | viewed_item_title_words = new Dictionary(); 87 | } 88 | 89 | foreach (var pair in it.title2cnt) 90 | { 91 | int tcnt = (int)Math.Round(pair.Value * it.title_cnt); 92 | viewed_titem_title_cnt += tcnt; 93 | if (!viewed_item_title_words.ContainsKey(pair.Key)) 94 | { 95 | viewed_item_title_words.Add(pair.Key, tcnt); 96 | } 97 | else 98 | { 99 | viewed_item_title_words[pair.Key] += tcnt; 100 | } 101 | } 102 | 103 | interactions.Add(new Tuple(it.id, action)); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /auto-pipeline/model/WordHashing.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace RecSys17.model 9 | { 10 | class WordHashing 11 | { 12 | public static void BuildWordHashing() 13 | { 14 | Dictionary userdict = FeatureFactory.BuildUserDict(); 15 | Dictionary itemdict = FeatureFactory.BuildItemDict(); 16 | 17 | Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1); 18 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1); 19 | 20 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\word_hashing.csv"; 21 | 22 | Random rng = new Random(); 23 | int topic_cnt = 200; 24 | Dictionary word2idx = new Dictionary(); 25 | foreach (var pair in item_titlefreq) 26 | { 27 | if (pair.Value >= 20) 28 | { 29 | if (!word2idx.ContainsKey(pair.Key)) 30 | { 31 | word2idx.Add(pair.Key, rng.Next(topic_cnt)); 32 | } 33 | } 34 | } 35 | 36 | int ite_cnt = 50; 37 | for (int ite = 0; ite < ite_cnt; ite++) 38 | { 39 | Console.Write("ite : {0}\r",ite); 40 | foreach (var item in itemdict.Values) 41 | { 42 | if (item.title.Count > 1) 43 | { 44 | List words = new List(); 45 | foreach (var title in item.title) 46 | { 47 | if (word2idx.ContainsKey(title)) 48 | { 49 | words.Add(title); 50 | } 51 | } 52 | if (words.Count > 1) 53 | { 54 | List cur_topics = new List(); 55 | foreach (var word in words) 56 | { 57 | cur_topics.Add(word2idx[word]); 58 | } 59 | int new_topic = cur_topics[rng.Next(words.Count)]; 60 | foreach (var word in words) 61 | { 62 | if(rng.NextDouble()<0.8) 63 | 
word2idx[word] = new_topic; 64 | else 65 | { 66 | word2idx[word] = rng.Next(topic_cnt); 67 | } 68 | } 69 | } 70 | } 71 | } 72 | } 73 | 74 | using (StreamWriter wt = new StreamWriter(outfile)) 75 | { 76 | foreach (var pair in word2idx) 77 | { 78 | wt.WriteLine("{0},{1}",pair.Key,pair.Value); 79 | } 80 | } 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /auto-pipeline/py-pull_and_submit/daily-pull-data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Online example 3 | 4 | Uses the offline mode to make predictions 5 | for the online challenge. 6 | 7 | by Daniel Kohlsdorf 8 | ''' 9 | import urllib.request 10 | import time 11 | import sys 12 | import json 13 | from dateutil.parser import parse 14 | import datetime 15 | import parser 16 | #from recommendation_worker import * 17 | 18 | TMP_ITEMS = "data/current_items.csv" 19 | TMP_SOLUTION = "data/current_solution.csv" 20 | 21 | MODEL = "data/recsys2017.model" # Model from offline training 22 | USERS_FILE = "data/users.csv" # Online user data 23 | 24 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data' 25 | PULL_DATA_LOG_FILE = PULL_DATA_PATH + '\\pull-dates.txt' 26 | 27 | cur_date_flag = '' 28 | 29 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key 30 | SERVER = "https://recsys.xing.com" 31 | 32 | def header(token): 33 | return {"Authorization" : "Bearer %s" %TOKEN} 34 | 35 | def post_url(server): 36 | return server + "/api/online/submission" 37 | 38 | def status_url(server): 39 | return server + "/api/online/data/status" 40 | 41 | def users_url(server): 42 | return server + "/api/online/data/users" 43 | 44 | def items_url(server): 45 | return server + "/api/online/data/items" 46 | 47 | def offline_submission(server): 48 | return server + "/api/submission" 49 | 50 | def get_stats(): 51 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 52 | response=urllib.request.urlopen(req) 53 | content=response.read().decode('utf-8') 54 | response = json.loads(content) 55 | 56 | return parse(response['current']['updated_at']) 57 | 58 | def is_ready(): 59 | global cur_date_flag 60 | existing_dates = set() 61 | with open(PULL_DATA_LOG_FILE,'r') as rd: 62 | for date_str in rd.readlines(): 63 | existing_dates.add(date_str.rstrip()) 64 | status_date = get_stats().date() 65 | cur_date_flag = str(status_date) 66 | print('get_stats().date() = ' + cur_date_flag) 67 | print('datetime.date.today() = ' + str(datetime.date.today())) 68 | 69 | return cur_date_flag not in existing_dates 70 | 71 | def download_items(): 72 | req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 73 | response=urllib.request.urlopen(req) 74 | content=response.read().decode('utf-8') 75 | global cur_date_flag 76 | fp = open(PULL_DATA_PATH+'\\target_items_'+cur_date_flag+'.txt', "w") 77 | fp.write(content) 78 | fp.close() 79 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 80 | 81 | 82 | def user_info(user_ids): 83 | return parser.select( 84 | USERS_FILE, 85 | lambda x: int(x[0]) in user_ids and "NULL" not in x, 86 | parser.build_user, 87 | lambda x: int(x[0]) 88 | ) 89 | 90 | def download_target_users(): 91 | req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 92 | response=urllib.request.urlopen(req) 93 | 
content=response.read().decode('utf-8') 94 | global cur_date_flag 95 | fp = open(PULL_DATA_PATH+'\\target_users_'+cur_date_flag+'.txt', "w") 96 | fp.write(content) 97 | fp.close() 98 | 99 | ''' 100 | user_ids = set([int(uid) for uid in content.split("\n") if len(uid) > 0]) 101 | 102 | with open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt','w') as wt: 103 | for uid in user_ids: 104 | wt.write(str(uid)+"\n") 105 | ''' 106 | #return user_ids 107 | 108 | def process(): 109 | print('downloading data...') 110 | download_target_users() 111 | download_items() 112 | global cur_date_flag 113 | with open(PULL_DATA_LOG_FILE,'a') as wt: 114 | wt.write(cur_date_flag+'\n') 115 | 116 | def offline_submit(filename): 117 | rd = open(filename,'r') 118 | content=rd.read() 119 | rd.close() 120 | content = content.encode('utf-8') 121 | req = urllib.request.Request(url=offline_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 122 | response=urllib.request.urlopen(req) 123 | content=response.read().decode('utf-8') 124 | print(content) 125 | 126 | def submit(): 127 | http = httplib2.Http() 128 | filename = TMP_SOLUTION 129 | with open(filename, 'r') as content_file: 130 | content = content_file.read() 131 | response = http.request(post_url(SERVER), method="POST", body=content, 132 | headers=header(TOKEN) 133 | )[1].decode("utf-8") 134 | print("SUBMIT: " + filename + " " + response) 135 | 136 | def usage_test(): 137 | ''' 138 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 139 | response=urllib.request.urlopen(req) 140 | content=response.read().decode('utf-8') 141 | print(content) 142 | response = json.loads(content) 143 | print(response) 144 | ''' 145 | print(is_ready()) 146 | 147 | process() 148 | 149 | 150 | if __name__ == "__main__": 151 | 152 | #usage_test() 153 | #offline_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\train-test\inter-media\feature\sparse\recsys17-pred-submit.csv') 154 | 155 | 156 | last_submit = None 157 | while True: 158 | try: 159 | if is_ready() and last_submit != datetime.date.today(): 160 | print('data ready.') 161 | process() 162 | last_submit = datetime.date.today() 163 | #submit() 164 | else: 165 | print("Not ready yet: " + str(datetime.date.today())) 166 | time.sleep(600) 167 | except KeyboardInterrupt: 168 | break 169 | except: 170 | print("exception :"+str(sys.exc_info()[0])+"\n") 171 | 172 | 173 | -------------------------------------------------------------------------------- /auto-pipeline/py-pull_and_submit/model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modeling users, interactions and items from 3 | the recsys challenge 2017. 
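
A minimal usage sketch (all ids below are made up, not from the data set);
the six feature values come from the match functions defined on Interaction
further down in this module, and label() treats interaction type 4 as the
negative class:

    >>> u = User([1, 2, 3], 2, 7, 5, "de", "7")
    >>> it = Item([2, 3], 2, 7, 5, "de", "7")
    >>> Interaction(u, it, 1).features()
    [2.0, 1.0, 1.0, 2.0, 1.0, 1.0]
    >>> Interaction(u, it, 4).label()
    0.0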
4 | 5 | by Daniel Kohlsdorf 6 | ''' 7 | 8 | class User: 9 | 10 | def __init__(self, title, clevel, indus, disc, country, region): 11 | self.title = title 12 | self.clevel = clevel 13 | self.indus = indus 14 | self.disc = disc 15 | self.country = country 16 | self.region = region 17 | 18 | class Item: 19 | 20 | def __init__(self, title, clevel, indus, disc, country, region): 21 | self.title = title 22 | self.clevel = clevel 23 | self.indus = indus 24 | self.disc = disc 25 | self.country = country 26 | self.region = region 27 | 28 | class Interaction: 29 | 30 | def __init__(self, user, item, interaction_type): 31 | self.user = user 32 | self.item = item 33 | self.interaction_type = interaction_type 34 | 35 | def title_match(self): 36 | return float(len(set(self.user.title).intersection(set(self.item.title)))) 37 | 38 | def clevel_match(self): 39 | if self.user.clevel == self.item.clevel: 40 | return 1.0 41 | else: 42 | return 0.0 43 | 44 | def indus_match(self): 45 | if self.user.indus == self.item.indus: 46 | return 1.0 47 | else: 48 | return 0.0 49 | 50 | def discipline_match(self): 51 | if self.user.disc == self.item.disc: 52 | return 2.0 53 | else: 54 | return 0.0 55 | 56 | def country_match(self): 57 | if self.user.country == self.item.country: 58 | return 1.0 59 | else: 60 | return 0.0 61 | 62 | def region_match(self): 63 | if self.user.region == self.item.region: 64 | return 1.0 65 | else: 66 | return 0.0 67 | 68 | def features(self): 69 | return [ 70 | self.title_match(), self.clevel_match(), self.indus_match(), 71 | self.discipline_match(), self.country_match(), self.region_match() 72 | ] 73 | 74 | def label(self): 75 | if self.interaction_type == 4: 76 | return 0.0 77 | else: 78 | return 1.0 79 | 80 | 81 | -------------------------------------------------------------------------------- /auto-pipeline/py-pull_and_submit/online_submit_auto.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Online example 3 | 4 | Uses the offline mode to make predictions 5 | for the online challenge. 
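
The files posted by submit_file_online below are plain text with one target
item per line: the item id, a tab, then a comma-joined list of recommended
user ids. A tiny sketch of composing and checking one such line (ids are
made up):

    >>> line = "1001" + "\t" + "42,7,99"
    >>> line.split("\t")
    ['1001', '42,7,99']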
6 | 7 | by Daniel Kohlsdorf 8 | ''' 9 | import urllib.request 10 | import time 11 | import sys 12 | 13 | import json 14 | from dateutil.parser import parse 15 | import datetime 16 | import parser 17 | #from recommendation_worker import * 18 | 19 | TMP_ITEMS = "data/current_items.csv" 20 | TMP_SOLUTION = "data/current_solution.csv" 21 | 22 | MODEL = "data/recsys2017.model" # Model from offline training 23 | USERS_FILE = "data/users.csv" # Online user data 24 | 25 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data' 26 | 27 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key 28 | SERVER = "https://recsys.xing.com" 29 | 30 | def header(token): 31 | return {"Authorization" : "Bearer %s" %TOKEN} 32 | 33 | def post_url(server): 34 | return server + "/api/online/submission" 35 | 36 | def status_url(server): 37 | return server + "/api/online/data/status" 38 | 39 | def users_url(server): 40 | return server + "/api/online/data/users" 41 | 42 | def items_url(server): 43 | return server + "/api/online/data/items" 44 | 45 | def interaction_url(server): 46 | return server + "/api/online/data/interactions" 47 | 48 | def offline_submission(server): 49 | return server + "/api/submission" 50 | def online_submission(server): 51 | return server + "/api/online/submission" 52 | 53 | 54 | def get_stats(): 55 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 56 | response=urllib.request.urlopen(req) 57 | content=response.read().decode('utf-8') 58 | response = json.loads(content) 59 | 60 | return parse(response['current']['updated_at']) 61 | 62 | def is_ready(): 63 | status_date = get_stats().date() 64 | print('get_stats().date() = ' + str(status_date)) 65 | print('datetime.date.today() = ' + str(datetime.date.today())) 66 | 67 | return status_date == datetime.date.today() 68 | 69 | def download_items(): 70 | req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 71 | response=urllib.request.urlopen(req) 72 | content=response.read().decode('utf-8') 73 | 74 | fp = open(PULL_DATA_PATH+'\\target_items_'+datetime.date.today().isoformat()+'.txt', "w") 75 | fp.write(content) 76 | fp.close() 77 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 78 | 79 | def download_acceptsubmission(): 80 | req = urllib.request.Request(online_submission(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 81 | response=urllib.request.urlopen(req) 82 | content=response.read().decode('utf-8') 83 | 84 | fp = open(PULL_DATA_PATH+'\\accepted_pairs\\accepted_pairs_'+datetime.date.today().isoformat()+'.txt', "w") 85 | fp.write(content) 86 | fp.close() 87 | 88 | def download_interactions(): 89 | req = urllib.request.Request(interaction_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 90 | response=urllib.request.urlopen(req) 91 | content=response.read().decode('utf-8') 92 | 93 | fp = open(PULL_DATA_PATH+'\\interactions\\interaction_'+datetime.date.today().isoformat()+'.txt', "w") 94 | fp.write(content) 95 | fp.close() 96 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 97 | 98 | 99 | def user_info(user_ids): 100 | return parser.select( 101 | USERS_FILE, 102 | lambda x: int(x[0]) in user_ids and "NULL" not in x, 103 | parser.build_user, 104 | lambda x: int(x[0]) 105 | ) 106 | 107 | def download_target_users(): 108 | req = urllib.request.Request(users_url(SERVER), None, 
{"Authorization": "Bearer %s" %TOKEN}) 109 | response=urllib.request.urlopen(req) 110 | content=response.read().decode('utf-8') 111 | 112 | fp = open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt', "w") 113 | fp.write(content) 114 | fp.close() 115 | 116 | ''' 117 | user_ids = set([int(uid) for uid in content.split("\n") if len(uid) > 0]) 118 | 119 | with open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt','w') as wt: 120 | for uid in user_ids: 121 | wt.write(str(uid)+"\n") 122 | ''' 123 | #return user_ids 124 | 125 | def process(): 126 | download_target_users() 127 | download_items() 128 | 129 | def offline_submit(filename): 130 | rd = open(filename,'r') 131 | content=rd.read() 132 | rd.close() 133 | content = content.encode('utf-8') 134 | req = urllib.request.Request(url=offline_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 135 | response=urllib.request.urlopen(req) 136 | content=response.read().decode('utf-8') 137 | print(content) 138 | 139 | def online_submit(filename): 140 | rd = open(filename,'r') 141 | content=rd.read() 142 | rd.close() 143 | content = content.encode('utf-8') 144 | req = urllib.request.Request(url=online_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 145 | response=urllib.request.urlopen(req) 146 | content=response.read().decode('utf-8') 147 | print(content) 148 | 149 | def submit(): 150 | http = httplib2.Http() 151 | filename = TMP_SOLUTION 152 | with open(filename, 'r') as content_file: 153 | content = content_file.read() 154 | response = http.request(post_url(SERVER), method="POST", body=content, 155 | headers=header(TOKEN) 156 | )[1].decode("utf-8") 157 | print("SUBMIT: " + filename + " " + response) 158 | 159 | def usage_test(): 160 | ''' 161 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 162 | response=urllib.request.urlopen(req) 163 | content=response.read().decode('utf-8') 164 | print(content) 165 | response = json.loads(content) 166 | print(response) 167 | ''' 168 | print(is_ready()) 169 | 170 | process() 171 | 172 | def submit_file_online(file): 173 | while True: 174 | try: 175 | print("submitting "+file) 176 | online_submit(file) 177 | print("submitting successfully") 178 | break 179 | except KeyboardInterrupt: 180 | break 181 | except: 182 | print("exception :"+str(sys.exc_info()[0])+"\n") 183 | 184 | 185 | if __name__ == "__main__": 186 | 187 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit.csv') 188 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v1.csv') 189 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v2.csv') 190 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v3.csv') 191 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v4.csv') 192 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v5.csv') 193 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v6.csv') 194 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v7.csv') 195 | 196 | while True: 197 | try: 198 | 
download_acceptsubmission(); 199 | break 200 | except KeyboardInterrupt: 201 | break 202 | except: 203 | print("exception :"+str(sys.exc_info()[0])+"\n") -------------------------------------------------------------------------------- /auto-pipeline/py-pull_and_submit/online_submit_auto_-1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Online example 3 | 4 | Uses the offline mode to make predictions 5 | for the online challenge. 6 | 7 | by Daniel Kohlsdorf 8 | ''' 9 | import urllib.request 10 | import time 11 | import sys 12 | 13 | import json 14 | from dateutil.parser import parse 15 | import datetime 16 | import parser 17 | #from recommendation_worker import * 18 | 19 | TMP_ITEMS = "data/current_items.csv" 20 | TMP_SOLUTION = "data/current_solution.csv" 21 | 22 | MODEL = "data/recsys2017.model" # Model from offline training 23 | USERS_FILE = "data/users.csv" # Online user data 24 | 25 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data' 26 | 27 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key 28 | SERVER = "https://recsys.xing.com" 29 | 30 | def header(token): 31 | return {"Authorization" : "Bearer %s" %TOKEN} 32 | 33 | def post_url(server): 34 | return server + "/api/online/submission" 35 | 36 | def status_url(server): 37 | return server + "/api/online/data/status" 38 | 39 | def users_url(server): 40 | return server + "/api/online/data/users" 41 | 42 | def items_url(server): 43 | return server + "/api/online/data/items" 44 | 45 | def interaction_url(server): 46 | return server + "/api/online/data/interactions" 47 | 48 | def offline_submission(server): 49 | return server + "/api/submission" 50 | def online_submission(server): 51 | return server + "/api/online/submission" 52 | 53 | 54 | def get_stats(): 55 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 56 | response=urllib.request.urlopen(req) 57 | content=response.read().decode('utf-8') 58 | response = json.loads(content) 59 | 60 | return parse(response['current']['updated_at']) 61 | 62 | def is_ready(): 63 | status_date = get_stats().date() 64 | print('get_stats().date() = ' + str(status_date)) 65 | print('datetime.date.today() = ' + str(datetime.date.today())) 66 | 67 | return status_date == datetime.date.today() 68 | 69 | def download_items(): 70 | req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 71 | response=urllib.request.urlopen(req) 72 | content=response.read().decode('utf-8') 73 | 74 | fp = open(PULL_DATA_PATH+'\\target_items_'+datetime.date.today().isoformat()+'.txt', "w") 75 | fp.write(content) 76 | fp.close() 77 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 78 | 79 | def download_acceptsubmission(): 80 | req = urllib.request.Request(online_submission(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 81 | response=urllib.request.urlopen(req) 82 | content=response.read().decode('utf-8') 83 | 84 | fp = open(PULL_DATA_PATH+'\\accepted_pairs\\accepted_pairs_'+datetime.date.today().isoformat()+'.txt', "w") 85 | fp.write(content) 86 | fp.close() 87 | 88 | def download_interactions(): 89 | req = urllib.request.Request(interaction_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 90 | response=urllib.request.urlopen(req) 91 | content=response.read().decode('utf-8') 92 | 93 | fp = 
open(PULL_DATA_PATH+'\\interactions\\interaction_'+datetime.date.today().isoformat()+'.txt', "w") 94 | fp.write(content) 95 | fp.close() 96 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 97 | 98 | 99 | def user_info(user_ids): 100 | return parser.select( 101 | USERS_FILE, 102 | lambda x: int(x[0]) in user_ids and "NULL" not in x, 103 | parser.build_user, 104 | lambda x: int(x[0]) 105 | ) 106 | 107 | def download_target_users(): 108 | req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 109 | response=urllib.request.urlopen(req) 110 | content=response.read().decode('utf-8') 111 | 112 | fp = open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt', "w") 113 | fp.write(content) 114 | fp.close() 115 | 116 | ''' 117 | user_ids = set([int(uid) for uid in content.split("\n") if len(uid) > 0]) 118 | 119 | with open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt','w') as wt: 120 | for uid in user_ids: 121 | wt.write(str(uid)+"\n") 122 | ''' 123 | #return user_ids 124 | 125 | def process(): 126 | download_target_users() 127 | download_items() 128 | 129 | def offline_submit(filename): 130 | rd = open(filename,'r') 131 | content=rd.read() 132 | rd.close() 133 | content = content.encode('utf-8') 134 | req = urllib.request.Request(url=offline_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 135 | response=urllib.request.urlopen(req) 136 | content=response.read().decode('utf-8') 137 | print(content) 138 | 139 | def online_submit(filename): 140 | rd = open(filename,'r') 141 | content=rd.read() 142 | rd.close() 143 | content = content.encode('utf-8') 144 | req = urllib.request.Request(url=online_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 145 | response=urllib.request.urlopen(req) 146 | content=response.read().decode('utf-8') 147 | print(content) 148 | 149 | def submit(): 150 | http = httplib2.Http() 151 | filename = TMP_SOLUTION 152 | with open(filename, 'r') as content_file: 153 | content = content_file.read() 154 | response = http.request(post_url(SERVER), method="POST", body=content, 155 | headers=header(TOKEN) 156 | )[1].decode("utf-8") 157 | print("SUBMIT: " + filename + " " + response) 158 | 159 | def usage_test(): 160 | ''' 161 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 162 | response=urllib.request.urlopen(req) 163 | content=response.read().decode('utf-8') 164 | print(content) 165 | response = json.loads(content) 166 | print(response) 167 | ''' 168 | print(is_ready()) 169 | 170 | process() 171 | 172 | def submit_file_online(file): 173 | while True: 174 | try: 175 | print("submitting "+file) 176 | online_submit(file) 177 | print("submitting successfully") 178 | break 179 | except KeyboardInterrupt: 180 | break 181 | except: 182 | print("exception :"+str(sys.exc_info()[0])+"\n") 183 | 184 | 185 | if __name__ == "__main__": 186 | 187 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v-1.csv') -------------------------------------------------------------------------------- /auto-pipeline/py-pull_and_submit/online_submit_auto_2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Online example 3 | 4 | Uses the offline mode to make predictions 5 | for the online challenge. 
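
submit_file_online below retries a failed POST in a tight loop, which can
spin quickly if the server keeps erroring. A sketch of the same pattern with
a delay between attempts (not part of the original pipeline; it only uses
online_submit and time, both available in this module):

    def submit_with_retry(path, wait_seconds=60):
        while True:
            try:
                online_submit(path)
                return
            except KeyboardInterrupt:
                raise
            except Exception as exc:
                print("submit failed (%s), retrying in %ds" % (exc, wait_seconds))
                time.sleep(wait_seconds)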
6 | 7 | by Daniel Kohlsdorf 8 | ''' 9 | import urllib.request 10 | import time 11 | import sys 12 | 13 | import json 14 | from dateutil.parser import parse 15 | import datetime 16 | import parser 17 | #from recommendation_worker import * 18 | 19 | TMP_ITEMS = "data/current_items.csv" 20 | TMP_SOLUTION = "data/current_solution.csv" 21 | 22 | MODEL = "data/recsys2017.model" # Model from offline training 23 | USERS_FILE = "data/users.csv" # Online user data 24 | 25 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data' 26 | 27 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key 28 | SERVER = "https://recsys.xing.com" 29 | 30 | def header(token): 31 | return {"Authorization" : "Bearer %s" %TOKEN} 32 | 33 | def post_url(server): 34 | return server + "/api/online/submission" 35 | 36 | def status_url(server): 37 | return server + "/api/online/data/status" 38 | 39 | def users_url(server): 40 | return server + "/api/online/data/users" 41 | 42 | def items_url(server): 43 | return server + "/api/online/data/items" 44 | 45 | def interaction_url(server): 46 | return server + "/api/online/data/interactions" 47 | 48 | def offline_submission(server): 49 | return server + "/api/submission" 50 | def online_submission(server): 51 | return server + "/api/online/submission" 52 | 53 | 54 | def get_stats(): 55 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 56 | response=urllib.request.urlopen(req) 57 | content=response.read().decode('utf-8') 58 | response = json.loads(content) 59 | 60 | return parse(response['current']['updated_at']) 61 | 62 | def is_ready(): 63 | status_date = get_stats().date() 64 | print('get_stats().date() = ' + str(status_date)) 65 | print('datetime.date.today() = ' + str(datetime.date.today())) 66 | 67 | return status_date == datetime.date.today() 68 | 69 | def download_items(): 70 | req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 71 | response=urllib.request.urlopen(req) 72 | content=response.read().decode('utf-8') 73 | 74 | fp = open(PULL_DATA_PATH+'\\target_items_'+datetime.date.today().isoformat()+'.txt', "w") 75 | fp.write(content) 76 | fp.close() 77 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 78 | 79 | def download_acceptsubmission(): 80 | req = urllib.request.Request(online_submission(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 81 | response=urllib.request.urlopen(req) 82 | content=response.read().decode('utf-8') 83 | 84 | fp = open(PULL_DATA_PATH+'\\accepted_pairs\\accepted_pairs_'+datetime.date.today().isoformat()+'.txt', "w") 85 | fp.write(content) 86 | fp.close() 87 | 88 | def download_interactions(): 89 | req = urllib.request.Request(interaction_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 90 | response=urllib.request.urlopen(req) 91 | content=response.read().decode('utf-8') 92 | 93 | fp = open(PULL_DATA_PATH+'\\interactions\\interaction_'+datetime.date.today().isoformat()+'.txt', "w") 94 | fp.write(content) 95 | fp.close() 96 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 97 | 98 | 99 | def user_info(user_ids): 100 | return parser.select( 101 | USERS_FILE, 102 | lambda x: int(x[0]) in user_ids and "NULL" not in x, 103 | parser.build_user, 104 | lambda x: int(x[0]) 105 | ) 106 | 107 | def download_target_users(): 108 | req = urllib.request.Request(users_url(SERVER), None, 
{"Authorization": "Bearer %s" %TOKEN}) 109 | response=urllib.request.urlopen(req) 110 | content=response.read().decode('utf-8') 111 | 112 | fp = open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt', "w") 113 | fp.write(content) 114 | fp.close() 115 | 116 | ''' 117 | user_ids = set([int(uid) for uid in content.split("\n") if len(uid) > 0]) 118 | 119 | with open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt','w') as wt: 120 | for uid in user_ids: 121 | wt.write(str(uid)+"\n") 122 | ''' 123 | #return user_ids 124 | 125 | def process(): 126 | download_target_users() 127 | download_items() 128 | 129 | def offline_submit(filename): 130 | rd = open(filename,'r') 131 | content=rd.read() 132 | rd.close() 133 | content = content.encode('utf-8') 134 | req = urllib.request.Request(url=offline_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 135 | response=urllib.request.urlopen(req) 136 | content=response.read().decode('utf-8') 137 | print(content) 138 | 139 | def online_submit(filename): 140 | rd = open(filename,'r') 141 | content=rd.read() 142 | rd.close() 143 | content = content.encode('utf-8') 144 | req = urllib.request.Request(url=online_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 145 | response=urllib.request.urlopen(req) 146 | content=response.read().decode('utf-8') 147 | print(content) 148 | 149 | def submit(): 150 | http = httplib2.Http() 151 | filename = TMP_SOLUTION 152 | with open(filename, 'r') as content_file: 153 | content = content_file.read() 154 | response = http.request(post_url(SERVER), method="POST", body=content, 155 | headers=header(TOKEN) 156 | )[1].decode("utf-8") 157 | print("SUBMIT: " + filename + " " + response) 158 | 159 | def usage_test(): 160 | ''' 161 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 162 | response=urllib.request.urlopen(req) 163 | content=response.read().decode('utf-8') 164 | print(content) 165 | response = json.loads(content) 166 | print(response) 167 | ''' 168 | print(is_ready()) 169 | 170 | process() 171 | 172 | def submit_file_online(file): 173 | while True: 174 | try: 175 | print("submitting "+file) 176 | online_submit(file) 177 | print("submitting successfully") 178 | break 179 | except KeyboardInterrupt: 180 | break 181 | except: 182 | print("exception :"+str(sys.exc_info()[0])+"\n") 183 | 184 | 185 | if __name__ == "__main__": 186 | 187 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit.csv') 188 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit_v1.csv') 189 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit_v2.csv') 190 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit_v3.csv') 191 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit_v4.csv') 192 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit_v5.csv') 193 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit_v6.csv') 194 | submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit_v7.csv') 195 | 196 | while True: 197 | 
try: 198 | download_acceptsubmission(); 199 | break 200 | except KeyboardInterrupt: 201 | break 202 | except: 203 | print("exception :"+str(sys.exc_info()[0])+"\n") -------------------------------------------------------------------------------- /auto-pipeline/py-pull_and_submit/parser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Parsing the ACM Recsys Challenge 2017 data into interactions, 3 | items and user models. 4 | 5 | by Daniel Kohlsdorf 6 | ''' 7 | 8 | from model import * 9 | 10 | def is_header(line): 11 | return "recsyschallenge" in line 12 | 13 | def process_header(header): 14 | x = {} 15 | pos = 0 16 | for name in header: 17 | x[name.split(".")[1]] = pos 18 | pos += 1 19 | return x 20 | 21 | def select(from_file, where, toObject, index): 22 | header = None 23 | data = {} 24 | i = 0 25 | for line in open(from_file): 26 | if is_header(line): 27 | header = process_header(line.strip().split("\t")) 28 | elif len(line.strip()) > 0 and header != None: 29 | cmp = line.strip().split("\t") 30 | if where(cmp) and len(cmp) == len(header): 31 | obj = toObject(cmp, header) 32 | if obj != None: 33 | data[index(cmp)] = obj 34 | i += 1 35 | if i % 100000 == 0: 36 | print("... reading line " + str(i) + " from file " + from_file) 37 | return(header, data) 38 | 39 | def build_user(str_user, names): 40 | return User( 41 | [int(x) for x in str_user[names["jobroles"]].split(",") if len(x) > 0], 42 | int(str_user[names["career_level"]]), 43 | int(str_user[names["industry_id"]]), 44 | int(str_user[names["discipline_id"]]), 45 | str_user[names["country"]], 46 | str_user[names["region"]] 47 | ) 48 | 49 | def build_item(str_item, names): 50 | return Item( 51 | [int(x) for x in str_item[names["title"]].split(",") if len(x) > 0], 52 | int(str_item[names["career_level"]]), 53 | int(str_item[names["industry_id"]]), 54 | int(str_item[names["discipline_id"]]), 55 | str_item[names["country"]], 56 | str_item[names["region"]] 57 | ) 58 | 59 | class InteractionBuilder: 60 | 61 | def __init__(self, user_dict, item_dict): 62 | self.user_dict = user_dict 63 | self.item_dict = item_dict 64 | 65 | def build_interaction(self, str_inter, names): 66 | if int(str_inter[names['item_id']]) in self.item_dict and int(str_inter[names['user_id']]) in self.user_dict: 67 | return Interaction( 68 | self.user_dict[int(str_inter[names['user_id']])], 69 | self.item_dict[int(str_inter[names['item_id']])], 70 | int(str_inter[names["interaction_type"]]) 71 | ) 72 | else: 73 | return None 74 | 75 | 76 | -------------------------------------------------------------------------------- /auto-pipeline/py-pull_and_submit/recsys-submit-file.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Online example 3 | 4 | Uses the offline mode to make predictions 5 | for the online challenge. 
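
is_ready() below compares the date of the status endpoint's
current.updated_at field against today's date. A self-contained sketch of
that parse on a captured-style payload (the timestamp is made up):

    >>> import json, datetime
    >>> from dateutil.parser import parse
    >>> payload = json.loads('{"current": {"updated_at": "2017-05-04T06:00:00Z"}}')
    >>> parse(payload['current']['updated_at']).date() == datetime.date(2017, 5, 4)
    True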
6 | 7 | by Daniel Kohlsdorf 8 | ''' 9 | import urllib.request 10 | import time 11 | 12 | import json 13 | from dateutil.parser import parse 14 | import datetime 15 | import parser 16 | #from recommendation_worker import * 17 | 18 | TMP_ITEMS = "data/current_items.csv" 19 | TMP_SOLUTION = "data/current_solution.csv" 20 | 21 | MODEL = "data/recsys2017.model" # Model from offline training 22 | USERS_FILE = "data/users.csv" # Online user data 23 | 24 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data' 25 | 26 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key 27 | SERVER = "https://recsys.xing.com" 28 | 29 | def header(token): 30 | return {"Authorization" : "Bearer %s" %TOKEN} 31 | 32 | def post_url(server): 33 | return server + "/api/online/submission" 34 | 35 | def status_url(server): 36 | return server + "/api/online/data/status" 37 | 38 | def users_url(server): 39 | return server + "/api/online/data/users" 40 | 41 | def items_url(server): 42 | return server + "/api/online/data/items" 43 | 44 | def interaction_url(server): 45 | return server + "/api/online/data/interactions" 46 | 47 | def offline_submission(server): 48 | return server + "/api/submission" 49 | def online_submission(server): 50 | return server + "/api/online/submission" 51 | 52 | 53 | def get_stats(): 54 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 55 | response=urllib.request.urlopen(req) 56 | content=response.read().decode('utf-8') 57 | response = json.loads(content) 58 | 59 | return parse(response['current']['updated_at']) 60 | 61 | def is_ready(): 62 | status_date = get_stats().date() 63 | print('get_stats().date() = ' + str(status_date)) 64 | print('datetime.date.today() = ' + str(datetime.date.today())) 65 | 66 | return status_date == datetime.date.today() 67 | 68 | def download_items(): 69 | req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 70 | response=urllib.request.urlopen(req) 71 | content=response.read().decode('utf-8') 72 | 73 | fp = open(PULL_DATA_PATH+'\\target_items_'+datetime.date.today().isoformat()+'.txt', "w") 74 | fp.write(content) 75 | fp.close() 76 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 77 | 78 | def download_acceptsubmission(): 79 | req = urllib.request.Request(online_submission(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 80 | response=urllib.request.urlopen(req) 81 | content=response.read().decode('utf-8') 82 | 83 | fp = open(PULL_DATA_PATH+'\\accepted_pairs\\accepted_pairs_'+datetime.date.today().isoformat()+'.txt', "w") 84 | fp.write(content) 85 | fp.close() 86 | 87 | def download_interactions(): 88 | req = urllib.request.Request(interaction_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 89 | response=urllib.request.urlopen(req) 90 | content=response.read().decode('utf-8') 91 | 92 | fp = open(PULL_DATA_PATH+'\\interactions\\interaction_'+datetime.date.today().isoformat()+'.txt', "w") 93 | fp.write(content) 94 | fp.close() 95 | #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0])) 96 | 97 | 98 | def user_info(user_ids): 99 | return parser.select( 100 | USERS_FILE, 101 | lambda x: int(x[0]) in user_ids and "NULL" not in x, 102 | parser.build_user, 103 | lambda x: int(x[0]) 104 | ) 105 | 106 | def download_target_users(): 107 | req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" 
%TOKEN}) 108 | response=urllib.request.urlopen(req) 109 | content=response.read().decode('utf-8') 110 | 111 | fp = open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt', "w") 112 | fp.write(content) 113 | fp.close() 114 | 115 | ''' 116 | user_ids = set([int(uid) for uid in content.split("\n") if len(uid) > 0]) 117 | 118 | with open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt','w') as wt: 119 | for uid in user_ids: 120 | wt.write(str(uid)+"\n") 121 | ''' 122 | #return user_ids 123 | 124 | def process(): 125 | download_target_users() 126 | download_items() 127 | 128 | def offline_submit(filename): 129 | rd = open(filename,'r') 130 | content=rd.read() 131 | rd.close() 132 | content = content.encode('utf-8') 133 | req = urllib.request.Request(url=offline_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 134 | response=urllib.request.urlopen(req) 135 | content=response.read().decode('utf-8') 136 | print(content) 137 | 138 | def online_submit(filename): 139 | rd = open(filename,'r') 140 | content=rd.read() 141 | rd.close() 142 | content = content.encode('utf-8') 143 | req = urllib.request.Request(url=online_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST') 144 | response=urllib.request.urlopen(req) 145 | content=response.read().decode('utf-8') 146 | print(content) 147 | 148 | def submit(): 149 | http = httplib2.Http() 150 | filename = TMP_SOLUTION 151 | with open(filename, 'r') as content_file: 152 | content = content_file.read() 153 | response = http.request(post_url(SERVER), method="POST", body=content, 154 | headers=header(TOKEN) 155 | )[1].decode("utf-8") 156 | print("SUBMIT: " + filename + " " + response) 157 | 158 | def usage_test(): 159 | ''' 160 | req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN}) 161 | response=urllib.request.urlopen(req) 162 | content=response.read().decode('utf-8') 163 | print(content) 164 | response = json.loads(content) 165 | print(response) 166 | ''' 167 | print(is_ready()) 168 | 169 | process() 170 | 171 | 172 | if __name__ == "__main__": 173 | 174 | #usage_test() 175 | 176 | path = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\camera_ready' 177 | offline_submit(path+r'\test-FT_FT_laglng_premium0.005_submit_complete.txt') 178 | 179 | 180 | #download_interactions() 181 | #download_acceptsubmission(); 182 | 183 | #online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit.csv') 184 | r''' 185 | online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v1.csv') 186 | online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v2.csv') 187 | online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v3.csv') 188 | online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v4.csv') 189 | online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v5.csv') 190 | online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v6.csv') 191 | online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v7.csv') 192 | ''' 193 | 
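    # A pre-flight shape check before POSTing (a sketch, not part of the
    # original pipeline): every submission line should look like
    # 'item_id<TAB>user_id,user_id,...'. Note that submit() above relies on
    # httplib2, which this script never imports, so it would fail if called.
    def looks_like_submission(filename):
        with open(filename, 'r') as rd:
            for line in rd:
                if line.strip() and len(line.rstrip('\n').split('\t')) != 2:
                    return False
        return True
    # e.g. guard a submission:  if looks_like_submission(f): offline_submit(f)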
#online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v-1.csv') 194 | 195 | ''' 196 | last_submit = None 197 | while True: 198 | if is_ready() and last_submit != datetime.date.today(): 199 | process() 200 | last_submit = datetime.date.today() 201 | #submit() 202 | else: 203 | print("Not ready yet: " + str(datetime.date.today())) 204 | time.sleep(600) 205 | ''' 206 | 207 | -------------------------------------------------------------------------------- /models/StudyNDCG.script.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.SCOPE.Types; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Text; 6 | using ScopeRuntime; 7 | -------------------------------------------------------------------------------- /models/TEST_Localmodel_tlc3_pipeline.script: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | //// for cosmos09 TLC3.7 6 | #IF(EXISTS("local/users/v-lianji/TLC3.7/Tlc3Scope.module")) 7 | MODULE "local/users/v-lianji/TLC3.7/Tlc3Scope.module" AS Tlc3Learner; 8 | RESOURCE @"local/users/v-lianji/TLC3.7/Microsoft.MachineLearning.Garage.dll"; 9 | #ELSE 10 | /// cosmos14 11 | MODULE "/shares/CML.TLC/TLC/TLC-3.7.162.86/Tlc3Scope.module" AS Tlc3Learner; 12 | RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/Microsoft.MachineLearning.Garage.dll"; 13 | RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/libxgboost.dll"; 14 | RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/Microsoft.MachineLearning.XGBoost.dll"; 15 | #ENDIF 16 | 17 | 18 | 19 | #DECLARE path string = "local/users/v-lianji/camera_ready/train-val-online/overfitting/"; 20 | 21 | 22 | 23 | #DECLARE ModelString string = @path + "FT_p50_haslatlng.zip"; // 24 | RESOURCE @ModelString; 25 | 26 | #DECLARE TestDataFile string = @path+ "test_complete_0_overfitting_highdim.svmlight.csv"; 27 | #DECLARE ValidDataFile string = @path+ "valid02_overfitting_highdim.svmlight.csv"; 28 | 29 | #DECLARE out_submit_file_v0 string = @path+"results/FT_submit"+".csv"; 30 | #DECLARE out_submit_file string = @path+"results/FT_test_submit_top1.csv"; 31 | #DECLARE PredOut string = @path+"results/FT_test.predictions.tsv"; 32 | #DECLARE PredOut_valid string = @path+"results/FT_valid.predictions.tsv"; 33 | #DECLARE MetricsOut string = @path+"results/FT_valid.metrics.ss"; 34 | 35 | 36 | data = 37 | EXTRACT Line : string 38 | FROM @ValidDataFile 39 | USING DefaultTextExtractor("-d", "\n"); 40 | scoredTest = 41 | PROCESS data 42 | PRODUCE 43 | Comment, 44 | Label, 45 | Score, 46 | Probability 47 | USING TlcScoringProcessor("loader+", "in=FT_p50_haslatlng.zip"); //useLoader=+ 48 | 49 | OUTPUT 50 | TO @PredOut_valid 51 | USING DefaultTextOutputter(); 52 | 53 | metrics = 54 | REDUCE scoredTest ALL 55 | USING TlcEvaluatingReducer( 56 | // The binary evaluator is being used, 57 | // and the score and probability columns are specified. 58 | "eval=bin{prob=Probability score=Score}", 59 | // The label column. 
60 | "lab=Label"); 61 | OUTPUT metrics 62 | TO SSTREAM @MetricsOut ; 63 | 64 | 65 | 66 | 67 | data = 68 | EXTRACT Line : string 69 | FROM @TestDataFile 70 | USING DefaultTextExtractor("-d", "\n"); 71 | 72 | 73 | scoredTest = 74 | PROCESS data 75 | PRODUCE 76 | Comment, 77 | Label, 78 | Score, 79 | Probability 80 | USING TlcScoringProcessor("loader+", "in=FT_p50_haslatlng.zip"); //useLoader=+ 81 | 82 | OUTPUT 83 | TO @PredOut 84 | USING DefaultTextOutputter(); 85 | 86 | 87 | 88 | preds = SELECT MyHelper.GetUserId(Comment) AS uid, 89 | MyHelper.GetItemId(Comment) AS iid, 90 | Probability 91 | FROM scoredTest; 92 | 93 | 94 | 95 | REDUCE preds 96 | ON iid 97 | USING SubmissionFormater(); 98 | 99 | OUTPUT 100 | TO @out_submit_file_v0 101 | USING DefaultTextOutputter(delimiter: '\t'); 102 | 103 | // 104 | //////// since each user can receive at most one recommendation 105 | tpreds = 106 | REDUCE preds 107 | ON iid 108 | USING TopKSelector(); 109 | 110 | REDUCE tpreds 111 | ON holder 112 | USING OnlineSubmissionFormater(); 113 | 114 | OUTPUT 115 | TO @out_submit_file 116 | USING DefaultTextOutputter(delimiter: '\t'); 117 | -------------------------------------------------------------------------------- /models/TEST_Localmodel_tlc3_pipeline.script.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.SCOPE.Types; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Text; 6 | using ScopeRuntime; 7 | 8 | public static class MyHelper 9 | { 10 | public static string GetUserId(string str) 11 | { 12 | int idx = str.IndexOf(","); 13 | return str.Substring(0, idx); 14 | } 15 | 16 | public static string GetItemId(string str) 17 | { 18 | int idx = str.IndexOf(","); 19 | return str.Substring(idx + 1); 20 | } 21 | } 22 | 23 | 24 | 25 | public class SubmissionFormater : Reducer 26 | { 27 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input) 28 | { 29 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestdColumns: {0}", string.Join(",", requestedColumns)); 30 | return new Schema( 31 | "ItemId:string,TopUserId:string" 32 | ); 33 | } 34 | 35 | public override IEnumerable Reduce(RowSet input, Row outputRow, string[] args) 36 | { 37 | int topk = 100; 38 | 39 | string iid = ""; 40 | 41 | List> uid_score_list = new List>(); 42 | 43 | 44 | foreach (Row row in input.Rows) 45 | { 46 | iid = row[1].String; 47 | 48 | string uid = row[0].String; 49 | 50 | float score = row[2].Float; 51 | 52 | uid_score_list.Add(new Tuple(uid, score)); 53 | } 54 | 55 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 56 | int k = Math.Min(topk, uid_score_list.Count); 57 | 58 | string value = ""; 59 | for (int i = 0; i < k; i++) 60 | { 61 | value += "," + uid_score_list[i].Item1; 62 | } 63 | 64 | if (value.Length > 0) 65 | { 66 | outputRow[0].Set(iid); 67 | outputRow[1].Set(value.Substring(1)); 68 | } 69 | else 70 | { 71 | outputRow[0].Set(iid); 72 | outputRow[1].Set(""); 73 | } 74 | 75 | yield return outputRow; 76 | 77 | } 78 | } 79 | 80 | 81 | 82 | 83 | public class OnlineSubmissionFormater : Reducer 84 | { 85 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input) 86 | { 87 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestdColumns: {0}", string.Join(",", requestedColumns)); 88 | return new Schema( 89 | "ItemId:string,TopUserId:string" 90 | ); 91 | } 92 | 93 | public override IEnumerable Reduce(RowSet input, Row outputRow, string[] args) 94 | { 95 | int topk = 
250; 96 | 97 | string iid = ""; 98 | string uid = ""; 99 | float score = 0; 100 | 101 | List> score_list = new List>(); 102 | 103 | 104 | foreach (Row row in input.Rows) 105 | { 106 | iid = row[1].String; 107 | 108 | uid = row[0].String; 109 | 110 | score = row[2].Float; 111 | 112 | score_list.Add(new Tuple(uid, iid, score)); 113 | } 114 | 115 | score_list.Sort((a, b) => b.Item3.CompareTo(a.Item3)); 116 | 117 | Dictionary> iid2uids = new Dictionary>(); 118 | HashSet visited_uids = new HashSet(); 119 | foreach (var tuple in score_list) 120 | { 121 | uid = tuple.Item1; 122 | iid = tuple.Item2; 123 | if (!visited_uids.Contains(uid)) 124 | { 125 | if (!iid2uids.ContainsKey(iid)) 126 | { 127 | iid2uids.Add(iid, new List()); 128 | } 129 | if (iid2uids[iid].Count < topk) 130 | { 131 | iid2uids[iid].Add(uid); 132 | visited_uids.Add(uid); 133 | } 134 | } 135 | } 136 | 137 | 138 | foreach (var pair in iid2uids) 139 | { 140 | outputRow[0].Set(pair.Key); 141 | string res = ""; 142 | foreach (var tuid in pair.Value) 143 | { 144 | res += "," + tuid; 145 | } 146 | if (res.Length <= 0) 147 | { 148 | res = ", "; 149 | } 150 | outputRow[1].Set(res.Substring(1)); 151 | yield return outputRow; 152 | } 153 | 154 | } 155 | } 156 | 157 | 158 | public class TopKSelector : Reducer 159 | { 160 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input) 161 | { 162 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestdColumns: {0}", string.Join(",", requestedColumns)); 163 | return new Schema( 164 | "UserId:string,ItemId:string,Probability:float,holder:string" 165 | ); 166 | } 167 | 168 | public override IEnumerable Reduce(RowSet input, Row outputRow, string[] args) 169 | { 170 | int topk = 3000; 171 | 172 | string iid = ""; 173 | 174 | List> uid_score_list = new List>(); 175 | 176 | 177 | foreach (Row row in input.Rows) 178 | { 179 | iid = row[1].String; 180 | 181 | string uid = row[0].String; 182 | 183 | float score = row[2].Float; 184 | 185 | uid_score_list.Add(new Tuple(uid, score)); 186 | } 187 | 188 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 189 | int k = Math.Min(topk, uid_score_list.Count); 190 | 191 | 192 | for (int i = 0; i < k; i++) 193 | { 194 | outputRow[0].Set(uid_score_list[i].Item1); 195 | outputRow[1].Set(iid); 196 | outputRow[2].Set(uid_score_list[i].Item2); 197 | outputRow[3].Set("1"); 198 | yield return outputRow; 199 | } 200 | 201 | } 202 | } 203 | 204 | public class TopKSelectorUserSide : Reducer 205 | { 206 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input) 207 | { 208 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestdColumns: {0}", string.Join(",", requestedColumns)); 209 | return new Schema( 210 | "UserId:string,ItemId:string,Probability:float" 211 | ); 212 | } 213 | 214 | public override IEnumerable Reduce(RowSet input, Row outputRow, string[] args) 215 | { 216 | int topk = 500; 217 | 218 | string uid = ""; 219 | 220 | List> iid_score_list = new List>(); 221 | 222 | 223 | foreach (Row row in input.Rows) 224 | { 225 | uid = row[0].String; 226 | 227 | string iid = row[1].String; 228 | 229 | float score = row[2].Float; 230 | 231 | iid_score_list.Add(new Tuple(iid, score)); 232 | } 233 | 234 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2)); 235 | int k = Math.Min(topk, iid_score_list.Count); 236 | 237 | 238 | for (int i = 0; i < k; i++) 239 | { 240 | outputRow[1].Set(iid_score_list[i].Item1); 241 | outputRow[0].Set(uid); 242 | outputRow[2].Set(iid_score_list[i].Item2); 243 | yield 
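// Design note: TopKSelector above tags every surviving row with the constant holder="1" so that
// the following "REDUCE ... ON holder" collapses all candidates into a single reducer group,
// where OnlineSubmissionFormater performs the greedy global assignment (each user is granted to
// at most one item, highest score first). A hypothetical trace of that greedy rule:
//   sorted triples: (u1,i1,0.9) (u1,i2,0.8) (u2,i2,0.7)
//   -> u1 goes to i1; (u1,i2) is skipped because u1 is already taken; u2 goes to i2.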
204 | public class TopKSelectorUserSide : Reducer
205 | {
206 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
207 | {
208 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
209 | return new Schema(
210 | "UserId:string,ItemId:string,Probability:float"
211 | );
212 | }
213 | 
214 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
215 | {
216 | int topk = 500;
217 | 
218 | string uid = "";
219 | 
220 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
221 | 
222 | 
223 | foreach (Row row in input.Rows)
224 | {
225 | uid = row[0].String;
226 | 
227 | string iid = row[1].String;
228 | 
229 | float score = row[2].Float;
230 | 
231 | iid_score_list.Add(new Tuple<string, float>(iid, score));
232 | }
233 | 
234 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
235 | int k = Math.Min(topk, iid_score_list.Count);
236 | 
237 | 
238 | for (int i = 0; i < k; i++)
239 | {
240 | outputRow[1].Set(iid_score_list[i].Item1);
241 | outputRow[0].Set(uid);
242 | outputRow[2].Set(iid_score_list[i].Item2);
243 | yield return outputRow;
244 | }
245 | 
246 | }
247 | }
248 | 
249 | 
250 | public class UserTopSecondSelector : Reducer
251 | {
252 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
253 | {
254 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
255 | return new Schema(
256 | "uid:string,iid:string,Probability:float,Place:int"
257 | );
258 | }
259 | 
260 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
261 | {
262 | int topk = int.Parse(args[0]);
263 | 
264 | string uid = "";
265 | 
266 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
267 | 
268 | 
269 | foreach (Row row in input.Rows)
270 | {
271 | uid = row[0].String;
272 | 
273 | string iid = row[1].String;
274 | 
275 | float score = row[2].Float;
276 | 
277 | iid_score_list.Add(new Tuple<string, float>(iid, score));
278 | }
279 | 
280 | if (iid_score_list.Count < 1)
281 | {
282 | outputRow[0].Set("");
283 | outputRow[1].Set("");
284 | outputRow[2].Set(0); outputRow[3].Set("0");
285 | yield return outputRow;
286 | }
287 | else
288 | {
289 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
290 | for (int i = 0; i < Math.Min(topk, iid_score_list.Count); i++) // guard: a user may have fewer than topk candidates
291 | {
292 | outputRow[0].Set(uid);
293 | outputRow[1].Set(iid_score_list[i].Item1);
294 | outputRow[2].Set(iid_score_list[i].Item2);
295 | outputRow[3].Set((i + 1).ToString());
296 | yield return outputRow;
297 | }
298 | 
299 | }
300 | 
301 | }
302 | }
303 | 
-------------------------------------------------------------------------------- /models/TEST_TrainModel_Pipeline_cls_tlc3_sparse.script: --------------------------------------------------------------------------------
1 | //Script GUID:ecd83686-8506-40c3-bc1c-85566fe24fca
2 | //Used for tracking history
3 | 
4 | // shares/CML.TLC/TLC/TLC_3.6.65.0/
5 | 
6 | 
7 | #DECLARE model_type string = "FT"; // LR
8 | 
9 | 
10 | 
11 | #IF(@model_type == "FT")
12 | #DECLARE out_tag string = "_FT_L300_T300_" ;
13 | #DECLARE trainerArguments string = "tr=FastTreeBinaryClassification {lr=0.05 nl=300 mil=50 iter=300 } "; //ff=0.7
14 | #ELSEIF(@model_type=="LR")
15 | #DECLARE out_tag string = "_LR_" ;
16 | #DECLARE trainerArguments string = "tr=lr{optTol=1e-5} "; // -5
17 | #ELSEIF(@model_type=="XG")
18 | #DECLARE out_tag string = "_XG_" ;
19 | #DECLARE trainerArguments string = "tr=XGBoostBinary {iter=50}";
20 | #ELSEIF(@model_type=="GBM")
21 | #DECLARE out_tag string = "_GBM_" ;
22 | #DECLARE trainerArguments string = "tr=LightGBMBinary {iter=300 lr=0.05 nl=300 mil=50} ";
23 | #ELSEIF(@model_type=="SVM")
24 | #DECLARE out_tag string = "_SVM_" ;
25 | #DECLARE trainerArguments string = "tr=LinearSVM{lambda=0.0001539926 iter=300 initwts=1}";
26 | #ENDIF
27 | 
28 | 
29 | #DECLARE TrainIn string = "local/users/v-lianji/camera_ready/train-val-online/overfitting/train02_overfitting_highdim_big5.svmlight.csv" ;
30 | 
31 | #DECLARE ModelOut string = "local/users/v-lianji/camera_ready/train-val-online/overfitting/FT_p50_big5_haslatlng.zip";
32 | 
33 | #DECLARE NPartitions int = 50; //35
34 | 
35 | 
36 | //// for cosmos09 TLC3.7
37 | #IF(EXISTS("local/users/v-lianji/TLC3.7/Tlc3Scope.module"))
38 | MODULE "local/users/v-lianji/TLC3.7/Tlc3Scope.module" AS Tlc3Learner;
39 | RESOURCE @"local/users/v-lianji/TLC3.7/Microsoft.MachineLearning.Garage.dll";
40 | #ELSE
41 | /// cosmos14
42 | MODULE "/shares/CML.TLC/TLC/TLC-3.7.162.86/Tlc3Scope.module" AS Tlc3Learner;
43 | RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/Microsoft.MachineLearning.Garage.dll";
44 | RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/libxgboost.dll";
45 | RESOURCE 
@"/shares/CML.TLC/TLC/TLC-3.7.162.86/Microsoft.MachineLearning.XGBoost.dll"; 46 | #ENDIF 47 | 48 | 49 | trainData = 50 | EXTRACT Line : string 51 | FROM @TrainIn 52 | USING DefaultTextExtractor("-d", "\n"); 53 | 54 | ////////////////////////////////// 55 | //////////trainData02 = 56 | ////////// EXTRACT Line : string 57 | ////////// FROM @"my/RecSys2017/pipeline/offline/train02_highdim_nouid.svmlight.csv" 58 | ////////// USING DefaultTextExtractor("-d", "\n"); 59 | ////////// 60 | ////////// 61 | //////////trainData = 62 | ////////// SELECT * 63 | ////////// FROM trainData 64 | ////////// UNION ALL 65 | ////////// SELECT * 66 | ////////// FROM trainData02; 67 | ///////////////////////////// 68 | ////////// 69 | 70 | 71 | ////////////////////////////////////////////////////////////////////////////////////// 72 | //predicted_data = 73 | // EXTRACT id : string, 74 | // label : int, 75 | // score : float, 76 | // prob : float 77 | // FROM "local/users/v-lianji/camera_ready/train-val-online/LR_2_stage/train02_highdim.pred.tsv" 78 | // USING DefaultTextExtractor(delimiter: '\t'); 79 | //predicted_data = 80 | // SELECT id 81 | // FROM predicted_data 82 | // WHERE label == 1 OR prob >= 0.05 83 | // GROUP BY id; 84 | // 85 | //trainData = 86 | // SELECT Line, 87 | // Line.Substring(Line.IndexOf("#") + 1) AS id 88 | // FROM trainData; 89 | //trainData = 90 | // SELECT a.Line 91 | // FROM trainData AS a 92 | // INNER JOIN 93 | // predicted_data AS b 94 | // ON a.id == b.id; 95 | 96 | 97 | ////////////////////////////////////////////////////////////////////////////////////// 98 | 99 | //// shuffle training data 100 | trainData = 101 | SELECT MyHelper.GetUserIdFromLast(Line) AS uid, //MyHelper.GetRandomInt(10000) AS uid, // 102 | Line 103 | FROM trainData; 104 | 105 | trainData = 106 | REDUCE trainData 107 | ON uid 108 | USING ShuffleLinesReducer(); 109 | /// end of shuffle training data 110 | 111 | models = Tlc3Learner.Train 112 | ( 113 | TrainingData = trainData, 114 | numberOfDataPartitions = @NPartitions, 115 | 116 | loaderArguments = "group={} loader=SvmLightLoader{size=28000} ", //28000 98000 5400 117 | 118 | transforms = 119 | 120 | "skipCheck=+ ", 121 | 122 | trainerArguments = @trainerArguments 123 | 124 | ,combinerArguments = "c=ParallelEnsemble{oc=Median} " 125 | ); 126 | 127 | 128 | 129 | // Output the combined model. 
130 | OUTPUT models.TrainedModel
131 | TO @ModelOut
132 | USING TlcModelOutputter;
133 | 
-------------------------------------------------------------------------------- /models/TEST_TrainModel_Pipeline_cls_tlc3_sparse.script.cs: --------------------------------------------------------------------------------
1 | using Microsoft.SCOPE.Types;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Text;
6 | using ScopeRuntime;
7 | 
8 | public static class MyHelper
9 | {
10 | public static Random rng = new Random();
11 | public static string GetUserId(string str)
12 | {
13 | int idx = str.IndexOf(",");
14 | return str.Substring(0, idx);
15 | }
16 | 
17 | public static string GetItemId(string str)
18 | {
19 | int idx = str.IndexOf(",");
20 | return str.Substring(idx + 1);
21 | }
22 | 
23 | 
24 | public static string GetUserIdFromLast(string str)
25 | {
26 | int idx = str.IndexOf("#"); // the uid follows '#' at the end of each svmlight line
27 | return str.Substring(idx + 1).Split(',')[0];
28 | }
29 | 
30 | public static int GetRandomInt(int k)
31 | {
32 | return rng.Next(k);
33 | }
34 | }
35 | 
36 | 
37 | 
38 | public class SubmissionFormater : Reducer
39 | {
40 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
41 | {
42 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
43 | return new Schema(
44 | "ItemId:string,TopUserId:string"
45 | );
46 | }
47 | 
48 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
49 | {
50 | int topk = 250;
51 | 
52 | string iid = "";
53 | 
54 | List<Tuple<string, float>> uid_score_list = new List<Tuple<string, float>>();
55 | 
56 | 
57 | foreach (Row row in input.Rows)
58 | {
59 | iid = row[1].String;
60 | 
61 | string uid = row[0].String;
62 | 
63 | float score = row[2].Float;
64 | 
65 | uid_score_list.Add(new Tuple<string, float>(uid, score));
66 | }
67 | 
68 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
69 | int k = Math.Min(topk, uid_score_list.Count);
70 | 
71 | string value = "";
72 | for (int i = 0; i < k; i++)
73 | {
74 | value += "," + uid_score_list[i].Item1;
75 | }
76 | 
77 | if (value.Length > 0)
78 | {
79 | outputRow[0].Set(iid);
80 | outputRow[1].Set(value.Substring(1));
81 | }
82 | else
83 | {
84 | outputRow[0].Set(iid);
85 | outputRow[1].Set("");
86 | }
87 | 
88 | yield return outputRow;
89 | 
90 | }
91 | }
92 | 
93 | 
94 | 
95 | 
96 | public class OnlineSubmissionFormater : Reducer
97 | {
98 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
99 | {
100 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
101 | return new Schema(
102 | "ItemId:string,TopUserId:string"
103 | );
104 | }
105 | 
106 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
107 | {
108 | int topk = 250;
109 | 
110 | string iid = "";
111 | string uid = "";
112 | float score = 0;
113 | 
114 | List<Tuple<string, string, float>> score_list = new List<Tuple<string, string, float>>();
115 | 
116 | 
117 | foreach (Row row in input.Rows)
118 | {
119 | iid = row[1].String;
120 | 
121 | uid = row[0].String;
122 | 
123 | score = row[2].Float;
124 | 
125 | score_list.Add(new Tuple<string, string, float>(uid, iid, score));
126 | }
127 | 
128 | score_list.Sort((a, b) => b.Item3.CompareTo(a.Item3));
129 | 
130 | Dictionary<string, List<string>> iid2uids = new Dictionary<string, List<string>>();
131 | HashSet<string> visited_uids = new HashSet<string>();
132 | foreach (var tuple in score_list)
133 | {
134 | uid = tuple.Item1;
135 | iid = tuple.Item2;
136 | if (!visited_uids.Contains(uid))
137 | {
138 | if (!iid2uids.ContainsKey(iid))
139 | {
140 | iid2uids.Add(iid, new List<string>());
141 | }
142 | if (iid2uids[iid].Count < topk)
143 | {
144 | iid2uids[iid].Add(uid);
145 | visited_uids.Add(uid);
146 | }
147 | }
148 | }
149 | 
150 | 
151 | foreach (var pair in iid2uids)
152 | {
153 | outputRow[0].Set(pair.Key);
154 | string res = "";
155 | foreach (var tuid in pair.Value)
156 | {
157 | res += "," + tuid;
158 | }
159 | if (res.Length <= 0)
160 | {
161 | res = ", ";
162 | }
163 | outputRow[1].Set(res.Substring(1));
164 | yield return outputRow;
165 | }
166 | 
167 | }
168 | }
169 | 
170 | 
171 | public class TopKSelector : Reducer
172 | {
173 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
174 | {
175 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
176 | return new Schema(
177 | "UserId:string,ItemId:string,Probability:float,holder:string"
178 | );
179 | }
180 | 
181 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
182 | {
183 | int topk = 1000;
184 | 
185 | string iid = "";
186 | 
187 | List<Tuple<string, float>> uid_score_list = new List<Tuple<string, float>>();
188 | 
189 | 
190 | foreach (Row row in input.Rows)
191 | {
192 | iid = row[1].String;
193 | 
194 | string uid = row[0].String;
195 | 
196 | float score = row[2].Float;
197 | 
198 | uid_score_list.Add(new Tuple<string, float>(uid, score));
199 | }
200 | 
201 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
202 | int k = Math.Min(topk, uid_score_list.Count);
203 | 
204 | 
205 | for (int i = 0; i < k; i++)
206 | {
207 | outputRow[0].Set(uid_score_list[i].Item1);
208 | outputRow[1].Set(iid);
209 | outputRow[2].Set(uid_score_list[i].Item2);
210 | outputRow[3].Set("1");
211 | yield return outputRow;
212 | }
213 | 
214 | }
215 | }
216 | 
217 | public class TopKSelectorUserSide : Reducer
218 | {
219 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
220 | {
221 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
222 | return new Schema(
223 | "UserId:string,ItemId:string,Probability:float"
224 | );
225 | }
226 | 
227 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
228 | {
229 | int topk = 500;
230 | 
231 | string uid = "";
232 | 
233 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
234 | 
235 | 
236 | foreach (Row row in input.Rows)
237 | {
238 | uid = row[0].String;
239 | 
240 | string iid = row[1].String;
241 | 
242 | float score = row[2].Float;
243 | 
244 | iid_score_list.Add(new Tuple<string, float>(iid, score));
245 | }
246 | 
247 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
248 | int k = Math.Min(topk, iid_score_list.Count);
249 | 
250 | 
251 | for (int i = 0; i < k; i++)
252 | {
253 | outputRow[1].Set(iid_score_list[i].Item1);
254 | outputRow[0].Set(uid);
255 | outputRow[2].Set(iid_score_list[i].Item2);
256 | yield return outputRow;
257 | }
258 | 
259 | }
260 | }
261 | 
262 | 
263 | public class UserTopSecondSelector : Reducer
264 | {
265 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
266 | {
267 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
268 | return new Schema(
269 | "uid:string,iid:string,Probability:float,Place:int"
270 | );
271 | }
272 | 
273 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
274 | {
275 | int topk = int.Parse(args[0]);
276 | 
277 | string uid = "";
278 | 
279 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
280 | 
281 | 
282 | foreach (Row row in input.Rows)
283 | {
284 | uid = row[0].String;
285 | 
286 | string iid = row[1].String;
287 | 
288 | float score = row[2].Float;
289 | 
290 | iid_score_list.Add(new Tuple<string, float>(iid, score));
291 | }
292 | 
293 | if (iid_score_list.Count < 1)
294 | {
295 | outputRow[0].Set("");
296 | outputRow[1].Set("");
297 | outputRow[2].Set(0); outputRow[3].Set("0");
298 | yield return outputRow;
299 | }
300 | else
301 | {
302 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
303 | for (int i = 0; i < Math.Min(topk, iid_score_list.Count); i++) // guard: a user may have fewer than topk candidates
304 | {
305 | outputRow[0].Set(uid);
306 | outputRow[1].Set(iid_score_list[i].Item1);
307 | outputRow[2].Set(iid_score_list[i].Item2);
308 | outputRow[3].Set((i + 1).ToString());
309 | yield return outputRow;
310 | }
311 | 
312 | }
313 | 
314 | }
315 | }
316 | 
317 | 
318 | public class ShuffleLinesReducer : Reducer
319 | {
320 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
321 | {
322 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns))<br/>;
323 | return new Schema(
324 | "Line:string"
325 | );
326 | }
327 | 
328 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
329 | {
330 | List<string> lines = new List<string>();
331 | 
332 | 
333 | foreach (Row row in input.Rows)
334 | {
335 | lines.Add(row[1].String);
336 | }
337 | string[] array = lines.ToArray();
338 | 
339 | int n = array.Length;
340 | Random rng = new Random();
341 | 
342 | while (n > 1) // in-place Fisher-Yates shuffle
343 | {
344 | int k = rng.Next(n--);
345 | string tmp = array[n];
346 | array[n] = array[k];
347 | array[k] = tmp;
348 | }
349 | 
350 | n = array.Length;
351 | if (n > 0)
352 | {
353 | for (int i = 0; i < n; i++)
354 | {
355 | outputRow[0].Set(array[i]);
356 | yield return outputRow;
357 | }
358 | }
359 | 
360 | }
361 | }
362 | 
-------------------------------------------------------------------------------- /models/TEST_tmp_Location_ExtractFeatures.script.cs: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /models/ensemble-2stage.script: --------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | /////////////////////// ensemble //////////////////////////////
5 | stage1_preds =
6 | EXTRACT InstanceIndex : string,
7 | label : string,
8 | output : float,
9 | Probability : float
10 | FROM @"my/RecSys2017/pipeline/LR/results/LR-test_complete_0_highdim.predictions.tsv"
11 | USING DefaultTextExtractor(delimiter: '\t');
12 | stage1_preds =
13 | SELECT MyHelper.GetUserId(InstanceIndex) AS uid,
14 | MyHelper.GetItemId(InstanceIndex) AS iid,
15 | Probability
16 | FROM stage1_preds
17 | WHERE Probability > 0.2;
18 | 
19 | //
20 | //stage1_preds =
21 | //REDUCE stage1_preds
22 | //ON uid
23 | //USING TopInstanceSelection();
24 | //
25 | 
26 | stage2_preds =
27 | EXTRACT InstanceIndex : string,
28 | label : string,
29 | output : float,
30 | Probability : float
31 | FROM @"local/users/v-lianji/RecSys2017/big_candidates/goodmodels/results/FT-bigneg0.5-300-300-p30.predictions.tsv"
32 | USING DefaultTextExtractor(delimiter: '\t');
33 | stage2_preds =
34 | SELECT MyHelper.GetUserId(InstanceIndex) AS uid,
35 | MyHelper.GetItemId(InstanceIndex) AS iid,
36 | Probability FROM stage2_preds;
37 | 
38 | ensemble_predictions =
39 | SELECT a.uid,
40 | a.iid,
41 | a.Probability
42 | FROM stage2_preds AS a
43 | INNER JOIN
44 | stage1_preds AS b
45 | ON a.uid == b.uid AND a.iid == b.iid;
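// Two-stage ensemble semantics: stage 1 (an LR model) acts as a high-recall filter, keeping only
// (uid, iid) pairs whose stage-1 Probability exceeds 0.2, while stage 2 (a FastTree model)
// supplies the score that is actually submitted; the INNER JOIN above retains exactly the
// stage-2 scores of the pairs that survived the stage-1 filter. Hypothetical illustration:
//   stage1: (u1,i1,0.35) (u1,i2,0.10)  -> only (u1,i1) clears the 0.2 threshold
//   stage2: (u1,i1,0.81) (u1,i2,0.95)  -> the ensemble keeps (u1,i1,0.81) and drops (u1,i2)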
46 | //
47 | //
48 | //ensemble_predictions =
49 | // SELECT a.uid,
50 | // a.iid,
51 | // a.Probability AS Probability
52 | // FROM stage2_preds AS b
53 | // INNER JOIN
54 | // stage1_preds AS a
55 | // ON a.uid == b.uid AND a.iid == b.iid;
56 | //
57 | 
58 | 
59 | REDUCE ensemble_predictions
60 | ON iid
61 | USING SubmissionFormater();
62 | 
63 | OUTPUT
64 | TO @"my/RecSys2017/pipeline/LR/results/ensemble/FT-p10-T300-L300-0.5-submit-v0.tsv"
65 | USING DefaultTextOutputter(delimiter: '\t');
66 | 
67 | 
68 | ////// since each user can receive at most one recommendation
69 | tpreds =
70 | REDUCE ensemble_predictions
71 | ON iid
72 | USING TopKSelector();
73 | 
74 | REDUCE tpreds
75 | ON holder
76 | USING OnlineSubmissionFormater();
77 | 
78 | OUTPUT
79 | TO @"my/RecSys2017/pipeline/LR/results/ensemble/FT-p10-T300-L300-0.5-submit.tsv"
80 | USING DefaultTextOutputter(delimiter: '\t');
81 | 
-------------------------------------------------------------------------------- /models/ensemble-2stage.script.cs: --------------------------------------------------------------------------------
1 | using Microsoft.SCOPE.Types;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Text;
6 | using ScopeRuntime;
7 | 
8 | 
9 | 
10 | public static class MyHelper
11 | {
12 | public static string GetUserId(string str)
13 | {
14 | int idx = str.IndexOf(",");
15 | return str.Substring(0, idx);
16 | }
17 | 
18 | public static string GetItemId(string str)
19 | {
20 | int idx = str.IndexOf(",");
21 | return str.Substring(idx + 1);
22 | }
23 | }
24 | 
25 | 
26 | 
27 | public class TopInstanceSelection : Reducer
28 | {
29 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
30 | {
31 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
32 | return new Schema(
33 | "uid:string,iid:string,Probability:float"
34 | );
35 | }
36 | 
37 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
38 | {
39 | int topk = 100;
40 | 
41 | string uid = "";
42 | 
43 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
44 | 
45 | 
46 | foreach (Row row in input.Rows)
47 | {
48 | uid = row[0].String;
49 | 
50 | string iid = row[1].String;
51 | 
52 | float score = row[2].Float;
53 | if (score > 0.6)
54 | iid_score_list.Add(new Tuple<string, float>(iid, score));
55 | }
56 | 
57 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
58 | int k = Math.Min(topk, iid_score_list.Count);
59 | 
60 | for (int i = 0; i < k; i++)
61 | {
62 | outputRow[1].Set(iid_score_list[i].Item1);
63 | outputRow[0].Set(uid);
64 | outputRow[2].Set(iid_score_list[i].Item2);
65 | yield return outputRow;
66 | }
67 | 
68 | }
69 | }
70 | 
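// Note: when the commented-out "REDUCE stage1_preds ON uid USING TopInstanceSelection()" path
// in the script is enabled, stage-1 candidates are pruned twice: a row must have score > 0.6 to
// enter the list at all, and each user then keeps only the top 100 items by score. Both
// constants are hard-coded here; loosening them widens the pool that stage 2 may rescore.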
71 | public class SubmissionFormater : Reducer
72 | {
73 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
74 | {
75 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
76 | return new Schema(
77 | "ItemId:string,TopUserId:string"
78 | );
79 | }
80 | 
81 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
82 | {
83 | int topk = 250;
84 | 
85 | string iid = "";
86 | 
87 | List<Tuple<string, float>> uid_score_list = new List<Tuple<string, float>>();
88 | 
89 | 
90 | foreach (Row row in input.Rows)
91 | {
92 | iid = row[1].String;
93 | 
94 | string uid = row[0].String;
95 | 
96 | float score = row[2].Float;
97 | 
98 | uid_score_list.Add(new Tuple<string, float>(uid, score));
99 | }
100 | 
101 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
102 | int k = Math.Min(topk, uid_score_list.Count);
103 | 
104 | string value = "";
105 | for (int i = 0; i < k; i++)
106 | {
107 | value += "," + uid_score_list[i].Item1;
108 | }
109 | 
110 | if (value.Length > 0)
111 | {
112 | outputRow[0].Set(iid);
113 | outputRow[1].Set(value.Substring(1));
114 | }
115 | else
116 | {
117 | outputRow[0].Set(iid);
118 | outputRow[1].Set("");
119 | }
120 | 
121 | yield return outputRow;
122 | 
123 | }
124 | }
125 | 
126 | 
127 | 
128 | 
129 | public class OnlineSubmissionFormater : Reducer
130 | {
131 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
132 | {
133 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
134 | return new Schema(
135 | "ItemId:string,TopUserId:string"
136 | );
137 | }
138 | 
139 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
140 | {
141 | int topk = 250;
142 | 
143 | string iid = "";
144 | string uid = "";
145 | float score = 0;
146 | 
147 | List<Tuple<string, string, float>> score_list = new List<Tuple<string, string, float>>();
148 | 
149 | 
150 | foreach (Row row in input.Rows)
151 | {
152 | iid = row[1].String;
153 | 
154 | uid = row[0].String;
155 | 
156 | score = row[2].Float;
157 | 
158 | score_list.Add(new Tuple<string, string, float>(uid, iid, score));
159 | }
160 | 
161 | score_list.Sort((a, b) => b.Item3.CompareTo(a.Item3));
162 | 
163 | Dictionary<string, List<string>> iid2uids = new Dictionary<string, List<string>>();
164 | HashSet<string> visited_uids = new HashSet<string>();
165 | foreach (var tuple in score_list)
166 | {
167 | uid = tuple.Item1;
168 | iid = tuple.Item2;
169 | if (!visited_uids.Contains(uid))
170 | {
171 | if (!iid2uids.ContainsKey(iid))
172 | {
173 | iid2uids.Add(iid, new List<string>());
174 | }
175 | if (iid2uids[iid].Count < topk)
176 | {
177 | iid2uids[iid].Add(uid);
178 | visited_uids.Add(uid);
179 | }
180 | }
181 | }
182 | 
183 | 
184 | foreach (var pair in iid2uids)
185 | {
186 | outputRow[0].Set(pair.Key);
187 | string res = "";
188 | foreach (var tuid in pair.Value)
189 | {
190 | res += "," + tuid;
191 | }
192 | if (res.Length <= 0)
193 | {
194 | res = ", ";
195 | }
196 | outputRow[1].Set(res.Substring(1));
197 | yield return outputRow;
198 | }
199 | 
200 | }
201 | }
202 | 
203 | 
204 | public class TopKSelector : Reducer
205 | {
206 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
207 | {
208 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
209 | return new Schema(
210 | "UserId:string,ItemId:string,Probability:float,holder:string"
211 | );
212 | }
213 | 
214 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
215 | {
216 | int topk = 3000;
217 | 
218 | string iid = "";
219 | 
220 | List<Tuple<string, float>> uid_score_list = new List<Tuple<string, float>>();
221 | 
222 | 
223 | foreach (Row row in input.Rows)
224 | {
225 | iid = row[1].String;
226 | 
227 | string uid = row[0].String;
228 | 
229 | float score = row[2].Float;
230 | 
231 | uid_score_list.Add(new Tuple<string, float>(uid, score));
232 | }
233 | 
234 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
235 | int k = Math.Min(topk, uid_score_list.Count);
236 | 
237 | 
238 | for (int i = 0; i < k; i++)
239 | {
240 | outputRow[0].Set(uid_score_list[i].Item1);
241 | outputRow[1].Set(iid);
242 | outputRow[2].Set(uid_score_list[i].Item2);
243 | outputRow[3].Set("1");
244 | yield return outputRow;
245 | }
246 | 
247 | }
248 | }
249 | 
250 | public class TopKSelectorUserSide : Reducer
251 | {
252 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
253 | {
254 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
255 | return new Schema(
256 | "UserId:string,ItemId:string,Probability:float"
257 | );
258 | }
259 | 
260 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
261 | {
262 | int topk = 500;
263 | 
264 | string uid = "";
265 | 
266 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
267 | 
268 | 
269 | foreach (Row row in input.Rows)
270 | {
271 | uid = row[0].String;
272 | 
273 | string iid = row[1].String;
274 | 
275 | float score = row[2].Float;
276 | 
277 | iid_score_list.Add(new Tuple<string, float>(iid, score));
278 | }
279 | 
280 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
281 | int k = Math.Min(topk, iid_score_list.Count);
282 | 
283 | 
284 | for (int i = 0; i < k; i++)
285 | {
286 | outputRow[1].Set(iid_score_list[i].Item1);
287 | outputRow[0].Set(uid);
288 | outputRow[2].Set(iid_score_list[i].Item2);
289 | yield return outputRow;
290 | }
291 | 
292 | }
293 | }
294 | 
295 | 
296 | public class UserTopSecondSelector : Reducer
297 | {
298 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
299 | {
300 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
301 | return new Schema(
302 | "uid:string,iid:string,Probability:float,Place:int"
303 | );
304 | }
305 | 
306 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
307 | {
308 | int topk = int.Parse(args[0]);
309 | 
310 | string uid = "";
311 | 
312 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
313 | 
314 | 
315 | foreach (Row row in input.Rows)
316 | {
317 | uid = row[0].String;
318 | 
319 | string iid = row[1].String;
320 | 
321 | float score = row[2].Float;
322 | 
323 | iid_score_list.Add(new Tuple<string, float>(iid, score));
324 | }
325 | 
326 | if (iid_score_list.Count < 1)
327 | {
328 | outputRow[0].Set("");
329 | outputRow[1].Set("");
330 | outputRow[2].Set(0); outputRow[3].Set("0");
331 | yield return outputRow;
332 | }
333 | else
334 | {
335 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
336 | for (int i = 0; i < Math.Min(topk, iid_score_list.Count); i++) // guard: a user may have fewer than topk candidates
337 | {
338 | outputRow[0].Set(uid);
339 | outputRow[1].Set(iid_score_list[i].Item1);
340 | outputRow[2].Set(iid_score_list[i].Item2);
341 | outputRow[3].Set((i + 1).ToString());
342 | yield return outputRow;
343 | }
344 | 
345 | }
346 | 
347 | }
348 | }
349 | 
-------------------------------------------------------------------------------- /models/ensemble.script: --------------------------------------------------------------------------------
1 | 
2 | 
3 | lr_data00 =
4 | EXTRACT InstanceIndex : string,
5 | label : string,
6 | output : float,
7 | Probability : float
8 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.00.tsv"
9 | USING DefaultTextExtractor(delimiter: '\t');
10 | 
11 | lr_data01 =
12 | EXTRACT InstanceIndex : string,
13 | label : string,
14 | output : float,
15 | Probability : float
16 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.01.tsv"
17 | USING DefaultTextExtractor(delimiter: '\t');
18 | 
19 | lr_data02 =
20 | EXTRACT InstanceIndex : string,
21 | label : string,
22 | output : float,
23 | Probability : float
24 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.02.tsv"
25 | USING DefaultTextExtractor(delimiter: '\t');
26 | lr_data03 =
27 | EXTRACT InstanceIndex : string,
28 | label : string,
29 | output : float,
30 | Probability : float
31 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.03.tsv"
32 | USING DefaultTextExtractor(delimiter: '\t');
33 | lr_data04 =
34 | EXTRACT InstanceIndex : 
string, 35 | label : string, 36 | output : float, 37 | Probability : float 38 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.04.tsv" 39 | USING DefaultTextExtractor(delimiter: '\t'); 40 | lr_data05 = 41 | EXTRACT InstanceIndex : string, 42 | label : string, 43 | output : float, 44 | Probability : float 45 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.05.tsv" 46 | USING DefaultTextExtractor(delimiter: '\t'); 47 | lr_data06 = 48 | EXTRACT InstanceIndex : string, 49 | label : string, 50 | output : float, 51 | Probability : float 52 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.06.tsv" 53 | USING DefaultTextExtractor(delimiter: '\t'); 54 | lr_data07 = 55 | EXTRACT InstanceIndex : string, 56 | label : string, 57 | output : float, 58 | Probability : float 59 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.07.tsv" 60 | USING DefaultTextExtractor(delimiter: '\t'); 61 | lr_data08 = 62 | EXTRACT InstanceIndex : string, 63 | label : string, 64 | output : float, 65 | Probability : float 66 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.08.tsv" 67 | USING DefaultTextExtractor(delimiter: '\t'); 68 | lr_data09 = 69 | EXTRACT InstanceIndex : string, 70 | label : string, 71 | output : float, 72 | Probability : float 73 | FROM @"local/users/v-lianji/camera_ready/results/FT-400-p50_offline/FT.predictions.09.tsv" 74 | USING DefaultTextExtractor(delimiter: '\t'); 75 | 76 | ft_offline_data = SELECT * 77 | FROM lr_data00 78 | UNION ALL 79 | SELECT * 80 | FROM lr_data01 81 | UNION ALL 82 | SELECT * 83 | FROM lr_data02 84 | UNION ALL 85 | SELECT * 86 | FROM lr_data03 87 | UNION ALL 88 | SELECT * 89 | FROM lr_data04 90 | UNION ALL 91 | SELECT * 92 | FROM lr_data05 93 | UNION ALL 94 | SELECT * 95 | FROM lr_data06 96 | UNION ALL 97 | SELECT * 98 | FROM lr_data07 99 | UNION ALL 100 | SELECT * 101 | FROM lr_data08 102 | UNION ALL 103 | SELECT * 104 | FROM lr_data09; 105 | 106 | 107 | 108 | ft_data00 = 109 | EXTRACT InstanceIndex : string, 110 | label : string, 111 | output : float, 112 | Probability : float 113 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.00.tsv" 114 | USING DefaultTextExtractor(delimiter: '\t'); 115 | 116 | ft_data01 = 117 | EXTRACT InstanceIndex : string, 118 | label : string, 119 | output : float, 120 | Probability : float 121 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.01.tsv" 122 | USING DefaultTextExtractor(delimiter: '\t'); 123 | 124 | ft_data02 = 125 | EXTRACT InstanceIndex : string, 126 | label : string, 127 | output : float, 128 | Probability : float 129 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.02.tsv" 130 | USING DefaultTextExtractor(delimiter: '\t'); 131 | ft_data03 = 132 | EXTRACT InstanceIndex : string, 133 | label : string, 134 | output : float, 135 | Probability : float 136 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.03.tsv" 137 | USING DefaultTextExtractor(delimiter: '\t'); 138 | ft_data04 = 139 | EXTRACT InstanceIndex : string, 140 | label : string, 141 | output : float, 142 | Probability : float 143 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.04.tsv" 144 | USING DefaultTextExtractor(delimiter: '\t'); 145 | ft_data05 = 146 | EXTRACT InstanceIndex : string, 147 | label : string, 148 | 
output : float, 149 | Probability : float 150 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.05.tsv" 151 | USING DefaultTextExtractor(delimiter: '\t'); 152 | ft_data06 = 153 | EXTRACT InstanceIndex : string, 154 | label : string, 155 | output : float, 156 | Probability : float 157 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.06.tsv" 158 | USING DefaultTextExtractor(delimiter: '\t'); 159 | ft_data07 = 160 | EXTRACT InstanceIndex : string, 161 | label : string, 162 | output : float, 163 | Probability : float 164 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.07.tsv" 165 | USING DefaultTextExtractor(delimiter: '\t'); 166 | ft_data08 = 167 | EXTRACT InstanceIndex : string, 168 | label : string, 169 | output : float, 170 | Probability : float 171 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.08.tsv" 172 | USING DefaultTextExtractor(delimiter: '\t'); 173 | ft_data09 = 174 | EXTRACT InstanceIndex : string, 175 | label : string, 176 | output : float, 177 | Probability : float 178 | FROM @"local/users/v-lianji/camera_ready/results/FT-L300_T400_online/FT.predictions.09.tsv" 179 | USING DefaultTextExtractor(delimiter: '\t'); 180 | 181 | ft_online_data = SELECT * 182 | FROM ft_data00 183 | UNION ALL 184 | SELECT * 185 | FROM ft_data01 186 | UNION ALL 187 | SELECT * 188 | FROM ft_data02 189 | UNION ALL 190 | SELECT * 191 | FROM ft_data03 192 | UNION ALL 193 | SELECT * 194 | FROM ft_data04 195 | UNION ALL 196 | SELECT * 197 | FROM ft_data05 198 | UNION ALL 199 | SELECT * 200 | FROM ft_data06 201 | UNION ALL 202 | SELECT * 203 | FROM ft_data07 204 | UNION ALL 205 | SELECT * 206 | FROM ft_data08 207 | UNION ALL 208 | SELECT * 209 | FROM ft_data09; 210 | 211 | 212 | lr_data00 = 213 | EXTRACT InstanceIndex : string, 214 | label : string, 215 | output : float, 216 | Probability : float 217 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_0.tsv" 218 | USING DefaultTextExtractor(delimiter: '\t'); 219 | 220 | lr_data01 = 221 | EXTRACT InstanceIndex : string, 222 | label : string, 223 | output : float, 224 | Probability : float 225 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_1.tsv" 226 | USING DefaultTextExtractor(delimiter: '\t'); 227 | 228 | lr_data02 = 229 | EXTRACT InstanceIndex : string, 230 | label : string, 231 | output : float, 232 | Probability : float 233 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_2.tsv" 234 | USING DefaultTextExtractor(delimiter: '\t'); 235 | lr_data03 = 236 | EXTRACT InstanceIndex : string, 237 | label : string, 238 | output : float, 239 | Probability : float 240 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_3.tsv" 241 | USING DefaultTextExtractor(delimiter: '\t'); 242 | lr_data04 = 243 | EXTRACT InstanceIndex : string, 244 | label : string, 245 | output : float, 246 | Probability : float 247 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_4.tsv" 248 | USING DefaultTextExtractor(delimiter: '\t'); 249 | lr_data05 = 250 | EXTRACT InstanceIndex : string, 251 | label : string, 252 | output : float, 253 | Probability : float 254 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_5.tsv" 255 | USING DefaultTextExtractor(delimiter: '\t'); 256 | lr_data06 = 257 | EXTRACT InstanceIndex : string, 258 | label : string, 259 | output : float, 260 | Probability : 
float 261 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_6.tsv" 262 | USING DefaultTextExtractor(delimiter: '\t'); 263 | lr_data07 = 264 | EXTRACT InstanceIndex : string, 265 | label : string, 266 | output : float, 267 | Probability : float 268 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_7.tsv" 269 | USING DefaultTextExtractor(delimiter: '\t'); 270 | lr_data08 = 271 | EXTRACT InstanceIndex : string, 272 | label : string, 273 | output : float, 274 | Probability : float 275 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_8.tsv" 276 | USING DefaultTextExtractor(delimiter: '\t'); 277 | lr_data09 = 278 | EXTRACT InstanceIndex : string, 279 | label : string, 280 | output : float, 281 | Probability : float 282 | FROM @"local/users/v-lianji/camera_ready/has_latlng/results/FT.predictions_9.tsv" 283 | USING DefaultTextExtractor(delimiter: '\t'); 284 | 285 | lr_data = SELECT * 286 | FROM lr_data00 287 | UNION ALL 288 | SELECT * 289 | FROM lr_data01 290 | UNION ALL 291 | SELECT * 292 | FROM lr_data02 293 | UNION ALL 294 | SELECT * 295 | FROM lr_data03 296 | UNION ALL 297 | SELECT * 298 | FROM lr_data04 299 | UNION ALL 300 | SELECT * 301 | FROM lr_data05 302 | UNION ALL 303 | SELECT * 304 | FROM lr_data06 305 | UNION ALL 306 | SELECT * 307 | FROM lr_data07 308 | UNION ALL 309 | SELECT * 310 | FROM lr_data08 311 | UNION ALL 312 | SELECT * 313 | FROM lr_data09; 314 | 315 | 316 | /////////////////////// ensemble ////////////////////////////// 317 | //best_predictions = 318 | // EXTRACT InstanceIndex : string, 319 | // label : string, 320 | // output : float, 321 | // Probability : float 322 | // FROM @"my/RecSys2017/pipeline/offline/results/recsys17-FT-p50.tsv" 323 | // USING DefaultTextExtractor(delimiter: '\t'); 324 | // best_predictions = 325 | // SELECT MyHelper.GetUserId(InstanceIndex) AS uid, 326 | // MyHelper.GetItemId(InstanceIndex) AS iid, 327 | // Probability 328 | // FROM best_predictions; 329 | 330 | // 331 | //lr_top_predictions = 332 | //REDUCE lr_predictions 333 | //ON iid 334 | //USING TopInstanceSelection(); 335 | // 336 | // 337 | //subfeature02_predictions = 338 | // EXTRACT InstanceIndex : string, 339 | // label : string, 340 | // output : float, 341 | // Probability : float 342 | // FROM @"local/users/v-lianji/RecSys2017/subfeature/results/FT_L300_T300_feature02-test_complete_0_highdim.predictions.tsv" 343 | // USING DefaultTextExtractor(delimiter: '\t'); 344 | // subfeature02_predictions = 345 | // SELECT MyHelper.GetUserId(InstanceIndex) AS uid, 346 | // MyHelper.GetItemId(InstanceIndex) AS iid, 347 | // Probability 348 | // FROM subfeature02_predictions; 349 | // 350 | // 351 | //param_FT100_predictions = 352 | // EXTRACT InstanceIndex : string, 353 | // label : string, 354 | // output : float, 355 | // Probability : float 356 | // FROM @"local/users/v-lianji/RecSys2017/subparams/results/FT_L100_T100-test_complete_0_highdim.predictions.tsv" 357 | // USING DefaultTextExtractor(delimiter: '\t'); 358 | // param_FT100_predictions = 359 | // SELECT MyHelper.GetUserId(InstanceIndex) AS uid, 360 | // MyHelper.GetItemId(InstanceIndex) AS iid, 361 | // Probability 362 | // FROM param_FT100_predictions; 363 | // 364 | // 365 | // 366 | //param_FF0_7_predictions = 367 | // EXTRACT InstanceIndex : string, 368 | // label : string, 369 | // output : float, 370 | // Probability : float 371 | // FROM 
@"local/users/v-lianji/RecSys2017/subparams/results/FT_L300_T300_ff0.5-test_complete_0_highdim.predictions.tsv" 372 | // USING DefaultTextExtractor(delimiter: '\t'); 373 | // param_FF0_7_predictions = 374 | // SELECT MyHelper.GetUserId(InstanceIndex) AS uid, 375 | // MyHelper.GetItemId(InstanceIndex) AS iid, 376 | // Probability 377 | // FROM param_FF0_7_predictions; 378 | // 379 | // 380 | 381 | 382 | // 383 | //ensemble_predictions = 384 | // SELECT a.uid, 385 | // a.iid, 386 | // a.Probability AS Probability 387 | // FROM ft_predictions AS a 388 | // INNER JOIN 389 | // lr_top_predictions AS b 390 | // ON a.uid == b.uid AND a.iid == b.iid; 391 | // 392 | // 393 | //REDUCE ensemble_predictions 394 | //ON iid 395 | //USING SubmissionFormater(); 396 | // 397 | //OUTPUT 398 | //TO "my/RecSys2017/pipeline/results/ensemble-prediction-lr5000_submit.txt" 399 | //USING DefaultTextOutputter(delimiter: '\t'); 400 | /////////////////////////////////////////// 401 | 402 | #DECLARE path string = "local/users/v-lianji/offline/"; 403 | #DECLARE path_users_noheader string = @path + "users_noheader.csv"; 404 | 405 | RESOURCE @path_users_noheader; 406 | // 407 | //preds = 408 | // SELECT a.InstanceIndex, 409 | // a.Probability * 0.0 + b.Probability * 0.0 + c.Probability * 0.8 AS Probability 410 | // FROM ft_offline_data AS a 411 | // INNER JOIN 412 | // ft_online_data AS b 413 | // ON a.InstanceIndex == b.InstanceIndex 414 | // INNER JOIN 415 | // lr_data AS c 416 | // ON a.InstanceIndex == c.InstanceIndex 417 | // WHERE a.Probability > 0.05 AND b.Probability > 0.05; 418 | 419 | preds = 420 | SELECT MyHelper.GetUserId(InstanceIndex) AS uid, 421 | MyHelper.GetItemId(InstanceIndex) AS iid, 422 | Probability 423 | FROM lr_data; 424 | 425 | 426 | SELECT uid, 427 | iid, 428 | MyHelper.PromotePremiumUsers(uid, Probability, 0.001) AS Probability 429 | FROM preds; 430 | REDUCE 431 | ON iid 432 | USING SubmissionFormater(); 433 | OUTPUT 434 | TO @"local/users/v-lianji/camera_ready/results/test-FT_FT_laglng_premium0.001_submit_complete.txt" 435 | USING DefaultTextOutputter(delimiter: '\t'); 436 | 437 | 438 | SELECT uid, 439 | iid, 440 | MyHelper.PromotePremiumUsers(uid, Probability, 0.002) AS Probability 441 | FROM preds; 442 | REDUCE 443 | ON iid 444 | USING SubmissionFormater(); 445 | OUTPUT 446 | TO @"local/users/v-lianji/camera_ready/results/test-FT_FT_laglng_premium0.002_submit_complete.txt" 447 | USING DefaultTextOutputter(delimiter: '\t'); 448 | 449 | 450 | SELECT uid, 451 | iid, 452 | MyHelper.PromotePremiumUsers(uid, Probability, 0.005) AS Probability 453 | FROM preds; 454 | REDUCE 455 | ON iid 456 | USING SubmissionFormater(); 457 | OUTPUT 458 | TO @"local/users/v-lianji/camera_ready/results/test-FT_FT_laglng_premium0.005_submit_complete.txt" 459 | USING DefaultTextOutputter(delimiter: '\t'); 460 | 461 | 462 | SELECT uid, 463 | iid, 464 | MyHelper.PromotePremiumUsers(uid, Probability, 0.000) AS Probability 465 | FROM preds; 466 | REDUCE 467 | ON iid 468 | USING SubmissionFormater(); 469 | OUTPUT 470 | TO @"local/users/v-lianji/camera_ready/results/test-FT_FT_laglng_premium0.000_submit_complete.txt" 471 | USING DefaultTextOutputter(delimiter: '\t'); 472 | -------------------------------------------------------------------------------- /models/ensemble.script.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.SCOPE.Types; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Text; 6 | using ScopeRuntime; 7 | 8 | 9 
| 10 | public static class MyHelper
11 | {
12 | public static Dictionary<string, User> userdict = null;
13 | 
14 | static MyHelper()
15 | {
16 | userdict = BuildUserDict();
17 | }
18 | 
19 | public static double PromotePremiumUsers(string uid, double p, double up_ratio) // boost premium users' scores by a small multiplicative factor
20 | {
21 | if (userdict.ContainsKey(uid) && userdict[uid].premium == "1")
22 | {
23 | return p * (1 + up_ratio);
24 | }
25 | else
26 | {
27 | return p;
28 | }
29 | }
30 | 
31 | public static Dictionary<string, User> BuildUserDict()
32 | {
33 | Dictionary<string, User> res = new Dictionary<string, User>();
34 | using (StreamReader rd = new StreamReader(@"users_noheader.csv"))
35 | {
36 | string content = null;
37 | while ((content = rd.ReadLine()) != null)
38 | {
39 | if (content.StartsWith("id"))
40 | {
41 | continue;
42 | }
43 | User user = new User(content);
44 | if (!res.ContainsKey(user.id))
45 | {
46 | res.Add(user.id, user);
47 | }
48 | }
49 | }
50 | return res;
51 | }
52 | 
53 | public static string GetIDIndex(string line)
54 | {
55 | int idx = line.IndexOf("#");
56 | return line.Substring(idx + 1);
57 | }
58 | 
59 | public static string GetUserId(string str)
60 | {
61 | int idx = str.IndexOf(",");
62 | return str.Substring(0, idx);
63 | }
64 | 
65 | public static string GetItemId(string str)
66 | {
67 | int idx = str.IndexOf(",");
68 | return str.Substring(idx + 1);
69 | }
70 | }
71 | 
72 | 
73 | public class User
74 | {
75 | public string id;
76 | public HashSet<string> title;
77 | public Dictionary<string, float> title2cnt;
78 | public int title_cnt;
79 | public string clevel;
80 | public string indus;
81 | public string disc;
82 | public string country;
83 | public string region;
84 | public string experience_n_entries_class;
85 | public string experience_years_experience;
86 | public string experience_years_in_current;
87 | public string edu_degree;
88 | public HashSet<string> edu_fieldofstudies;
89 | public string wtcj;
90 | public string premium;
91 | 
92 | public List<Tuple<string, string>> interactions; // unused here; element type reconstructed
93 | public Dictionary<string, double> viewed_item_title_words; // unused here; element type reconstructed
94 | public double viewed_titem_title_cnt;
95 | 
96 | 
97 | 
98 | public User() { }
99 | public User(string line)
100 | {
101 | string[] words = line.Split('\t');
102 | 
103 | id = words[0];
104 | title = new HashSet<string>();
105 | title2cnt = new Dictionary<string, float>();
106 | title_cnt = 0;
107 | var tokens = words[1].Split(',');
108 | title_cnt = tokens.Length;
109 | foreach (var token in tokens)
110 | {
111 | title.Add(token);
112 | if (!title2cnt.ContainsKey(token))
113 | {
114 | title2cnt.Add(token, 1.0f / title_cnt);
115 | }
116 | else
117 | {
118 | title2cnt[token] += 1.0f / title_cnt;
119 | }
120 | }
121 | clevel = words[2];
122 | disc = words[3];
123 | indus = words[4];
124 | country = words[5];
125 | region = words[6];
126 | experience_n_entries_class = words[7];
127 | experience_years_experience = words[8];
128 | experience_years_in_current = words[9];
129 | edu_degree = words[10];
130 | edu_fieldofstudies = new HashSet<string>();
131 | foreach (var token in words[11].Split(','))
132 | {
133 | //if (token != "000")
134 | {
135 | edu_fieldofstudies.Add(token);
136 | }
137 | }
138 | wtcj = words[12];
139 | premium = words[13];
140 | 
141 | viewed_titem_title_cnt = 0;
142 | interactions = null;
143 | viewed_item_title_words = null;
144 | }
145 | }
146 | 
147 | 
148 | 
149 | public class TopInstanceSelection : Reducer
150 | {
151 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
152 | {
153 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
154 | return new Schema(
155 | "uid:string,iid:string,Probability:float"
156 | );
157 | }
158 | 
159 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
160 | {
161 | int topk = 5000;
162 | 
163 | string iid = "";
164 | 
165 | List<Tuple<string, float>> uid_score_list = new List<Tuple<string, float>>();
166 | 
167 | 
168 | foreach (Row row in input.Rows)
169 | {
170 | iid = row[1].String;
171 | 
172 | string uid = row[0].String;
173 | 
174 | float score = row[2].Float;
175 | 
176 | uid_score_list.Add(new Tuple<string, float>(uid, score));
177 | }
178 | 
179 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
180 | int k = Math.Min(topk, uid_score_list.Count);
181 | 
182 | for (int i = 0; i < k; i++)
183 | {
184 | outputRow[0].Set(uid_score_list[i].Item1);
185 | outputRow[1].Set(iid);
186 | outputRow[2].Set(uid_score_list[i].Item2);
187 | yield return outputRow;
188 | }
189 | 
190 | }
191 | }
192 | 
193 | 
194 | public class SubmissionFormater : Reducer
195 | {
196 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
197 | {
198 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
199 | return new Schema(
200 | "ItemId:string,TopUserId:string"
201 | );
202 | }
203 | 
204 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
205 | {
206 | int topk = 100;
207 | 
208 | string iid = "";
209 | 
210 | List<Tuple<string, float>> uid_score_list = new List<Tuple<string, float>>();
211 | 
212 | 
213 | foreach (Row row in input.Rows)
214 | {
215 | iid = row[1].String;
216 | 
217 | string uid = row[0].String;
218 | 
219 | float score = row[2].Float;
220 | 
221 | uid_score_list.Add(new Tuple<string, float>(uid, score));
222 | }
223 | 
224 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
225 | int k = Math.Min(topk, uid_score_list.Count);
226 | 
227 | string value = "";
228 | for (int i = 0; i < k; i++)
229 | {
230 | value += "," + uid_score_list[i].Item1;
231 | }
232 | 
233 | if (value.Length > 0)
234 | {
235 | outputRow[0].Set(iid);
236 | outputRow[1].Set(value.Substring(1));
237 | }
238 | else
239 | {
240 | outputRow[0].Set(iid);
241 | outputRow[1].Set("");
242 | }
243 | 
244 | yield return outputRow;
245 | 
246 | }
247 | }
248 | 
249 | 
250 | 
251 | 
252 | public class OnlineSubmissionFormater : Reducer
253 | {
254 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
255 | {
256 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
257 | return new Schema(
258 | "ItemId:string,TopUserId:string"
259 | );
260 | }
261 | 
262 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
263 | {
264 | int topk = 250;
265 | 
266 | string iid = "";
267 | string uid = "";
268 | float score = 0;
269 | 
270 | List<Tuple<string, string, float>> score_list = new List<Tuple<string, string, float>>();
271 | 
272 | 
273 | foreach (Row row in input.Rows)
274 | {
275 | iid = row[1].String;
276 | 
277 | uid = row[0].String;
278 | 
279 | score = row[2].Float;
280 | 
281 | score_list.Add(new Tuple<string, string, float>(uid, iid, score));
282 | }
283 | 
284 | score_list.Sort((a, b) => b.Item3.CompareTo(a.Item3));
285 | 
286 | Dictionary<string, List<string>> iid2uids = new Dictionary<string, List<string>>();
287 | HashSet<string> visited_uids = new HashSet<string>();
288 | foreach (var tuple in score_list)
289 | {
290 | uid = tuple.Item1;
291 | iid = tuple.Item2;
292 | if (!visited_uids.Contains(uid))
293 | {
294 | if (!iid2uids.ContainsKey(iid))
295 | {
296 | iid2uids.Add(iid, new List<string>());
297 | }
298 | if (iid2uids[iid].Count < topk)
299 | {
300 | iid2uids[iid].Add(uid);
301 | visited_uids.Add(uid);
302 | }
303 | }
304 | }
305 | 
306 | 
307 | foreach (var pair in iid2uids)
308 | {
309 | outputRow[0].Set(pair.Key);
310 | string res = "";
311 | foreach (var tuid in pair.Value)
312 | {
313 | res += "," + tuid;
314 | }
315 | if (res.Length <= 0)
316 | {
317 | res = ", ";
318 | }
319 | outputRow[1].Set(res.Substring(1));
320 | yield return outputRow;
321 | }
322 | 
323 | }
324 | }
325 | 
326 | 
327 | public class TopKSelector : Reducer
328 | {
329 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
330 | {
331 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
332 | return new Schema(
333 | "UserId:string,ItemId:string,Probability:float,holder:string"
334 | );
335 | }
336 | 
337 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
338 | {
339 | int topk = 3000;
340 | 
341 | string iid = "";
342 | 
343 | List<Tuple<string, float>> uid_score_list = new List<Tuple<string, float>>();
344 | 
345 | 
346 | foreach (Row row in input.Rows)
347 | {
348 | iid = row[1].String;
349 | 
350 | string uid = row[0].String;
351 | 
352 | float score = row[2].Float;
353 | 
354 | uid_score_list.Add(new Tuple<string, float>(uid, score));
355 | }
356 | 
357 | uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
358 | int k = Math.Min(topk, uid_score_list.Count);
359 | 
360 | 
361 | for (int i = 0; i < k; i++)
362 | {
363 | outputRow[0].Set(uid_score_list[i].Item1);
364 | outputRow[1].Set(iid);
365 | outputRow[2].Set(uid_score_list[i].Item2);
366 | outputRow[3].Set("1");
367 | yield return outputRow;
368 | }
369 | 
370 | }
371 | }
372 | 
373 | public class TopKSelectorUserSide : Reducer
374 | {
375 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
376 | {
377 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
378 | return new Schema(
379 | "UserId:string,ItemId:string,Probability:float"
380 | );
381 | }
382 | 
383 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
384 | {
385 | int topk = 500;
386 | 
387 | string uid = "";
388 | 
389 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
390 | 
391 | 
392 | foreach (Row row in input.Rows)
393 | {
394 | uid = row[0].String;
395 | 
396 | string iid = row[1].String;
397 | 
398 | float score = row[2].Float;
399 | 
400 | iid_score_list.Add(new Tuple<string, float>(iid, score));
401 | }
402 | 
403 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
404 | int k = Math.Min(topk, iid_score_list.Count);
405 | 
406 | 
407 | for (int i = 0; i < k; i++)
408 | {
409 | outputRow[1].Set(iid_score_list[i].Item1);
410 | outputRow[0].Set(uid);
411 | outputRow[2].Set(iid_score_list[i].Item2);
412 | yield return outputRow;
413 | }
414 | 
415 | }
416 | }
417 | 
418 | 
419 | public class UserTopSecondSelector : Reducer
420 | {
421 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
422 | {
423 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestedColumns: {0}", string.Join(",", requestedColumns));
424 | return new Schema(
425 | "uid:string,iid:string,Probability:float,Place:int"
426 | );
427 | }
428 | 
429 | public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
430 | {
431 | int topk = int.Parse(args[0]);
432 | 
433 | string uid = "";
434 | 
435 | List<Tuple<string, float>> iid_score_list = new List<Tuple<string, float>>();
436 | 
437 | 
438 | foreach (Row row in input.Rows)
439 | {
440 | uid = row[0].String;
441 | 
442 | string iid = row[1].String;
443 | 
444 | float score = row[2].Float;
445 | 
446 | iid_score_list.Add(new Tuple<string, float>(iid, score));
447 | }
448 | 
449 | if (iid_score_list.Count < 1)
450 | {
451 | outputRow[0].Set("");
452 | outputRow[1].Set("");
453 | outputRow[2].Set(0); outputRow[3].Set("0");
454 | yield return outputRow;
455 | }
456 | else
457 | {
458 | iid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
459 | for (int i = 0; i < Math.Min(topk, iid_score_list.Count); i++) // guard: a user may have fewer than topk candidates
460 | {
461 | outputRow[0].Set(uid);
462 | outputRow[1].Set(iid_score_list[i].Item1);
463 | outputRow[2].Set(iid_score_list[i].Item2);
464 | outputRow[3].Set((i + 1).ToString());
465 | yield return outputRow;
466 | }
467 | 
468 | }
469 | 
470 | }
471 | }
472 | --------------------------------------------------------------------------------