├── README.md
├── RecsysChallenge2017.pdf
├── auto-pipeline
├── App.config
├── Data
│ ├── CleanOnlineData.cs
│ └── LocalDataGenrator.cs
├── Program.cs
├── Properties
│ └── AssemblyInfo.cs
├── RecSys17.csproj
├── SmallJobs.cs
├── Utils.cs
├── model
│ ├── DocumentClustering.cs
│ ├── DocumentRelated.cs
│ ├── Evaluation.cs
│ ├── FMProcessor.cs
│ ├── FeatureFactory.cs
│ ├── Item.cs
│ ├── ItemProfile.cs
│ ├── KNN.cs
│ ├── KeywordMgr.cs
│ ├── SubmissionHelper.cs
│ ├── User.cs
│ └── WordHashing.cs
└── py-pull_and_submit
│ ├── daily-pull-data.py
│ ├── model.py
│ ├── online_submit_auto.py
│ ├── online_submit_auto_-1.py
│ ├── online_submit_auto_2.py
│ ├── parser.py
│ └── recsys-submit-file.py
└── models
├── StudyNDCG.script
├── StudyNDCG.script.cs
├── TEST_Localmodel_tlc3_pipeline.script
├── TEST_Localmodel_tlc3_pipeline.script.cs
├── TEST_TrainModel_Pipeline_cls_tlc3_sparse.script
├── TEST_TrainModel_Pipeline_cls_tlc3_sparse.script.cs
├── TEST_tmp_Location_ExtractFeatures.script
├── TEST_tmp_Location_ExtractFeatures.script.cs
├── ensemble-2stage.script
├── ensemble-2stage.script.cs
├── ensemble.script
└── ensemble.script.cs
/README.md:
--------------------------------------------------------------------------------
1 | This is our source code for Recsys Challenge 2017 http://2017.recsyschallenge.com/.
2 | The official rank of our team is 5th, and our final model ranked 2nd for the last 2 consecutive weeks. It is a huge pity that we didn't use the best model in the first two weeks (in most other competitions, updating to the best model just before the deadline is enough; however, that is not true in this competition, which differs from what we expected).
3 |
4 | The code is written with Microsoft's internal big data platform named COSMOS and the language is Scope. If you are interested in running it, you can try the public version in Azure, which is called Data Lake and U-SQL https://docs.microsoft.com/en-us/azure/data-lake-analytics/data-lake-analytics-data-lake-tools-get-started .
5 |
6 | Scripts under the 'models' folder extract features, train models, make predictions, and perform various post-processing steps.
7 | The final features are in sparse format as SVMLight.
8 |
9 | Programs under the 'auto-pipeline' folder are the C# source code for our automatic pipeline.
10 |
11 | RecsysChallenge2017.pdf is our workshop paper, "Practical Lessons for Job Recommendations in the Cold-Start Scenario".
12 | https://dl.acm.org/citation.cfm?id=3124794
13 |
14 | Jianxun Lian, Fuzheng Zhang, Min Hou, Hongwei Wang, Xing Xie, and Guangzhong Sun. 2017. Practical Lessons for Job Recommendations in the Cold-Start Scenario. In Proceedings of the Recommender Systems Challenge 2017 (RecSys Challenge '17). ACM, New York, NY, USA, Article 4, 6 pages. DOI: https://doi.org/10.1145/3124791.3124794
15 |
--------------------------------------------------------------------------------
/RecsysChallenge2017.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leavingseason/RecsysChallenge2017/a05489995ef42805c88ef0984fcb93df8f6ac276/RecsysChallenge2017.pdf
--------------------------------------------------------------------------------
/auto-pipeline/App.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/auto-pipeline/Data/CleanOnlineData.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.Data
9 | {
10 | class CleanOnlineData
11 | {
12 | public static void AdjustItemColumns(string infile, string outfile)
13 | {
14 | string[] headers = "id title career_level discipline_id industry_id country is_payed region latitude longitude employment tags created_at".Split('\t');
15 | Dictionary newheader2idx = new Dictionary();
16 | Dictionary idx2header = new Dictionary();
17 | for (int i = 0; i < headers.Length; i++)
18 | {
19 | idx2header.Add(i, headers[i]);
20 | }
21 |
22 |
23 |
24 | using (StreamReader rd = new StreamReader(infile))
25 | using (StreamWriter wt = new StreamWriter(outfile))
26 | {
27 | string content = rd.ReadLine().Replace("recsyschallenge_vlive_2017_items.", "").Replace("recsyschallenge_vlive_2017_train_items_final.", "");
28 | string[] words = content.Split('\t');
29 | for (int i = 0; i < words.Length; i++)
30 | {
31 | newheader2idx.Add(words[i], i);
32 | }
33 |
34 | string res = "";
35 | for (int i = 0; i < idx2header.Count; i++)
36 | {
37 | res += "\t" + words[newheader2idx[idx2header[i]]];
38 | }
39 | wt.Write(res.Substring(1) + "\n");
40 |
41 | while ((content = rd.ReadLine()) != null)
42 | {
43 | words = content.Split('\t');
44 | res = "";
45 | for (int i = 0; i < idx2header.Count; i++)
46 | {
47 | res += "\t" + words[newheader2idx[idx2header[i]]];
48 | }
49 | wt.Write(res.Substring(1) + "\n");
50 | }
51 | }
52 | }
53 |
54 | public static void AppendLossPairs(string date ,int user_max_cnt = 10)
55 | {
56 | ;
57 | //string target_users_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data\target_users_2017-05-04.txt";//
58 | string accept_pair_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data\accepted_pairs\accepted_pairs_"+date+".txt";
59 | string user_side_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v_userside.csv";
60 |
61 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v-1.csv";
62 |
63 | string path = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online";
64 | string[] tried_files = new string[] {
65 | Path.Combine(path,"recsys17-pred-highdim-submit_v1.csv"),
66 | Path.Combine(path,"recsys17-pred-highdim-submit_v2.csv"),
67 | Path.Combine(path,"recsys17-pred-highdim-submit_v3.csv"),
68 | Path.Combine(path,"recsys17-pred-highdim-submit_v4.csv"),
69 | Path.Combine(path,"recsys17-pred-highdim-submit_v5.csv"),
70 | Path.Combine(path,"recsys17-pred-highdim-submit_v6.csv"),
71 | Path.Combine(path,"recsys17-pred-highdim-submit_v7.csv"),
72 | Path.Combine(path,"recsys17-pred-highdim-submit.csv")
73 | };
74 |
75 |
76 | Dictionary item2cnt = new Dictionary();
77 | Dictionary existing_usrs = new Dictionary();
78 |
79 | Dictionary> item2newusers = new Dictionary>();
80 |
81 | List> user_item_scores = new List>();
82 | using (StreamReader rd = new StreamReader(user_side_file))
83 | {
84 | string content = null;
85 | while ((content = rd.ReadLine()) != null)
86 | {
87 | string[] words = content.Split('\t');
88 | user_item_scores.Add(new Tuple(words[0], words[1], float.Parse(words[2])));
89 | }
90 | }
91 |
92 |
93 | using (StreamReader rd = new StreamReader(accept_pair_file))
94 | {
95 | string content = null;
96 | while ((content = rd.ReadLine()) != null)
97 | {
98 | string[] words = content.Split('\t');
99 | string[] tokens = words[1].Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
100 |
101 | item2cnt.Add(words[0].Trim(), tokens.Length);
102 |
103 | foreach (var token in tokens)
104 | {
105 | if (!existing_usrs.ContainsKey(token))
106 | {
107 | existing_usrs.Add(token, 1);
108 | }
109 | }
110 | }
111 | }
112 |
113 | HashSet tried_pairs = new HashSet();
114 | foreach (var file in tried_files)
115 | {
116 | using (StreamReader rd = new StreamReader(file))
117 | {
118 | string content = null;
119 | while ((content = rd.ReadLine()) != null)
120 | {
121 | string[] words = content.Split('\t');
122 | string[] tokens = words[1].Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
123 |
124 | foreach (var token in tokens)
125 | {
126 | tried_pairs.Add(token + ":" + words[0]);
127 | }
128 | }
129 | }
130 | }
131 |
132 | user_item_scores.Sort((a, b) => b.Item3.CompareTo(a.Item3));
133 |
134 | foreach (var tuple in user_item_scores)
135 | {
136 | if (!tried_pairs.Contains(tuple.Item1 + ":" + tuple.Item2) && (!existing_usrs.ContainsKey(tuple.Item1) || existing_usrs[tuple.Item1]());
146 | }
147 | item2newusers[tuple.Item2].Add(tuple.Item1);
148 |
149 | if(!existing_usrs.ContainsKey(tuple.Item1))
150 | existing_usrs.Add(tuple.Item1,0);
151 | existing_usrs[tuple.Item1]++;
152 | }
153 | }
154 |
155 | using (StreamWriter wt = new StreamWriter(outfile))
156 | {
157 | foreach (var pair in item2newusers)
158 | {
159 | wt.Write("{0}\t{1}\n", pair.Key, string.Join(",", pair.Value.ToArray()));
160 | }
161 | }
162 | }
163 |
164 | public static void PrepareFMFile(string infile, string outfile01, string outfile02)
165 | {
166 | using(StreamReader rd = new StreamReader(infile))
167 | using(StreamWriter wt01 = new StreamWriter(outfile01))
168 | using (StreamWriter wt02 = new StreamWriter(outfile02))
169 | {
170 | string content = null;
171 | int cnt = 0;
172 | while ((content = rd.ReadLine()) != null)
173 | {
174 | if (cnt++ % 100000 == 0)
175 | {
176 | Console.Write("{0}\r",cnt);
177 | }
178 | int idx = content.IndexOf("#");
179 | wt01.Write(content.Substring(0,idx)+"\n");
180 | wt02.Write(content.Substring(idx+1)+"\n");
181 | }
182 | }
183 | }
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
/auto-pipeline/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("RecSys17")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("RecSys17")]
13 | [assembly: AssemblyCopyright("Copyright © 2017")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("6666f690-4527-4d93-a733-3fc95f3fa7e4")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/auto-pipeline/RecSys17.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {6C3A3F7B-B868-4ECB-9BE0-E80A227551C1}
8 | Exe
9 | Properties
10 | RecSys17
11 | RecSys17
12 | v4.5
13 | 512
14 |
15 |
16 | x64
17 | true
18 | full
19 | false
20 | bin\Debug\
21 | DEBUG;TRACE
22 | prompt
23 | 4
24 |
25 |
26 | x64
27 | pdbonly
28 | true
29 | bin\Release\
30 | TRACE
31 | prompt
32 | 4
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | D:\My Projects\LyncOnlineAnalyse\Tools\bin\Release\Tools.dll
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
79 |
--------------------------------------------------------------------------------
/auto-pipeline/Utils.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17
9 | {
10 | class Utils
11 | {
12 | public static void OutputDict(Dictionary dict, string outfile)
13 | {
14 | using (StreamWriter wt = new StreamWriter(outfile))
15 | {
16 | foreach (var pair in dict)
17 | {
18 | wt.WriteLine("{0},{1}", pair.Key, pair.Value);
19 | }
20 | }
21 | }
22 |
23 | public static Dictionary LoadDict(string infile, int keyIdx, int valueIdx)
24 | {
25 | Dictionary result = new Dictionary();
26 | using (StreamReader rd = new StreamReader(infile))
27 | {
28 | string content = null;
29 | while ((content = rd.ReadLine()) != null)
30 | {
31 | string[] words = content.Split(',');
32 | if(!string.IsNullOrEmpty(words[valueIdx]))
33 | result.Add(words[keyIdx], float.Parse(words[valueIdx]));
34 | }
35 | }
36 | return result;
37 | }
38 |
39 |
40 | public static void OverlapStat(string file01, string file02, int colidx)
41 | {
42 | HashSet values01 = LoadValue2Hashset(file01, colidx);
43 | HashSet values02 = LoadValue2Hashset(file02,colidx);
44 |
45 | int hit = values01.Intersect(values02).Count();
46 |
47 | Console.WriteLine("{0}\t{1}\t{2}", hit, values01.Count, values02.Count);
48 | }
49 |
50 | private static HashSet LoadValue2Hashset(string file, int colidx)
51 | {
52 | HashSet res = new HashSet();
53 | using (StreamReader rd = new StreamReader(file))
54 | {
55 | string content = null;
56 | int cnt = 0;
57 | while ((content = rd.ReadLine()) != null)
58 | {
59 | if (cnt++ % 100000 == 0)
60 | {
61 | Console.WriteLine(cnt);
62 | }
63 | string[] words = content.Split(',');
64 | if (!res.Contains(words[colidx]))
65 | {
66 | res.Add(words[colidx]);
67 | }
68 | }
69 | }
70 | return res;
71 | }
72 |
73 | public static void SelectSubSet(string infile, string outfile, string[] col_names)
74 | {
75 | HashSet selectedFeatures = new HashSet(col_names);
76 |
77 |
78 | int cnt = 0;
79 | using (StreamReader rd = new StreamReader(infile))
80 | using (StreamWriter wt = new StreamWriter(outfile))
81 | {
82 | string content = rd.ReadLine();
83 | string[] headers = content.Split(',');
84 | HashSet selectedFeatureIdx = new HashSet();
85 | int dim = headers.Length;
86 | wt.Write(headers[0] + "," + headers[1]);
87 | for (int i = 2; i < dim; i++)
88 | {
89 | if (selectedFeatures.Contains(headers[i]))
90 | {
91 | selectedFeatureIdx.Add(i);
92 | wt.Write("," + headers[i]);
93 | }
94 | }
95 | wt.WriteLine();
96 |
97 | while ((content = rd.ReadLine()) != null)
98 | {
99 | if (cnt++ % 10000 == 0)
100 | {
101 | Console.WriteLine(cnt);
102 | }
103 | string[] words = content.Split(',');
104 | wt.Write(words[0] + "," + words[1]);
105 | for (int i = 2; i < dim; i++)
106 | {
107 | if (selectedFeatureIdx.Contains(i))
108 | {
109 | wt.Write("," + words[i]);
110 | }
111 | }
112 | wt.WriteLine();
113 | }
114 | }
115 | }
116 |
117 | public static void ShuffleFile(string infile, string outfile)
118 | {
119 | Console.WriteLine("ShuffleFile...");
120 | List lines = new List();
121 | using (StreamReader rd = new StreamReader(infile))
122 | {
123 | string content = null;
124 | int cnt = 0;
125 | while ((content = rd.ReadLine()) != null)
126 | {
127 | if (cnt++ % 1000000 == 0)
128 | {
129 | Console.Write(cnt + "\r");
130 | }
131 | lines.Add(content);
132 | }
133 | }
134 |
135 | var arr = lines.ToArray();
136 | Tools.Common.Shuffle(new Random(), arr);
137 |
138 | using (StreamWriter wt = new StreamWriter(outfile))
139 | {
140 | foreach (var line in arr)
141 | {
142 | wt.WriteLine(line);
143 | }
144 | }
145 | }
146 |
147 | public static List RandomPickup(List list, int k)
148 | {
149 | if (list.Count <= k)
150 | {
151 | return new List(list);
152 | }
153 |
154 | int cnt = list.Count;
155 | Random rng = new Random();
156 | for (int i = 0; i < k; i++)
157 | {
158 | int idx = rng.Next(cnt - i);
159 | string tmp = list[idx];
160 | list[idx] = list[cnt - 1 - i];
161 | list[cnt - 1 - i] = tmp;
162 | }
163 |
164 | return list.GetRange(cnt - k, k);
165 | }
166 |
167 | public static void SelectSubSet(string infile, string outfile, List selectedFeatureIdx, int topk = 100000)
168 | {
169 | int cnt = 0;
170 | using (StreamReader rd = new StreamReader(infile))
171 | using (StreamWriter wt = new StreamWriter(outfile))
172 | {
173 | string content = null;
174 | while ((content = rd.ReadLine()) != null)
175 | {
176 | if (cnt++ % 10000 == 0)
177 | {
178 | Console.WriteLine(cnt);
179 | }
180 | if (cnt > topk)
181 | {
182 | break;
183 | }
184 | string[] words = content.Split(',');
185 | wt.Write(words[0] + "," + words[1]);
186 | foreach(var idx in selectedFeatureIdx)
187 | {
188 | wt.Write("," + words[idx]);
189 | }
190 | wt.WriteLine();
191 | }
192 | }
193 | }
194 |
195 |
196 | public static void SelectFeatureSubset(string infile, string outfile, string featureRankFile, int k, double r)
197 | {
198 | Random rng = new Random((int)DateTime.Now.Ticks);
199 |
200 | /// load features ranks
201 | List> feature2importance = LoadFeature2Importance(featureRankFile);
202 |
203 | /// select features
204 | HashSet selectedFeatures = new HashSet();
205 | for (int i = 0; i < k; i++)
206 | {
207 | selectedFeatures.Add(feature2importance[i].Item1);
208 | }
209 |
210 | int cnt = 0;
211 | using (StreamReader rd = new StreamReader(infile))
212 | using (StreamWriter wt = new StreamWriter(outfile))
213 | {
214 | string content = rd.ReadLine();
215 | string[] headers = content.Split(',');
216 | HashSet selectedFeatureIdx = new HashSet();
217 | int dim = headers.Length;
218 | wt.Write(headers[0] + "," + headers[1]);
219 | for (int i = 2; i < dim; i++)
220 | {
221 | if (selectedFeatures.Contains(headers[i]))
222 | {
223 | selectedFeatureIdx.Add(i);
224 | wt.Write("," + headers[i]);
225 | }
226 | else
227 | {
228 | if (rng.NextDouble() < r)
229 | {
230 | selectedFeatureIdx.Add(i);
231 | wt.Write("," + headers[i]);
232 | }
233 | }
234 | }
235 | wt.WriteLine();
236 |
237 | while ((content = rd.ReadLine()) != null)
238 | {
239 | if (cnt++ % 10000 == 0)
240 | {
241 | Console.WriteLine(cnt);
242 | }
243 | string[] words = content.Split(',');
244 | wt.Write(words[0] + "," + words[1]);
245 | for (int i = 2; i < dim; i++)
246 | {
247 | if (selectedFeatureIdx.Contains(i))
248 | {
249 | wt.Write("," + words[i]);
250 | }
251 | }
252 | wt.WriteLine();
253 | }
254 | }
255 | }
256 |
257 | public static List> LoadFeature2Importance(string featureRankFile)
258 | {
259 | List> feature2importance = new List>();
260 | double t;
261 | using (StreamReader rd = new StreamReader(featureRankFile))
262 | {
263 | string content = null;
264 | while ((content = rd.ReadLine()) != null)
265 | {
266 | string[] words = content.Replace("\"", "").Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
267 | if (words.Length < 2 || !double.TryParse(words[1], out t))
268 | {
269 | continue;
270 | }
271 | feature2importance.Add(new Tuple(words[0], double.Parse(words[1])));
272 | }
273 | }
274 | return feature2importance;
275 | }
276 |
277 |
278 | internal static void StatColLabelCorre()
279 | {
280 | throw new NotImplementedException();
281 | }
282 |
283 | internal static void StatColLabelCorre(string infile, string outfile, int label_idx, int col_idx)
284 | {
285 | Dictionary value2cnt = new Dictionary();
286 | Dictionary value2poscnt = new Dictionary();
287 | using (StreamReader rd = new StreamReader(infile))
288 | {
289 | string content = rd.ReadLine();
290 | while ((content = rd.ReadLine()) != null)
291 | {
292 | string[] words = content.Split(',');
293 | if (!value2cnt.ContainsKey(words[col_idx]))
294 | {
295 | value2cnt.Add(words[col_idx],0);
296 | value2poscnt.Add(words[col_idx],0);
297 | }
298 | value2cnt[words[col_idx]]++;
299 | if (words[label_idx].Equals("1") || words[label_idx].Equals("True"))
300 | {
301 | value2poscnt[words[col_idx]]++;
302 | }
303 | }
304 | }
305 |
306 | using (StreamWriter wt = new StreamWriter(outfile))
307 | {
308 | foreach (var pair in value2cnt)
309 | {
310 | wt.WriteLine("{0},{1},{2},{3}", pair.Key, pair.Value, value2poscnt[pair.Key], value2poscnt[pair.Key] * 1.0 / pair.Value);
311 | }
312 | }
313 | }
314 |
315 | internal static void OutputDict02(Dictionary word_cnt, Dictionary word_hit, string outfile)
316 | {
317 | using (StreamWriter wt = new StreamWriter(outfile))
318 | {
319 | foreach (var pair in word_cnt)
320 | {
321 | if (pair.Value > 0)
322 | {
323 | wt.WriteLine("{0},{1},{2},{3}", pair.Key, pair.Value, word_hit[pair.Key], word_hit[pair.Key] * 1.0 / pair.Value);
324 | }
325 | }
326 | }
327 | }
328 | }
329 | }
330 |
--------------------------------------------------------------------------------
/auto-pipeline/model/DocumentClustering.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
10 | class DocumentClustering
11 | {
12 |
13 | public static void TestGenClusterIdFeature()
14 | {
15 | string candi_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\train-test\train02_candidates_localgen.csv";
16 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\train_feature_as_clusterid.csv";
17 |
18 | string TLC_cluster_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\TLC\-1.inst.txt";
19 | string TLC_training_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\training.txt";
20 | string cluster_out_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\cluster_id_mapping.tsv";
21 |
22 | Dictionary id2cluster = new Dictionary();
23 | using(StreamReader rd01= new StreamReader(TLC_cluster_file))
24 | using (StreamReader rd02 = new StreamReader(TLC_training_file))
25 | {
26 | string content = rd01.ReadLine();
27 | while ((content = rd02.ReadLine()) != null)
28 | {
29 | string id = content.Substring(content.IndexOf("#") + 1);
30 | string[] words = rd01.ReadLine().Split('\t');
31 | id2cluster.Add(id, int.Parse(words[2]));
32 | }
33 | }
34 |
35 | //using(StreamReader rd= new StreamReader(candi_file))
36 | //using (StreamWriter wt = new StreamWriter(outfile))
37 | //{
38 | // string content = null;
39 | // while ((content = rd.ReadLine()) != null)
40 | // {
41 | // string[] words = content.Split('\t');
42 | // string uid = "uid_" + words[0];
43 | // string iid = "iid_" + words[1];
44 | // if (id2cluster.ContainsKey(uid) && id2cluster.ContainsKey(iid))
45 | // {
46 | // wt.WriteLine("{0},{1},{2},{3},{4},{5}", words[2]=="0" || words[2]=="4"?"0":"1", words[0], words[1], id2cluster[uid], id2cluster[iid], id2cluster[uid] == id2cluster[iid] ? 1 : 0);
47 | // }
48 | // }
49 | //}
50 |
51 | using (StreamWriter wt = new StreamWriter(cluster_out_file))
52 | {
53 | foreach (var pair in id2cluster)
54 | {
55 | wt.WriteLine("{0}\t{1}",pair.Key,pair.Value);
56 | }
57 | }
58 |
59 | }
60 |
61 | ///
62 | /// preapre svmlight feature for TLC kmeans clustering
63 | ///
64 | public static void PrepareFeatureFile()
65 | {
66 | bool reset_keymap = false;
67 | Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1);
68 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1);
69 |
70 | string keymapfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\word_id_mapping.csv";
71 |
72 | if (reset_keymap)
73 | {
74 | BuildKeyMapping(keymapfile,user_titlefreq,item_titlefreq);
75 | }
76 |
77 | Dictionary keymapper = LoadKeymapfile(keymapfile);
78 |
79 |
80 | Dictionary userdict = FeatureFactory.BuildUserDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\data_online\online\users_adj_schema.csv");
81 | Dictionary itemdict = FeatureFactory.BuildItemDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\data_online\online\items_noheader.csv");
82 |
83 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\online\training.txt";
84 | using (StreamWriter wt = new StreamWriter(outfile))
85 | {
86 | foreach (var pair in userdict)
87 | {
88 | List> words = new List>();
89 | if (pair.Value.title.Count > 0)
90 | {
91 | foreach (var word in pair.Value.title)
92 | {
93 | if (keymapper.ContainsKey(word))
94 | {
95 | words.Add(new Tuple(word, keymapper[word]));
96 | }
97 | }
98 | if (words.Count > 4)
99 | {
100 | words.Sort((a, b) => a.Item2.CompareTo(b.Item2));
101 | string res = "";
102 | foreach (var tuple in words)
103 | {
104 | res += " " + tuple.Item2+":1";
105 | }
106 | wt.WriteLine("0" + res + "#uid_" + pair.Key);
107 | }
108 | }
109 | }
110 |
111 | foreach (var pair in itemdict)
112 | {
113 | List> words = new List>();
114 | if (pair.Value.title.Count > 0)
115 | {
116 | foreach (var word in pair.Value.title)
117 | {
118 | if (keymapper.ContainsKey(word))
119 | {
120 | words.Add(new Tuple(word, keymapper[word]));
121 | }
122 | }
123 | if (words.Count > 4)
124 | {
125 | words.Sort((a, b) => a.Item2.CompareTo(b.Item2));
126 | string res = "";
127 | foreach (var tuple in words)
128 | {
129 | res += " " + tuple.Item2 + ":1";
130 | }
131 | wt.WriteLine("0" + res + "#iid_" + pair.Key);
132 | }
133 | }
134 | }
135 |
136 | }
137 |
138 | }
139 |
140 | public static Dictionary LoadKeymapfile(string keymapfile)
141 | {
142 | Dictionary keymapper = new Dictionary();
143 | using (StreamReader rd = new StreamReader(keymapfile))
144 | {
145 | string content = null;
146 | while ((content = rd.ReadLine()) != null)
147 | {
148 | string[] words = content.Split(',');
149 | keymapper.Add(words[0], int.Parse(words[1]));
150 | }
151 | }
152 | return keymapper;
153 | }
154 |
155 | private static void BuildKeyMapping(string keymapfile, Dictionary user_titlefreq, Dictionary item_titlefreq)
156 | {
157 | Dictionary word2idx = new Dictionary();
158 | foreach (var pair in user_titlefreq)
159 | {
160 | if (pair.Value >= 20)
161 | {
162 | if (!word2idx.ContainsKey(pair.Key))
163 | {
164 | word2idx.Add(pair.Key, word2idx.Count + 1);
165 | }
166 | }
167 | }
168 |
169 | foreach (var pair in item_titlefreq)
170 | {
171 | if (pair.Value >= 20)
172 | {
173 | if (!word2idx.ContainsKey(pair.Key))
174 | {
175 | word2idx.Add(pair.Key, word2idx.Count + 1);
176 | }
177 | }
178 | }
179 |
180 | using (StreamWriter wt = new StreamWriter(keymapfile))
181 | {
182 | foreach (var pair in word2idx)
183 | {
184 | wt.WriteLine("{0},{1}",pair.Key,pair.Value);
185 | }
186 | }
187 | }
188 | }
189 | }
190 |
--------------------------------------------------------------------------------
/auto-pipeline/model/DocumentRelated.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
10 | class DocumentRelated
11 | {
12 | public static void GenKeyWords()
13 | {
14 | string outfile_useritem = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_useritem.csv";
15 | string outfile_itemtitle = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtitle.csv";
16 | string outfile_itemtag = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtag.csv";
17 |
18 | string interation_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\train-test\train_interactions_sample1_0.3_moreneginst_shuffled.csv";
19 |
20 | Dictionary userdict =FeatureFactory. BuildUserDict();
21 | Dictionary itemdict = FeatureFactory.BuildItemDict();
22 | Dictionary> user2interest_items = KNN.BuildUserInterestedItems(interation_file);
23 |
24 | Dictionary useritem_word_cnt = new Dictionary();
25 | Dictionary useritem_word_hit = new Dictionary();
26 |
27 | Dictionary itemtitle_word_cnt = new Dictionary();
28 | Dictionary itemtitle_word_hit = new Dictionary();
29 |
30 | Dictionary itemtag_word_cnt = new Dictionary();
31 | Dictionary itemtag_word_hit = new Dictionary();
32 |
33 | using (StreamReader rd = new StreamReader(interation_file))
34 | {
35 | string content = null;
36 | int cnt = 0;
37 | while ((content = rd.ReadLine()) != null)
38 | {
39 | if (cnt++ % 100000 == 0)
40 | {
41 | Console.Write((cnt / 10000) + "w\r");
42 | }
43 | string[] words = content.Split('\t');
44 | if (itemdict.ContainsKey(words[1]) && userdict.ContainsKey(words[0]))
45 | {
46 | HashSet overlap01 = new HashSet(userdict[words[0]].title.Intersect(itemdict[words[1]].title));
47 | HashSet overlap02 = new HashSet();
48 | HashSet overlap03 = new HashSet();
49 |
50 | if (user2interest_items.ContainsKey(words[0]))
51 | {
52 | foreach (var tid in user2interest_items[words[0]])
53 | {
54 | if (tid!=words[1] && itemdict.ContainsKey(tid))
55 | {
56 | foreach (var ttitle in itemdict[tid].title)
57 | {
58 | if (itemdict[words[1]].title.Contains(ttitle))
59 | {
60 | if (!overlap02.Contains(ttitle))
61 | {
62 | overlap02.Add(ttitle);
63 | }
64 | }
65 | }
66 | foreach (var ttag in itemdict[tid].tags)
67 | {
68 | if (itemdict[words[1]].tags.Contains(ttag))
69 | {
70 | if (!overlap03.Contains(ttag))
71 | {
72 | overlap03.Add(ttag);
73 | }
74 | }
75 | }
76 | }
77 | }
78 | }
79 |
80 | UpdateWordStatus(overlap01, useritem_word_cnt, useritem_word_hit, words[2]);
81 | UpdateWordStatus(overlap02, itemtitle_word_cnt, itemtitle_word_hit, words[2]);
82 | UpdateWordStatus(overlap03, itemtag_word_cnt, itemtag_word_hit, words[2]);
83 | }
84 | }
85 | }
86 |
87 | Utils.OutputDict02(useritem_word_cnt, useritem_word_hit, outfile_useritem);
88 | Utils.OutputDict02(itemtitle_word_cnt, itemtitle_word_hit, outfile_itemtitle);
89 | Utils.OutputDict02(itemtag_word_cnt, itemtag_word_hit, outfile_itemtag);
90 |
91 | }
92 |
93 | private static void UpdateWordStatus(HashSet overlap, Dictionary word_cnt, Dictionary word_hit, string status)
94 | {
95 | foreach (var word in overlap)
96 | {
97 | if (!word_cnt.ContainsKey(word))
98 | {
99 | word_cnt.Add(word, 0);
100 | word_hit.Add(word, 0);
101 | }
102 | word_cnt[word]++;
103 | if (status != "0" && status != "4")
104 | {
105 | word_hit[word]++;
106 | }
107 | }
108 | }
109 |
110 | public static void PrepareTitleDocuments()
111 | {
112 |
113 | Dictionary user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1);
114 | Dictionary item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1);
115 |
116 |
117 | string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\word2vec\user_item_title_lines.txt";
118 |
119 | List lines = new List();
120 |
121 | Add2Lines(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\users.csv", user_titlefreq, lines);
122 | Add2Lines(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\items.csv", item_titlefreq, lines);
123 |
124 | var lines_arr = lines.ToArray();
125 |
126 | Tools.Common.Shuffle(new Random(), lines_arr);
127 |
128 | using (StreamWriter wt = new StreamWriter(outfile))
129 | {
130 | foreach (var line in lines_arr)
131 | {
132 | wt.Write(line + "\n");
133 | }
134 | }
135 |
136 | }
137 |
/// <summary>
/// Reads a tab-separated user/item dump (skipping the header line), keeps the
/// comma-separated title tokens in column 1 whose corpus frequency exceeds 10,
/// and appends one comma-joined line per record to <paramref name="lines"/>.
/// Records with no surviving tokens produce no line.
/// </summary>
/// <param name="file">users.csv or items.csv (tab separated, first line is a header).</param>
/// <param name="titlefreq">Word -> corpus frequency.</param>
/// <param name="lines">Accumulator for the output corpus (mutated in place).</param>
private static void Add2Lines(string file, Dictionary<string, int> titlefreq, List<string> lines)
{
    using (StreamReader rd = new StreamReader(file))
    {
        string content = rd.ReadLine();   // skip header
        while ((content = rd.ReadLine()) != null)
        {
            // StringBuilder avoids O(n^2) string concatenation per record.
            StringBuilder line = new StringBuilder();
            string[] words = content.Split('\t');
            if (!string.IsNullOrEmpty(words[1]))
            {
                string[] tokens = words[1].Split(',');
                foreach (var token in tokens)
                {
                    int freq;
                    // Keep only sufficiently frequent words (single lookup via TryGetValue).
                    if (titlefreq.TryGetValue(token, out freq) && freq > 10)
                    {
                        line.Append(',').Append(token);
                    }
                }
            }
            if (line.Length > 1)
            {
                // Drop the leading comma.
                lines.Add(line.ToString(1, line.Length - 1));
            }
        }
    }
}
165 | }
166 | }
167 |
--------------------------------------------------------------------------------
/auto-pipeline/model/Evaluation.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
class Evaluation
{

    /// <summary>
    /// Reads a TLC-style prediction file (tab separated: "uid|iid" in column 0,
    /// label in column 1, score in column 3) and writes a cumulative recall@k
    /// curve: for each cut-off k, the fraction of all positive pairs found within
    /// the top-k ranked predictions of their item.
    /// </summary>
    /// <param name="predfile">Prediction file to evaluate.</param>
    /// <param name="outfile">Output CSV with lines "k,recall".</param>
    public static void StatRecall(string predfile, string outfile)
    {
        // item id -> list of (user id, score, label)
        Dictionary<string, List<Tuple<string, double, int>>> item2predictions = new Dictionary<string, List<Tuple<string, double, int>>>();
        HashSet<string> posset = new HashSet<string>();   // distinct "uid|iid" keys with label "1"
        int cnt = 0;
        using (StreamReader rd = new StreamReader(predfile))
        {
            string content = null;

            while ((content = rd.ReadLine()) != null)
            {
                if (cnt++ % 100000 == 0)
                {
                    Console.Write(cnt + "\r");   // progress indicator
                }
                string[] words = content.Split('\t');

                string[] tokens = words[0].Split('|');

                double score = double.Parse(words[3]);

                if (!item2predictions.ContainsKey(tokens[1]))
                {
                    item2predictions.Add(tokens[1], new List<Tuple<string, double, int>>());
                }
                item2predictions[tokens[1]].Add(new Tuple<string, double, int>(tokens[0], score, int.Parse(words[1])));

                if (words[1] == "1")
                {
                    posset.Add(words[0]);
                }
            }
        }

        Console.WriteLine("Poscnt : {0}", posset.Count);

        // Sort each item's candidates by descending score.
        foreach (var iid in item2predictions.Keys)
        {
            item2predictions[iid].Sort((a, b) => b.Item2.CompareTo(a.Item2));
        }

        using (StreamWriter wt = new StreamWriter(outfile))
        {
            // `hit` accumulates across k, so each row is the cumulative recall@k.
            int hit = 0;
            for (int k = 1; k < 20000; k++)
            {
                foreach (var pair in item2predictions)
                {
                    if (pair.Value.Count >= k)
                    {
                        if (pair.Value[k - 1].Item3 == 1)
                        {
                            hit++;
                        }
                    }
                }
                wt.WriteLine("{0},{1}", k, hit * 1.0 / posset.Count);
            }
        }

    }

    /// <summary>
    /// Random baseline: for every ground-truth item, samples topk users uniformly
    /// (with replacement) from that item's candidate pool and scores them with the
    /// weighted metric (user success times premium weight, plus an item bonus).
    /// Prints "items  score  score/items" and returns the total score.
    /// </summary>
    public static double RandomScore(string infile, string gtfile, Dictionary<string, User> userdict, Dictionary<string, Item> itemdict)
    {
        int topk = 100;

        // item id -> candidate users observed for that item in the feature file.
        Dictionary<string, List<string>> item2userset = new Dictionary<string, List<string>>();
        int cnt = 0;
        using (StreamReader rd = new StreamReader(infile))
        {
            string content = rd.ReadLine();   // skip header
            while ((content = rd.ReadLine()) != null)
            {
                if (cnt++ % 100000 == 0)
                {
                    Console.WriteLine(cnt);
                }
                string[] words = content.Split('\t')[0].Split('|');
                if (!item2userset.ContainsKey(words[1]))
                {
                    item2userset.Add(words[1], new List<string>());
                }
                item2userset[words[1]].Add(words[0]);
            }
        }


        Random rng = new Random();

        Dictionary<string, Dictionary<string, int>> item2user2status = LoadGTFile(gtfile, false);

        double res = 0;
        int line_cnt = 0;
        foreach (var pair in item2user2status)
        {
            line_cnt++;
            if (line_cnt % 100 == 0)
            {
                Console.WriteLine(line_cnt);
            }
            int success_user_cnt = 0;
            for (int i = 0; i < topk; i++)
            {
                // NOTE(review): assumes every ground-truth item appears in infile;
                // a missing key here would throw — confirm against the data.
                string uid = item2userset[pair.Key][rng.Next(item2userset[pair.Key].Count)];
                int cur_user_sucess = UserSucess(pair.Key, uid, item2user2status);
                if (cur_user_sucess > 0)
                {
                    success_user_cnt++;
                }
                res += cur_user_sucess * (IsPremiumUser(uid, userdict));
            }
            res += ItemSucess(success_user_cnt, pair.Key, itemdict);
        }


        Console.WriteLine("{0}\t{1}\t{2}", line_cnt, res, res / line_cnt);
        return res;
    }

    /// <summary>
    /// Scores a submission file ("itemId uid1,uid2,...") against ground truth,
    /// counting one point per successfully recommended user. The weighted
    /// challenge metric (premium/item bonuses) is left commented out.
    /// Prints "lines  score  score/lines" and returns the total score.
    /// </summary>
    public static double Score(string subfile, string gtfile, Dictionary<string, User> userdict = null, Dictionary<string, Item> itemdict = null, bool isFeatureMode = false)
    {

        //if (userdict == null)
        //    userdict = FeatureFactory.BuildUserDict();
        //if (itemdict == null)
        //    itemdict = FeatureFactory.BuildItemDict();

        Dictionary<string, Dictionary<string, int>> item2user2status = LoadGTFile(gtfile, isFeatureMode);

        double res = 0;
        int line_cnt = 0;
        using (StreamReader rd = new StreamReader(subfile))
        {
            string content = null;
            while ((content = rd.ReadLine()) != null)
            {
                string[] words = content.Split(new char[] { ' ', '\t' });
                if (words.Length < 2)
                {
                    continue;   // skip blank / malformed lines
                }
                line_cnt++;
                string[] tokens = words[1].Split(',');
                int success_user_cnt = 0;
                for (int j = 0; j < tokens.Length; j++)
                {
                    var token = tokens[j];
                    int cur_user_sucess = UserSucess(words[0], token, item2user2status);
                    if (cur_user_sucess > 0)
                    {
                        success_user_cnt++;
                    }
                    res += cur_user_sucess > 0 ? 1 : 0; //cur_user_sucess * (IsPremiumUser(token, userdict));

                }

                // res += ItemSucess(success_user_cnt, words[0], itemdict);
            }
        }
        // NOTE(review): divides by line_cnt, which is 0 for an empty submission file.
        Console.WriteLine("{0}\t{1}\t{2}", line_cnt, res, res / line_cnt);
        return res;
    }

    /// <summary>
    /// Position-wise hit counts for a submission file: hit_cnt[j] (j &lt; 100) is
    /// the number of successful recommendations at rank j across all items; the
    /// final slot hit_cnt[100] holds the grand total.
    /// </summary>
    public static int[] Score02(string subfile, string gtfile, Dictionary<string, User> userdict = null, Dictionary<string, Item> itemdict = null)
    {
        int[] hit_cnt = new int[100 + 1];
        Array.Clear(hit_cnt, 0, hit_cnt.Length);

        if (userdict == null)
            userdict = FeatureFactory.BuildUserDict();
        if (itemdict == null)
            itemdict = FeatureFactory.BuildItemDict();

        Dictionary<string, Dictionary<string, int>> item2user2status = LoadGTFile(gtfile, false);

        int res = 0;
        int line_cnt = 0;
        using (StreamReader rd = new StreamReader(subfile))
        {
            string content = null;
            while ((content = rd.ReadLine()) != null)
            {
                string[] words = content.Split(new char[] { ' ', '\t' });
                if (words.Length < 2)
                {
                    continue;
                }
                line_cnt++;
                string[] tokens = words[1].Split(',');

                // Only the first 100 ranks are tracked.
                for (int j = 0; j < tokens.Length && j < 100; j++)
                {
                    var token = tokens[j];
                    int cur_user_sucess = UserSucess(words[0], token, item2user2status) > 0 ? 1 : 0;

                    hit_cnt[j] += cur_user_sucess;
                    res += cur_user_sucess;
                }
            }
        }
        Console.WriteLine("{0}\t{1}\t{2}", line_cnt, res, res * 1.0 / line_cnt);

        hit_cnt[hit_cnt.Length - 1] = res;
        return hit_cnt;
    }


    /// <summary>
    /// Item-level bonus: 0 when no user succeeded for the item, 50 when the item
    /// is a paid posting, 25 otherwise.
    /// </summary>
    private static double ItemSucess(int success_user_cnt, string iid, Dictionary<string, Item> itemdict)
    {
        if (success_user_cnt <= 0)
        {
            return 0;
        }

        if (itemdict.ContainsKey(iid) && itemdict[iid].is_paid == "1")
        {
            return 50;
        }

        return 25;
    }

    /// <summary>
    /// Weight multiplier for a user: 2 for premium users, 1 otherwise.
    /// </summary>
    private static int IsPremiumUser(string token, Dictionary<string, User> userdict)
    {
        if (userdict.ContainsKey(token) && userdict[token].premium == "1")
        {
            return 2;
        }
        else
        {
            return 1;
        }
    }

    /// <summary>
    /// Per-(item,user) score from the ground-truth status: 1 -> 1, 2 or 3 -> 5,
    /// 5 -> 20, 4 -> -10, anything else (or no entry) -> 0.
    /// </summary>
    private static int UserSucess(string iid, string uid, Dictionary<string, Dictionary<string, int>> item2user2status)
    {
        int score = 0;
        if (item2user2status.ContainsKey(iid))
        {
            if (item2user2status[iid].ContainsKey(uid))
            {
                if (item2user2status[iid][uid] == 1)
                {
                    score = 1;
                }
                else if (item2user2status[iid][uid] == 2 || item2user2status[iid][uid] == 3)
                {
                    score = 5;
                }
                else if (item2user2status[iid][uid] == 5)
                {
                    score = 20;
                }
                else if (item2user2status[iid][uid] == 4)
                {
                    score = -10;
                }
            }
        }
        return score;
    }

    /// <summary>
    /// Loads ground truth into item id -> (user id -> status). In feature mode a
    /// line is "status,uid|iid,..."; otherwise "uid\tiid\tstatus". Only entries
    /// with status > 0 are kept.
    /// NOTE(review): assumes each (item, user) pair occurs at most once; a
    /// duplicate would make Dictionary.Add throw.
    /// </summary>
    private static Dictionary<string, Dictionary<string, int>> LoadGTFile(string gtfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\train-test\test.tsv", bool isFeatureMode = false)
    {
        Dictionary<string, Dictionary<string, int>> res = new Dictionary<string, Dictionary<string, int>>();
        char spliter = isFeatureMode ? ',' : '\t';
        using (StreamReader rd = new StreamReader(gtfile))
        {
            string content = null;
            while ((content = rd.ReadLine()) != null)
            {
                string[] words = content.Split(spliter);
                string uid = null, iid = null;
                int status = 0;
                if (isFeatureMode)
                {
                    status = int.Parse(words[0]);
                    string[] tokens = words[1].Split('|');
                    uid = tokens[0];
                    iid = tokens[1];
                }
                else
                {
                    status = int.Parse(words[2]);
                    uid = words[0];
                    iid = words[1];
                }
                if (status > 0)
                {
                    if (!res.ContainsKey(iid))
                    {
                        res.Add(iid, new Dictionary<string, int>());
                    }
                    res[iid].Add(uid, status);
                }
            }
        }
        return res;
    }
}
313 | }
314 |
--------------------------------------------------------------------------------
/auto-pipeline/model/FMProcessor.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
class FMProcessor
{
    /// <summary>
    /// Joins an id file with a libFM-style prediction file line by line and writes
    /// "id,score" lines. Line i of <paramref name="predfile"/> must correspond to
    /// line i of <paramref name="idfile"/>; the score is the second
    /// space-separated token of the prediction line.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when the prediction file has fewer lines than the id file
    /// (previously this crashed with a NullReferenceException).
    /// </exception>
    public static void AppendPredFile(string idfile, string predfile, string outfile)
    {
        using (StreamReader rd01 = new StreamReader(idfile))
        using (StreamReader rd02 = new StreamReader(predfile))
        using (StreamWriter wt = new StreamWriter(outfile))
        {
            string content = null;
            while ((content = rd01.ReadLine()) != null)
            {
                string pred = rd02.ReadLine();
                if (pred == null)
                {
                    // Fail loudly instead of dereferencing null: the two files
                    // are expected to be aligned line by line.
                    throw new InvalidOperationException("Prediction file has fewer lines than id file.");
                }
                wt.Write("{0},{1}\n", content, pred.Split(' ')[1]);
            }
        }
    }
}
26 | }
27 |
--------------------------------------------------------------------------------
/auto-pipeline/model/Item.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 |
7 | namespace RecSys17.model
8 | {
class Item
{
    public string id;
    // Distinct title tokens.
    public HashSet<string> title;
    // Number of title tokens, duplicates included.
    public int title_cnt;
    // Title token -> normalized within-title frequency (each occurrence adds 1/title_cnt).
    public Dictionary<string, float> title2cnt;
    public string clevel;       // career level code
    public string indus;        // industry code
    public string disc;         // discipline code
    public string country;
    public string region;
    public string is_paid;      // "1" when the posting is paid
    public string employment;   // employment type code
    public HashSet<string> tags;
    public DateTime create_at;

    public Item() { }

    /// <summary>
    /// Parses one tab-separated line of items.csv. Column layout as consumed here:
    /// 0 id, 1 comma-separated title tokens, 2 clevel, 3 disc, 4 indus, 5 country,
    /// 6 is_paid, 7 region, 10 employment, 11 comma-separated tags, 12 created_at
    /// (either a "yyyy-MM-dd..." date string or a numeric timestamp).
    /// </summary>
    public Item(string line)
    {
        string[] words = line.Split('\t');

        id = words[0];
        title = new HashSet<string>();
        title2cnt = new Dictionary<string, float>();
        title_cnt = 0;
        var tokens = words[1].Split(',');
        title_cnt = tokens.Length;
        foreach (var token in tokens)
        {
            title.Add(token);
            if (!title2cnt.ContainsKey(token))
            {
                title2cnt.Add(token, 1.0f / title_cnt);
            }
            else
            {
                title2cnt[token] += 1.0f / title_cnt;
            }
        }
        clevel = words[2];
        disc = words[3];
        indus = words[4];
        country = words[5];
        is_paid = words[6];
        region = words[7];
        employment = words[10];
        tags = new HashSet<string>();
        foreach (var token in words[11].Split(','))
        {
            // if (token != "000")
            {
                tags.Add(token);
            }
        }
        if (!string.IsNullOrEmpty(words[12]) && words[12] != "null")
        {
            if (words[12].Contains("-"))
            {
                // A dash means it is already a date string.
                create_at = DateTime.Parse(words[12]);
            }
            else
            {
                // Otherwise a numeric (epoch-style) timestamp.
                create_at = Tools.Common.ParseTime(double.Parse(words[12]));
            }
        }
        else
        {
            // Fallback creation date for items without a timestamp.
            create_at = DateTime.Parse("2017-01-01");
        }
    }
}
80 | }
81 |
--------------------------------------------------------------------------------
/auto-pipeline/model/ItemProfile.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
class ItemProfile
{
    /// <summary>
    /// Builds two LibSVM-format training files from offline item popularity stats:
    /// a "like" classifier (label 1 when bookmark or reply ratio exceeds 0.03) and
    /// a "hate" classifier (label 1 when delete ratio exceeds 0.1). Features are
    /// the item's sorted title-word ids, a one-hot career level, a one-hot
    /// employment type, and the tag count. Items whose ratios fall between the
    /// positive and negative thresholds are skipped as ambiguous.
    /// </summary>
    public static void BuildFeatureFile()
    {
        string label_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\itemprofile\offline_item_popularity.csv";
        string outfile_like = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\itemprofile\features\offline_training_like.csv";
        string outfile_hate = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\itemprofile\features\offline_training_hate.csv";

        Dictionary<string, Item> itemdict = FeatureFactory.BuildItemDict();

        // NOTE(review): loaded but not used below; kept to preserve existing behavior.
        Dictionary<string, int> item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1);

        string keymapfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\word_id_mapping.csv";

        Dictionary<string, int> keymapper = DocumentClustering.LoadKeymapfile(keymapfile);

        using (StreamReader rd = new StreamReader(label_file))
        using (StreamWriter wt_like = new StreamWriter(outfile_like))
        using (StreamWriter wt_hate = new StreamWriter(outfile_hate))
        {
            string content = null;
            while ((content = rd.ReadLine()) != null)
            {
                string[] words = content.Split(',');
                // Only known items with a positive impression count (column 1).
                if (itemdict.ContainsKey(words[0]) && float.Parse(words[1]) > 0)
                {
                    Item item = itemdict[words[0]];
                    float book_ratio = float.Parse(words[3]);
                    float reply_ratio = float.Parse(words[4]);
                    float delete_ratio = float.Parse(words[5]);

                    StringBuilder featureline = new StringBuilder();

                    // Title-word one-hot features, sorted by keyword id.
                    List<int> titles = new List<int>();
                    foreach (var title in item.title)
                    {
                        int idx;
                        if (keymapper.TryGetValue(title, out idx))
                        {
                            titles.Add(idx);
                        }
                    }
                    titles.Sort();
                    foreach (var idx in titles)
                    {
                        featureline.Append(" ").Append(idx).Append(":1");
                    }

                    // One-hot career level: "0".."7" -> slots (1..8) + keymapper.Count.
                    // (Replaces the original eight-branch if/else chain.)
                    int clevel;
                    if (int.TryParse(item.clevel, out clevel) && clevel >= 0 && clevel <= 7)
                    {
                        featureline.Append(" ").Append(clevel + 1 + keymapper.Count).Append(":1");
                    }

                    // One-hot employment type: "0".."5" -> slots (9..14) + keymapper.Count.
                    int employment;
                    if (int.TryParse(item.employment, out employment) && employment >= 0 && employment <= 5)
                    {
                        featureline.Append(" ").Append(employment + 9 + keymapper.Count).Append(":1");
                    }

                    // Tag count as a numeric feature.
                    featureline.Append(" ").Append(15 + keymapper.Count).Append(":").Append(item.tags.Count);

                    string features = featureline.ToString();

                    // "hate" sample: clear positives (>0.1) or clear negatives (<0.04) only.
                    if (delete_ratio > 0.1 || delete_ratio < 0.04)
                    {
                        int label = delete_ratio > 0.1 ? 1 : 0;
                        wt_hate.Write(label);
                        wt_hate.WriteLine(features);
                    }

                    // "like" sample: clear positives or clear negatives only.
                    if (book_ratio > 0.03 || reply_ratio > 0.03 || (book_ratio < 0.02 && reply_ratio < 0.02))
                    {
                        int label = book_ratio > 0.03 || reply_ratio > 0.03 ? 1 : 0;
                        wt_like.Write(label);
                        wt_like.WriteLine(features);
                    }
                }
            }



        }
    }
}
137 | }
138 |
--------------------------------------------------------------------------------
/auto-pipeline/model/KNN.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
class KNN
{
    /// <summary>
    /// For every target item, scores every target user by the tf-idf-style overlap
    /// between the user's title words and the item's title words, and writes the
    /// top-100 users per item as a submission line "itemId&lt;TAB&gt;uid1,uid2,...".
    /// </summary>
    public static void PredictByUserDocsim()
    {
        int topk = 100;
        string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_user_item_docsim.csv";


        Dictionary<string, Item> itemdict = FeatureFactory.BuildItemDict();
        Dictionary<string, User> userdict = FeatureFactory.BuildUserDict();

        Dictionary<string, int> user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1);
        Dictionary<string, int> item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1);


        List<string> target_users = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetUsers.csv");
        List<string> target_items = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetItems.csv");

        using (StreamWriter wt = new StreamWriter(outfile))
        {
            int cnt = 0;
            foreach (var iid in target_items)
            {
                // NOTE(review): assumes every target item exists in itemdict;
                // a missing id would throw KeyNotFoundException below — confirm.
                List<Tuple<string, double>> user2score = new List<Tuple<string, double>>();
                foreach (var uid in target_users)
                {
                    if (userdict.ContainsKey(uid))
                    {
                        double score = GetUserScore(userdict[uid], itemdict[iid], user_titlefreq, item_titlefreq);
                        if (score > 0)
                        {
                            user2score.Add(new Tuple<string, double>(uid, score));
                        }
                    }
                }
                Console.WriteLine("{0}\tnum of candi:\t{1}", cnt++, user2score.Count);
                if (user2score.Count > 0)
                {
                    user2score.Sort((a, b) => b.Item2.CompareTo(a.Item2));
                    int k = Math.Min(topk, user2score.Count);
                    wt.Write("{0}\t", iid);
                    for (int i = 0; i < k - 1; i++)
                    {
                        wt.Write("{0},", user2score[i].Item1);
                    }
                    wt.Write("{0}\n", user2score[k - 1].Item1);
                }
            }
        }
    }

    /// <summary>
    /// tf-idf-style similarity between a user's and an item's title words:
    /// sum over shared words of sqrt(tf_user * tf_item) * log10(1e6 / df_user),
    /// restricted to words with document frequency >= 20 on both sides.
    /// </summary>
    private static double GetUserScore(User user, Item item, Dictionary<string, int> user_titlefreq, Dictionary<string, int> item_titlefreq)
    {
        double doc_sim = 0;
        foreach (var word in user.title)
        {
            if (!string.IsNullOrEmpty(word) && user_titlefreq.ContainsKey(word) && user_titlefreq[word] >= 20 && item_titlefreq.ContainsKey(word) && item_titlefreq[word] >= 20)
            {
                if (item.title2cnt.ContainsKey(word))
                {
                    doc_sim += Math.Sqrt(user.title2cnt[word] * item.title2cnt[word]) * Math.Log10(1000000.0 / user_titlefreq[word]);
                }
            }
        }
        return doc_sim;
    }

    /// <summary>
    /// History-based KNN: for every target item, scores target users by the tag
    /// overlap between the item and the items the user interacted with positively,
    /// then writes the top-100 users per item.
    /// </summary>
    public static void PredictByViewDocsim()
    {
        int topk = 100;
        string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_tag.csv";
        string trainfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\interactions_grouped.csv";

        Dictionary<string, List<string>> user2interest_items = BuildUserInterestedItems(trainfile);

        Dictionary<string, Item> itemdict = FeatureFactory.BuildItemDict();
        // Dictionary<string, User> userdict = FeatureFactory.BuildUserDict();

        Dictionary<string, int> user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1);
        Dictionary<string, int> item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1);


        List<string> target_users = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetUsers.csv");
        List<string> target_items = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetItems.csv");

        using (StreamWriter wt = new StreamWriter(outfile))
        {
            int cnt = 0;
            foreach (var iid in target_items)
            {
                List<Tuple<string, double>> user2score = new List<Tuple<string, double>>();
                foreach (var uid in target_users)
                {
                    if (user2interest_items.ContainsKey(uid))
                    {
                        //double score = GetUserScore(iid, user2interest_items[uid], itemdict, item_titlefreq);
                        double score = GetUserScore_Tag(iid, user2interest_items[uid], itemdict, item_titlefreq);

                        if (score > 0)
                        {
                            user2score.Add(new Tuple<string, double>(uid, score));
                        }
                    }
                }
                Console.WriteLine("{0}\tnum of candi:\t{1}", cnt++, user2score.Count);
                if (user2score.Count > 0)
                {
                    user2score.Sort((a, b) => b.Item2.CompareTo(a.Item2));
                    int k = Math.Min(topk, user2score.Count);
                    wt.Write("{0}\t", iid);
                    for (int i = 0; i < k - 1; i++)
                    {
                        wt.Write("{0},", user2score[i].Item1);
                    }
                    wt.Write("{0}\n", user2score[k - 1].Item1);
                }
            }
        }
    }

    /// <summary>
    /// Title-word tf-idf similarity between item <paramref name="iid"/> and each
    /// item in the user's history (the item itself excluded), summed.
    /// NOTE(review): accesses itemdict[iid] without a guard — assumes iid is known.
    /// </summary>
    public static double GetUserScore(string iid, List<string> history, Dictionary<string, Item> itemdict, Dictionary<string, int> item_titlefreq)
    {
        double score = 0;
        foreach (var tid in history)
        {
            if (itemdict.ContainsKey(tid) && tid != iid)
            {
                foreach (var word in itemdict[iid].title)
                {
                    if (!string.IsNullOrEmpty(word) && item_titlefreq.ContainsKey(word) && item_titlefreq[word] >= 20)
                    {
                        if (itemdict[tid].title2cnt.ContainsKey(word))
                        {
                            score += Math.Sqrt(itemdict[tid].title2cnt[word] * itemdict[iid].title2cnt[word]) * Math.Log10(1000000.0 / item_titlefreq[word]);
                        }
                    }
                }
            }
        }
        return score;
    }

    /// <summary>
    /// Average tag overlap between item <paramref name="iid"/> and the user's
    /// history items (count of shared tags, normalized by history length).
    /// The <paramref name="item_titlefreq"/> parameter is unused but kept for
    /// signature compatibility with GetUserScore.
    /// </summary>
    public static double GetUserScore_Tag(string iid, List<string> history, Dictionary<string, Item> itemdict, Dictionary<string, int> item_titlefreq)
    {
        double score = 0;
        foreach (var tid in history)
        {
            if (itemdict.ContainsKey(tid))
            {
                foreach (var word in itemdict[iid].tags)
                {
                    if (!string.IsNullOrEmpty(word))
                    {
                        if (itemdict[tid].tags.Contains(word))
                        {
                            score += 1;
                        }
                    }
                }
            }
        }
        if (history.Count > 0)
        {
            score /= history.Count;
        }
        return score;
    }

    /// <summary>
    /// Builds user id -> list of item ids the user interacted with positively
    /// (status not "0" and not "4") from a tab-separated interaction file.
    /// </summary>
    public static Dictionary<string, List<string>> BuildUserInterestedItems(string file)
    {
        Console.WriteLine("BuildUserInterestedItems...");
        Dictionary<string, List<string>> res = new Dictionary<string, List<string>>();
        using (StreamReader rd = new StreamReader(file))
        {
            string content = null;
            while ((content = rd.ReadLine()) != null)
            {
                string[] words = content.Split('\t');
                if (words[2] != "0" && words[2] != "4")
                {
                    if (!res.ContainsKey(words[0]))
                    {
                        res.Add(words[0], new List<string>());
                    }
                    res[words[0]].Add(words[1]);
                }
            }
        }
        Console.WriteLine("BuildUserInterestedItems finished.");
        return res;
    }

    /// <summary>
    /// For every target item, finds the most similar already-clicked items and
    /// recommends the users who clicked them, until topk candidates are gathered.
    /// Also logs the best-matching item per target item.
    /// </summary>
    public static void PredictFromClosestJobs()
    {
        int topk = 5;
        string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_closest_jobs.csv";

        string logfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_closest_jobs_logs.csv";
        string logfile_bestscore = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\knn_closest_jobs_logs_bestscore.csv";

        string interaction_file = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\interactions_grouped.csv";

        Dictionary<string, Item> itemdict = FeatureFactory.BuildItemDict();
        //Dictionary<string, User> userdict = FeatureFactory.BuildUserDict();

        // Dictionary<string, int> user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1);
        Dictionary<string, int> item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1);


        HashSet<string> target_users = new HashSet<string>(FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetUsers.csv"));
        List<string> target_items = FeatureFactory.LoadListFromFile(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\targetItems.csv");

        Dictionary<string, HashSet<string>> item2clicked_users = LoadItem2PosUsers(interaction_file, target_users);

        using (StreamWriter wt = new StreamWriter(outfile))
        using (StreamWriter wt_log = new StreamWriter(logfile))
        using (StreamWriter wt_log02 = new StreamWriter(logfile_bestscore))
        {
            int cnt = 0;
            foreach (var iid in target_items)
            {
                if (cnt++ % 100 == 0)
                {
                    Console.Write("writing {0}\r", cnt);
                }
                HashSet<string> candidates = new HashSet<string>();

                // Rank all clicked items by similarity to the target item.
                List<Tuple<string, double>> item2sim = new List<Tuple<string, double>>();
                foreach (var ciid in item2clicked_users.Keys)
                {
                    item2sim.Add(new Tuple<string, double>(ciid, GetItemSim(iid, ciid, item_titlefreq, itemdict)));
                }

                item2sim.Sort((a, b) => b.Item2.CompareTo(a.Item2));

                //foreach (var citem in item2sim)
                //{
                //    wt_log.WriteLine("{0},{1},{2}", iid, citem.Item1, citem.Item2);
                //}
                // NOTE(review): assumes item2clicked_users is non-empty, so item2sim[0] exists.
                wt_log02.WriteLine("{0},{1},{2}", iid, item2sim[0].Item1, item2sim[0].Item2);

                // Harvest users of the most similar items until topk candidates found.
                foreach (var tuple in item2sim)
                {
                    foreach (var user in item2clicked_users[tuple.Item1])
                    {
                        if (!candidates.Contains(user) && tuple.Item2 > 0)
                        {
                            candidates.Add(user);
                        }
                    }

                    if (candidates.Count >= topk)
                    {
                        break;
                    }
                }


                if (candidates.Count > 0)
                {
                    var candi_list = candidates.ToList();
                    string out_line = iid + "\t";

                    for (int i = 0; i < candi_list.Count && i < topk; i++)
                    {
                        out_line += candi_list[i] + ",";
                    }
                    wt.WriteLine(out_line.Substring(0, out_line.Length - 1));   // drop trailing comma
                }
            }
        }
    }

    /// <summary>
    /// Title-word tf-idf similarity between two items. Returns 0 when either item
    /// is unknown or when industry, discipline or country differ.
    /// </summary>
    private static double GetItemSim(string iid, string ciid, Dictionary<string, int> item_titlefreq, Dictionary<string, Item> itemdict)
    {
        if (!itemdict.ContainsKey(iid) || !itemdict.ContainsKey(ciid))
        {
            return 0;
        }

        Item info_iid = itemdict[iid];
        Item info_ciid = itemdict[ciid];

        // Hard filter: only compare items within the same industry/discipline/country.
        if (info_ciid.indus != info_iid.indus || info_ciid.disc != info_iid.disc || info_ciid.country != info_iid.country)
        {
            return 0;
        }

        double res = 0;
        foreach (var word in info_iid.title2cnt.Keys)
        {
            if (item_titlefreq.ContainsKey(word) && info_ciid.title2cnt.ContainsKey(word))
            {
                res += Math.Log10(1000000.0 / item_titlefreq[word]) * info_iid.title2cnt[word] * info_ciid.title2cnt[word];
            }
        }

        return res;
    }



    /// <summary>
    /// Builds item id -> set of target users who interacted with it positively
    /// (status not "0" and not "4") from a tab-separated interaction file.
    /// </summary>
    private static Dictionary<string, HashSet<string>> LoadItem2PosUsers(string interaction_file, HashSet<string> target_users)
    {
        Dictionary<string, HashSet<string>> res = new Dictionary<string, HashSet<string>>();
        using (StreamReader rd = new StreamReader(interaction_file))
        {
            string content = null;
            while ((content = rd.ReadLine()) != null)
            {
                string[] words = content.Split('\t');
                if (words[2] != "0" && words[2] != "4" && target_users.Contains(words[0]))
                {
                    if (!res.ContainsKey(words[1]))
                    {
                        res.Add(words[1], new HashSet<string>());
                    }
                    if (!res[words[1]].Contains(words[0]))
                    {
                        res[words[1]].Add(words[0]);
                    }
                }
            }
        }
        return res;
    }

}
338 | }
339 |
--------------------------------------------------------------------------------
/auto-pipeline/model/KeywordMgr.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
class KeywordMgr
{
    // keyword -> rank (0 = highest score) built from the user-item keyword stats.
    public Dictionary<string, int> useritem_index;
    // keyword -> rank built from the item-title keyword stats.
    public Dictionary<string, int> itemitem_title_index;
    // keyword -> rank built from the item-tag keyword stats.
    public Dictionary<string, int> itemitem_tag_index;

    /// <summary>
    /// Loads the three precomputed keyword statistic files and builds a
    /// descending-score rank index for each.
    /// </summary>
    public KeywordMgr()
    {
        useritem_index = BuildIndex(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_useritem.csv");
        itemitem_title_index = BuildIndex(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtitle.csv");
        itemitem_tag_index = BuildIndex(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\keywords\keywords_itemtag.csv");
    }


    /// <summary>
    /// Reads CSV lines (keyword at column 0, count at column 1, score at column 3),
    /// keeps keywords whose count is at least 100, sorts them by descending score
    /// and returns keyword -> 0-based rank.
    /// </summary>
    private Dictionary<string, int> BuildIndex(string file)
    {
        int thre = 100;   // minimum support for a keyword to be indexed
        List<Tuple<string, double>> index = new List<Tuple<string, double>>();
        using (StreamReader rd = new StreamReader(file))
        {
            string content = null;
            while ((content = rd.ReadLine()) != null)
            {
                string[] words = content.Split(',');
                if (int.Parse(words[1]) >= thre)
                {
                    index.Add(new Tuple<string, double>(words[0], double.Parse(words[3])));
                }
            }
        }
        index.Sort((a, b) => b.Item2.CompareTo(a.Item2));

        Dictionary<string, int> res = new Dictionary<string, int>();
        for (int i = 0; i < index.Count; i++)
        {
            res.Add(index[i].Item1, i);
        }
        return res;
    }

}
51 | }
52 |
--------------------------------------------------------------------------------
/auto-pipeline/model/SubmissionHelper.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace RecSys17.model
9 | {
10 | class SubmissionHelper
11 | {
/// <summary>
/// Merges two submission files item by item. Items present in only one file are
/// copied through unchanged; items present in both get their ranked user lists
/// interleaved by MergeTwoList (first <paramref name="start"/> entries from file 1,
/// then mixed with block size <paramref name="gap"/>, capped at <paramref name="k"/>).
/// </summary>
public static void Ensemble(string infile01, string infile02, string outfile, int gap, int k, int start)
{
    Dictionary<string, List<string>> iid2rec01 = LoadSubFile(infile01);
    Dictionary<string, List<string>> iid2rec02 = LoadSubFile(infile02);

    using (StreamWriter wt = new StreamWriter(outfile))
    {
        foreach (var pair in iid2rec01)
        {
            if (!iid2rec02.ContainsKey(pair.Key))
            {
                // Item only in file 1: copy as-is.
                wt.Write("{0}\t", pair.Key);
                wt.Write("{0}\n", string.Join(",", pair.Value.ToArray()));
            }
            else
            {
                List<string> merge_list = MergeTwoList(pair.Value, iid2rec02[pair.Key], gap, k, start);
                wt.Write("{0}\t", pair.Key);
                wt.Write("{0}\n", string.Join(",", merge_list.ToArray()));
            }
        }

        // Items only in file 2: copy as-is.
        foreach (var pair in iid2rec02)
        {
            if (!iid2rec01.ContainsKey(pair.Key))
            {
                wt.Write("{0}\t", pair.Key);
                wt.Write("{0}\n", string.Join(",", pair.Value.ToArray()));
            }
        }
    }
}
44 |
// Interleaves two ranked recommendation lists: the first `start` entries come from
// list1, then entries from both lists are mixed in blocks controlled by `gap`
// (de-duplicating via `visited`), and the result is capped at `max_k` entries.
// NOTE(review): the middle of this method appears truncated/garbled in this copy
// of the file — verify the interleaving loop against the original source before
// relying on the details below.
private static List MergeTwoList(List list1, List list2, int gap, int max_k, int start)
{
    HashSet visited = new HashSet();
    List res = new List();
    int t01 = 0, t02 = 0;   // cursors into list1 / list2
    int cnt01 = list1.Count, cnt02 = list2.Count;

    // Seed the result with the first `start` items of list1.
    while (t01 < cnt01 && t01 < start)
    {
        res.Add(list1[t01]);
        visited.Add(list1[t01]);
        t01++;
    }

    // Interleave the remainder of the two lists.
    while (t01 < cnt01 && t02 < cnt02)
    {
        for (int i = 0; i < gap && i+t01 max_k)
        {
            // Truncate to the requested maximum length.
            res = res.GetRange(0, max_k);
        }

        return res;
    }
104 |
/// <summary>
/// Parses a submission file ("itemId&lt;TAB or space&gt;uid1,uid2,...") into
/// item id -> ranked user list. Lines with fewer than two fields are skipped.
/// NOTE(review): assumes each item id occurs at most once per file; a duplicate
/// would make Dictionary.Add throw.
/// </summary>
private static Dictionary<string, List<string>> LoadSubFile(string infile)
{
    Dictionary<string, List<string>> res = new Dictionary<string, List<string>>();

    using (StreamReader rd = new StreamReader(infile))
    {
        string content = null;
        while ((content = rd.ReadLine()) != null)
        {
            string[] words = content.Split(new char[] { ' ', '\t' });
            if (words.Length < 2)
            {
                continue;
            }

            string[] tokens = words[1].Split(',');

            res.Add(words[0], new List<string>());
            for (int i = 0; i < tokens.Length; i++)
            {
                res[words[0]].Add(tokens[i]);
            }
        }
    }

    return res;
}
132 |
/// <summary>
/// Converts a TLC score file into a submission file, taking the (user, item) ids
/// from a line-aligned, comma-separated reference file (uid at column 2, iid at
/// column 3). Writes the top-100 users per item by descending score.
/// NOTE(review): the score file's header is skipped but the reference file's
/// first line is not — confirm reffile has no header, or lines shift by one.
/// </summary>
public static void GenSubFileFromTLCWithAlignment(string infile, string reffile, string outfile)
{
    int topk = 100;
    // NOTE(review): loaded but not used below; kept to preserve existing behavior.
    var userdict = FeatureFactory.BuildUserDict();
    var itemdict = FeatureFactory.BuildItemDict();

    Dictionary<string, List<Tuple<string, double>>> item2userscore = new Dictionary<string, List<Tuple<string, double>>>();
    int cnt = 0;
    using (StreamReader rd01 = new StreamReader(infile))
    using (StreamReader rd02 = new StreamReader(reffile))
    {
        string content = rd01.ReadLine();   // skip TLC header
        while ((content = rd01.ReadLine()) != null)
        {
            if (cnt++ % 100000 == 0)
            {
                Console.Write(cnt + "\r");
            }
            string[] words = content.Split('\t');
            double score = double.Parse(words[3]);

            string[] tokens = rd02.ReadLine().Split(',');
            string uid = tokens[2];
            string iid = tokens[3];

            if (!item2userscore.ContainsKey(iid))
            {
                item2userscore.Add(iid, new List<Tuple<string, double>>());
            }

            item2userscore[iid].Add(new Tuple<string, double>(uid, score));
        }
    }

    cnt = 0;
    using (StreamWriter wt = new StreamWriter(outfile))
    {
        foreach (var iid in item2userscore.Keys)
        {
            if (cnt++ % 1000 == 0)
            {
                Console.WriteLine("Item {0}", cnt);
            }
            var list = item2userscore[iid];
            list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
            int k = Math.Min(topk, list.Count);
            wt.Write("{0}\t", iid);
            for (int i = 0; i < k - 1; i++)
            {
                wt.Write("{0},", list[i].Item1);
            }
            if (k > 0)
                wt.Write("{0}\n", list[k - 1].Item1);
            else
                wt.Write("\n");
        }
    }

}
196 |
/// <summary>
/// Converts a TLC prediction file into a challenge submission file:
/// for every item, the top-100 users whose score exceeds <paramref name="thre"/>.
/// NOTE(review): generic type arguments were stripped by extraction and have
/// been reconstructed here from usage.
/// </summary>
/// <param name="infile">Prediction file, one scored pair per line.</param>
/// <param name="outfile">Output submission file: "itemId\tuid1,uid2,...".</param>
/// <param name="name_idx">Either one column index holding "uid|iid", or one index per id column (user first, item second).</param>
/// <param name="value_idx">Column index of the score (ignored when get_score is given).</param>
/// <param name="thre">Minimum score for a pair to be kept.</param>
/// <param name="spliter">Column separator.</param>
/// <param name="hasHeader">Skip the first line when true.</param>
/// <param name="userdict">Optional pre-built user dictionary; loaded on demand when null.</param>
/// <param name="itemdict">Optional pre-built item dictionary; loaded on demand when null.</param>
/// <param name="get_score">Optional custom score extractor over the split columns.</param>
public static void GenSubFileFromTLC(string infile, string outfile, int[] name_idx, int value_idx, double thre = 0.1, char spliter = '\t', bool hasHeader = true,
    Dictionary<string, User> userdict = null, Dictionary<string, Item> itemdict = null, Func<string[], double> get_score = null)
{
    int topk = 100;
    if (userdict == null)
        userdict = FeatureFactory.BuildUserDict();
    if (itemdict == null)
        itemdict = FeatureFactory.BuildItemDict();

    var item2userscore = new Dictionary<string, List<Tuple<string, double>>>();
    int cnt = 0;
    using (StreamReader rd = new StreamReader(infile))
    {
        string content = null;
        if (hasHeader)
            rd.ReadLine();
        while ((content = rd.ReadLine()) != null)
        {
            if (cnt++ % 100000 == 0)
            {
                Console.Write(cnt + "\r");
            }
            string[] words = content.Split(spliter);

            // Resolve the (user, item) pair: either packed "uid|iid" in one
            // column, or spread across name_idx.Length columns.
            string[] tokens;
            if (name_idx.Length == 1)
            {
                tokens = words[name_idx[0]].Split('|');
            }
            else
            {
                tokens = new string[name_idx.Length];
                for (int t = 0; t < name_idx.Length; t++)
                {
                    tokens[t] = words[name_idx[t]];
                }
            }
            // Fix: the original always parsed words[value_idx] and then threw
            // the value away when get_score was supplied — which crashed when
            // that column was not numeric. Parse only when needed.
            double score = get_score != null ? get_score(words) : double.Parse(words[value_idx]);

            if (!item2userscore.ContainsKey(tokens[1]))
            {
                item2userscore.Add(tokens[1], new List<Tuple<string, double>>());
            }

            if (score > thre)
            {
                item2userscore[tokens[1]].Add(Tuple.Create(tokens[0], score));
            }
        }
    }

    cnt = 0;
    using (StreamWriter wt = new StreamWriter(outfile))
    {
        foreach (var iid in item2userscore.Keys)
        {
            if (cnt++ % 1000 == 0)
            {
                Console.WriteLine("Item {0}", cnt);
            }
            var list = item2userscore[iid];
            // Sort descending by score, keep at most topk users.
            list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
            int k = Math.Min(topk, list.Count);
            wt.Write("{0}\t", iid);
            for (int i = 0; i < k - 1; i++)
            {
                wt.Write("{0},", list[i].Item1);
            }
            if (k > 0)
                wt.Write("{0}\n", list[k - 1].Item1);
            else
                wt.Write("\n");
        }
    }
}
279 | }
280 | }
281 |
--------------------------------------------------------------------------------
/auto-pipeline/model/User.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 |
namespace RecSys17.model
{
    /// <summary>
    /// A user profile parsed from one tab-separated line of the users file,
    /// plus an incrementally-built view history (see AddViewItem).
    /// NOTE(review): generic type arguments were stripped by extraction and
    /// have been reconstructed here from usage.
    /// </summary>
    class User
    {
        public string id;
        public HashSet<string> title;                 // distinct job-title tokens
        public Dictionary<string, float> title2cnt;   // token -> normalized frequency (sums to ~1)
        public int title_cnt;                         // number of title tokens, duplicates included
        public string clevel;
        public string indus;
        public string disc;
        public string country;
        public string region;
        public string experience_n_entries_class;
        public string experience_years_experience;
        public string experience_years_in_current;
        public string edu_degree;
        public HashSet<string> edu_fieldofstudies;
        public string wtcj;
        public string premium;

        public List<Tuple<string, int>> interactions;           // (item id, action type); null until first AddViewItem
        public Dictionary<string, int> viewed_item_title_words; // title word -> total count over viewed items
        public double viewed_titem_title_cnt;                   // total title-word count over viewed items

        public User() { }

        /// <summary>Parses a 14-column tab-separated user line (id, titles, 12 attributes).</summary>
        public User(string line)
        {
            string[] words = line.Split('\t');

            id = words[0];
            title = new HashSet<string>();
            title2cnt = new Dictionary<string, float>();
            var tokens = words[1].Split(',');
            title_cnt = tokens.Length;
            foreach (var token in tokens)
            {
                title.Add(token);
                if (!title2cnt.ContainsKey(token))
                {
                    title2cnt.Add(token, 1.0f / title_cnt);
                }
                else
                {
                    title2cnt[token] += 1.0f / title_cnt;
                }
            }
            clevel = words[2];
            disc = words[3];
            indus = words[4];
            country = words[5];
            region = words[6];
            experience_n_entries_class = words[7];
            experience_years_experience = words[8];
            experience_years_in_current = words[9];
            edu_degree = words[10];
            edu_fieldofstudies = new HashSet<string>();
            foreach (var token in words[11].Split(','))
            {
                // "000" used to be filtered out here; currently every token is kept.
                edu_fieldofstudies.Add(token);
            }
            wtcj = words[12];
            premium = words[13];

            viewed_titem_title_cnt = 0;
            interactions = null;
            viewed_item_title_words = null;
        }

        /// <summary>
        /// Records an interaction with <paramref name="it"/> and folds the
        /// item's title words into this user's viewed-word counters.
        /// </summary>
        public void AddViewItem(Item it, int action)
        {
            if (interactions == null)
            {
                interactions = new List<Tuple<string, int>>();
                viewed_item_title_words = new Dictionary<string, int>();
            }

            foreach (var pair in it.title2cnt)
            {
                // title2cnt stores normalized frequencies; recover the raw count.
                int tcnt = (int)Math.Round(pair.Value * it.title_cnt);
                viewed_titem_title_cnt += tcnt;
                if (!viewed_item_title_words.ContainsKey(pair.Key))
                {
                    viewed_item_title_words.Add(pair.Key, tcnt);
                }
                else
                {
                    viewed_item_title_words[pair.Key] += tcnt;
                }
            }

            interactions.Add(Tuple.Create(it.id, action));
        }
    }
}
107 |
--------------------------------------------------------------------------------
/auto-pipeline/model/WordHashing.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
namespace RecSys17.model
{
    /// <summary>
    /// Builds a word -> topic-id mapping by randomized label propagation over
    /// item-title co-occurrence, then writes "word,topic" pairs to disk.
    /// NOTE(review): generic type arguments were stripped by extraction and
    /// have been reconstructed here from usage.
    /// </summary>
    class WordHashing
    {
        public static void BuildWordHashing()
        {
            // NOTE(review): userdict is loaded but never used below — confirm
            // whether the load can be dropped.
            Dictionary<string, User> userdict = FeatureFactory.BuildUserDict();
            Dictionary<string, Item> itemdict = FeatureFactory.BuildItemDict();

            // word -> document frequency, from precomputed stats (col 0 = word, col 1 = count)
            var user_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\user_title_stat.csv", 0, 1);
            var item_titlefreq = Utils.LoadDict(@"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\item_title_stat.csv", 0, 1);

            string outfile = @"\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\stat\clustering\word_hashing.csv";

            Random rng = new Random();
            int topic_cnt = 200;

            // Seed: every sufficiently frequent item-title word gets a random topic.
            Dictionary<string, int> word2idx = new Dictionary<string, int>();
            foreach (var pair in item_titlefreq)
            {
                if (pair.Value >= 20)
                {
                    if (!word2idx.ContainsKey(pair.Key))
                    {
                        word2idx.Add(pair.Key, rng.Next(topic_cnt));
                    }
                }
            }

            // Label propagation: words co-occurring in an item title are pulled
            // toward a common topic, with a 20% random restart for exploration.
            int ite_cnt = 50;
            for (int ite = 0; ite < ite_cnt; ite++)
            {
                Console.Write("ite : {0}\r", ite);
                foreach (var item in itemdict.Values)
                {
                    if (item.title.Count > 1)
                    {
                        List<string> words = new List<string>();
                        foreach (var title in item.title)
                        {
                            if (word2idx.ContainsKey(title))
                            {
                                words.Add(title);
                            }
                        }
                        if (words.Count > 1)
                        {
                            List<int> cur_topics = new List<int>();
                            foreach (var word in words)
                            {
                                cur_topics.Add(word2idx[word]);
                            }
                            // Pick one co-occurring word's topic as the consensus.
                            int new_topic = cur_topics[rng.Next(words.Count)];
                            foreach (var word in words)
                            {
                                if (rng.NextDouble() < 0.8)
                                    word2idx[word] = new_topic;
                                else
                                {
                                    word2idx[word] = rng.Next(topic_cnt);
                                }
                            }
                        }
                    }
                }
            }

            using (StreamWriter wt = new StreamWriter(outfile))
            {
                foreach (var pair in word2idx)
                {
                    wt.WriteLine("{0},{1}", pair.Key, pair.Value);
                }
            }
        }
    }
}
84 |
--------------------------------------------------------------------------------
/auto-pipeline/py-pull_and_submit/daily-pull-data.py:
--------------------------------------------------------------------------------
1 | '''
2 | Online example
3 |
4 | Uses the offline mode to make predictions
5 | for the online challenge.
6 |
7 | by Daniel Kohlsdorf
8 | '''
9 | import urllib.request
10 | import time
11 | import sys
12 | import json
13 | from dateutil.parser import parse
14 | import datetime
15 | import parser
16 | #from recommendation_worker import *
17 |
18 | TMP_ITEMS = "data/current_items.csv"
19 | TMP_SOLUTION = "data/current_solution.csv"
20 |
21 | MODEL = "data/recsys2017.model" # Model from offline training
22 | USERS_FILE = "data/users.csv" # Online user data
23 |
24 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data'
25 | PULL_DATA_LOG_FILE = PULL_DATA_PATH + '\\pull-dates.txt'
26 |
27 | cur_date_flag = ''
28 |
29 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key
30 | SERVER = "https://recsys.xing.com"
31 |
def header(token):
    # NOTE(review): the `token` argument is ignored; the module-level TOKEN is used.
    return {"Authorization": "Bearer %s" % TOKEN}

def post_url(server):
    """Endpoint for online submissions."""
    return "%s/api/online/submission" % server

def status_url(server):
    """Endpoint reporting the daily data status."""
    return "%s/api/online/data/status" % server

def users_url(server):
    """Endpoint for the target-users download."""
    return "%s/api/online/data/users" % server

def items_url(server):
    """Endpoint for the target-items download."""
    return "%s/api/online/data/items" % server

def offline_submission(server):
    """Endpoint for offline submissions."""
    return "%s/api/submission" % server
49 |
def get_stats():
    """Return the server's 'current.updated_at' timestamp as a datetime.

    Queries the online status endpoint with the bearer token; `parse` is
    dateutil's date parser (imported at module level).
    """
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    response = json.loads(content)

    return parse(response['current']['updated_at'])

def is_ready():
    """True when the server reports a data date we have not pulled yet.

    Side effect: stores the server date string in the global cur_date_flag,
    which the download_* functions use to name their output files.
    """
    global cur_date_flag
    existing_dates = set()
    # pull-dates.txt lists one already-processed date per line
    with open(PULL_DATA_LOG_FILE,'r') as rd:
        for date_str in rd.readlines():
            existing_dates.add(date_str.rstrip())
    status_date = get_stats().date()
    cur_date_flag = str(status_date)
    print('get_stats().date() = ' + cur_date_flag)
    print('datetime.date.today() = ' + str(datetime.date.today()))

    return cur_date_flag not in existing_dates
70 |
def download_items():
    """Download the day's target items into PULL_DATA_PATH.

    The file name embeds cur_date_flag, which is_ready() must have set first.
    """
    req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    global cur_date_flag
    # context manager closes the handle even if write() raises (original leaked on error)
    with open(PULL_DATA_PATH + '\\target_items_' + cur_date_flag + '.txt', "w") as fp:
        fp.write(content)


def user_info(user_ids):
    """Parse USERS_FILE rows whose id is in user_ids and contain no "NULL".

    Delegates to the project-local parser module; returns whatever
    parser.select returns (presumably keyed by int user id — TODO confirm
    against parser.py).
    """
    return parser.select(
        USERS_FILE,
        lambda x: int(x[0]) in user_ids and "NULL" not in x,
        parser.build_user,
        lambda x: int(x[0])
    )


def download_target_users():
    """Download the day's target users into PULL_DATA_PATH (dated file name)."""
    req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    global cur_date_flag
    with open(PULL_DATA_PATH + '\\target_users_' + cur_date_flag + '.txt', "w") as fp:
        fp.write(content)


def process():
    """Pull users + items, then append cur_date_flag to the processed log."""
    print('downloading data...')
    download_target_users()
    download_items()
    global cur_date_flag
    with open(PULL_DATA_LOG_FILE, 'a') as wt:
        wt.write(cur_date_flag + '\n')
115 |
def offline_submit(filename):
    """POST the given submission file to the offline endpoint and print the reply."""
    with open(filename, 'r') as rd:
        content = rd.read()
    req = urllib.request.Request(url=offline_submission(SERVER), data=content.encode('utf-8'),
                                 headers={"Authorization": "Bearer %s" % TOKEN}, method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))


def submit():
    """POST TMP_SOLUTION to the online submission endpoint.

    Fix: the original used httplib2, which is never imported anywhere in this
    module (NameError at call time); rewritten with urllib.request, which the
    module already uses for every other request.
    """
    filename = TMP_SOLUTION
    with open(filename, 'r') as content_file:
        content = content_file.read()
    req = urllib.request.Request(url=post_url(SERVER), data=content.encode('utf-8'),
                                 headers=header(TOKEN), method='POST')
    response = urllib.request.urlopen(req).read().decode("utf-8")
    print("SUBMIT: " + filename + " " + response)
135 |
def usage_test():
    """Manual smoke test: print readiness, then run one pull cycle."""
    '''
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    print(content)
    response = json.loads(content)
    print(response)
    '''
    print(is_ready())

    process()
148 |
149 |
if __name__ == "__main__":

    # Poll until the server publishes a new data drop, then pull it (at most
    # once per server date — is_ready() consults the processed-dates log).
    last_submit = None
    while True:
        try:
            if is_ready() and last_submit != datetime.date.today():
                print('data ready.')
                process()
                last_submit = datetime.date.today()
            else:
                print("Not ready yet: " + str(datetime.date.today()))
                time.sleep(600)
        except KeyboardInterrupt:
            break
        except Exception:
            # Fix: the bare except retried immediately (busy loop on a
            # persistent error) and also swallowed SystemExit; back off instead.
            print("exception :" + str(sys.exc_info()[0]) + "\n")
            time.sleep(60)
171 |
172 |
173 |
--------------------------------------------------------------------------------
/auto-pipeline/py-pull_and_submit/model.py:
--------------------------------------------------------------------------------
1 | '''
2 | Modeling users, interactions and items from
3 | the recsys challenge 2017.
4 |
5 | by Daniel Kohlsdorf
6 | '''
7 |
class User:
    """Candidate profile: job-title tokens plus categorical attributes."""

    def __init__(self, title, clevel, indus, disc, country, region):
        (self.title, self.clevel, self.indus,
         self.disc, self.country, self.region) = (title, clevel, indus,
                                                  disc, country, region)
17 |
class Item:
    """Job posting: title tokens plus categorical attributes."""

    def __init__(self, title, clevel, indus, disc, country, region):
        (self.title, self.clevel, self.indus,
         self.disc, self.country, self.region) = (title, clevel, indus,
                                                  disc, country, region)
27 |
class Interaction:
    """One user-item event; exposes match features and a binary label."""

    def __init__(self, user, item, interaction_type):
        self.user = user
        self.item = item
        self.interaction_type = interaction_type

    def title_match(self):
        """Number of title tokens shared by user and item, as a float."""
        shared = set(self.user.title) & set(self.item.title)
        return float(len(shared))

    def clevel_match(self):
        return 1.0 if self.user.clevel == self.item.clevel else 0.0

    def indus_match(self):
        return 1.0 if self.user.indus == self.item.indus else 0.0

    def discipline_match(self):
        # Returns 2.0 on match, unlike the other 1.0 matches — presumably an
        # intentional feature weighting; confirm before changing.
        return 2.0 if self.user.disc == self.item.disc else 0.0

    def country_match(self):
        return 1.0 if self.user.country == self.item.country else 0.0

    def region_match(self):
        return 1.0 if self.user.region == self.item.region else 0.0

    def features(self):
        """Feature vector in a fixed order (title, clevel, indus, disc, country, region)."""
        return [
            self.title_match(), self.clevel_match(), self.indus_match(),
            self.discipline_match(), self.country_match(), self.region_match()
        ]

    def label(self):
        """0.0 for a type-4 interaction, else 1.0."""
        return 0.0 if self.interaction_type == 4 else 1.0
79 |
80 |
81 |
--------------------------------------------------------------------------------
/auto-pipeline/py-pull_and_submit/online_submit_auto.py:
--------------------------------------------------------------------------------
1 | '''
2 | Online example
3 |
4 | Uses the offline mode to make predictions
5 | for the online challenge.
6 |
7 | by Daniel Kohlsdorf
8 | '''
9 | import urllib.request
10 | import time
11 | import sys
12 |
13 | import json
14 | from dateutil.parser import parse
15 | import datetime
16 | import parser
17 | #from recommendation_worker import *
18 |
19 | TMP_ITEMS = "data/current_items.csv"
20 | TMP_SOLUTION = "data/current_solution.csv"
21 |
22 | MODEL = "data/recsys2017.model" # Model from offline training
23 | USERS_FILE = "data/users.csv" # Online user data
24 |
25 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data'
26 |
27 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key
28 | SERVER = "https://recsys.xing.com"
29 |
def header(token):
    # NOTE(review): the argument is ignored; the module-level TOKEN is used.
    return {"Authorization": "Bearer %s" % TOKEN}

def post_url(server):
    """Endpoint for online submissions."""
    return "%s/api/online/submission" % server

def status_url(server):
    """Endpoint reporting the daily data status."""
    return "%s/api/online/data/status" % server

def users_url(server):
    """Endpoint for the target-users download."""
    return "%s/api/online/data/users" % server

def items_url(server):
    """Endpoint for the target-items download."""
    return "%s/api/online/data/items" % server

def interaction_url(server):
    """Endpoint for the online interactions download."""
    return "%s/api/online/data/interactions" % server

def offline_submission(server):
    """Endpoint for offline submissions."""
    return "%s/api/submission" % server

def online_submission(server):
    """Endpoint for online submissions (same path as post_url)."""
    return "%s/api/online/submission" % server
52 |
53 |
def get_stats():
    """Return the server's 'current.updated_at' timestamp as a datetime.

    Queries the online status endpoint with the bearer token; `parse` is
    dateutil's date parser (imported at module level).
    """
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    response = json.loads(content)

    return parse(response['current']['updated_at'])

def is_ready():
    """True when the server's data date equals today's local date.

    NOTE(review): compares against datetime.date.today() — assumes the server
    date and the local clock agree on timezone; confirm.
    """
    status_date = get_stats().date()
    print('get_stats().date() = ' + str(status_date))
    print('datetime.date.today() = ' + str(datetime.date.today()))

    return status_date == datetime.date.today()
68 |
def download_items():
    """Download the day's target items into PULL_DATA_PATH (dated file name)."""
    req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    # context manager closes the handle even if write() raises (original leaked on error)
    with open(PULL_DATA_PATH + '\\target_items_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)


def download_acceptsubmission():
    """Download the accepted (user, item) pairs of today's submission."""
    req = urllib.request.Request(online_submission(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    with open(PULL_DATA_PATH + '\\accepted_pairs\\accepted_pairs_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)


def download_interactions():
    """Download today's online interactions."""
    req = urllib.request.Request(interaction_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    with open(PULL_DATA_PATH + '\\interactions\\interaction_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)


def user_info(user_ids):
    """Parse USERS_FILE rows whose id is in user_ids and contain no "NULL".

    Delegates to the project-local parser module; returns whatever
    parser.select returns (presumably keyed by int user id — TODO confirm).
    """
    return parser.select(
        USERS_FILE,
        lambda x: int(x[0]) in user_ids and "NULL" not in x,
        parser.build_user,
        lambda x: int(x[0])
    )


def download_target_users():
    """Download the day's target users into PULL_DATA_PATH (dated file name)."""
    req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    with open(PULL_DATA_PATH + '\\target_users_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)
124 |
def process():
    """Pull both daily data files."""
    download_target_users()
    download_items()


def offline_submit(filename):
    """POST the given file to the offline submission endpoint and print the reply."""
    with open(filename, 'r') as rd:
        content = rd.read()
    req = urllib.request.Request(url=offline_submission(SERVER), data=content.encode('utf-8'),
                                 headers={"Authorization": "Bearer %s" % TOKEN}, method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))


def online_submit(filename):
    """POST the given file to the online submission endpoint and print the reply."""
    with open(filename, 'r') as rd:
        content = rd.read()
    req = urllib.request.Request(url=online_submission(SERVER), data=content.encode('utf-8'),
                                 headers={"Authorization": "Bearer %s" % TOKEN}, method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
148 |
def submit():
    """POST TMP_SOLUTION to the online submission endpoint.

    Fix: the original used httplib2, which is never imported anywhere in this
    module (NameError at call time); rewritten with urllib.request, which the
    module already uses for every other request.
    """
    filename = TMP_SOLUTION
    with open(filename, 'r') as content_file:
        content = content_file.read()
    req = urllib.request.Request(url=post_url(SERVER), data=content.encode('utf-8'),
                                 headers=header(TOKEN), method='POST')
    response = urllib.request.urlopen(req).read().decode("utf-8")
    print("SUBMIT: " + filename + " " + response)


def usage_test():
    """Manual smoke test: print readiness, then run one pull cycle."""
    print(is_ready())

    process()
171 |
def submit_file_online(file):
    """Submit `file` to the online endpoint, retrying until success; Ctrl-C aborts.

    Fix: the bare except retried immediately, busy-looping on a persistent
    error and swallowing SystemExit; now pauses between attempts.
    """
    while True:
        try:
            print("submitting " + file)
            online_submit(file)
            print("submitting successfully")
            break
        except KeyboardInterrupt:
            break
        except Exception:
            print("exception :" + str(sys.exc_info()[0]) + "\n")
            time.sleep(60)
183 |
184 |
if __name__ == "__main__":

    # Submit the base prediction file plus variants v1..v7 (same paths as the
    # original eight hard-coded calls).
    submit_dir = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online'
    names = ['recsys17-pred-highdim-submit.csv'] + \
            ['recsys17-pred-highdim-submit_v%d.csv' % i for i in range(1, 8)]
    for name in names:
        submit_file_online(submit_dir + '\\' + name)

    # Archive today's accepted pairs; retry with a pause on transient errors.
    # Fix: the bare except retried immediately (busy loop) and swallowed SystemExit.
    while True:
        try:
            download_acceptsubmission()
            break
        except KeyboardInterrupt:
            break
        except Exception:
            print("exception :" + str(sys.exc_info()[0]) + "\n")
            time.sleep(60)
--------------------------------------------------------------------------------
/auto-pipeline/py-pull_and_submit/online_submit_auto_-1.py:
--------------------------------------------------------------------------------
1 | '''
2 | Online example
3 |
4 | Uses the offline mode to make predictions
5 | for the online challenge.
6 |
7 | by Daniel Kohlsdorf
8 | '''
9 | import urllib.request
10 | import time
11 | import sys
12 |
13 | import json
14 | from dateutil.parser import parse
15 | import datetime
16 | import parser
17 | #from recommendation_worker import *
18 |
19 | TMP_ITEMS = "data/current_items.csv"
20 | TMP_SOLUTION = "data/current_solution.csv"
21 |
22 | MODEL = "data/recsys2017.model" # Model from offline training
23 | USERS_FILE = "data/users.csv" # Online user data
24 |
25 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data'
26 |
27 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key
28 | SERVER = "https://recsys.xing.com"
29 |
def header(token):
    # NOTE(review): the argument is ignored; the module-level TOKEN is used.
    return {"Authorization": "Bearer %s" % TOKEN}

def post_url(server):
    """Endpoint for online submissions."""
    return "%s/api/online/submission" % server

def status_url(server):
    """Endpoint reporting the daily data status."""
    return "%s/api/online/data/status" % server

def users_url(server):
    """Endpoint for the target-users download."""
    return "%s/api/online/data/users" % server

def items_url(server):
    """Endpoint for the target-items download."""
    return "%s/api/online/data/items" % server

def interaction_url(server):
    """Endpoint for the online interactions download."""
    return "%s/api/online/data/interactions" % server

def offline_submission(server):
    """Endpoint for offline submissions."""
    return "%s/api/submission" % server

def online_submission(server):
    """Endpoint for online submissions (same path as post_url)."""
    return "%s/api/online/submission" % server
52 |
53 |
def get_stats():
    """Return the server's 'current.updated_at' timestamp as a datetime.

    Queries the online status endpoint with the bearer token; `parse` is
    dateutil's date parser (imported at module level).
    """
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    response = json.loads(content)

    return parse(response['current']['updated_at'])

def is_ready():
    """True when the server's data date equals today's local date.

    NOTE(review): compares against datetime.date.today() — assumes the server
    date and the local clock agree on timezone; confirm.
    """
    status_date = get_stats().date()
    print('get_stats().date() = ' + str(status_date))
    print('datetime.date.today() = ' + str(datetime.date.today()))

    return status_date == datetime.date.today()
68 |
def download_items():
    """Download the day's target items into PULL_DATA_PATH (dated file name)."""
    req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    # context manager closes the handle even if write() raises (original leaked on error)
    with open(PULL_DATA_PATH + '\\target_items_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)


def download_acceptsubmission():
    """Download the accepted (user, item) pairs of today's submission."""
    req = urllib.request.Request(online_submission(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    with open(PULL_DATA_PATH + '\\accepted_pairs\\accepted_pairs_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)


def download_interactions():
    """Download today's online interactions."""
    req = urllib.request.Request(interaction_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    with open(PULL_DATA_PATH + '\\interactions\\interaction_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)


def user_info(user_ids):
    """Parse USERS_FILE rows whose id is in user_ids and contain no "NULL".

    Delegates to the project-local parser module; returns whatever
    parser.select returns (presumably keyed by int user id — TODO confirm).
    """
    return parser.select(
        USERS_FILE,
        lambda x: int(x[0]) in user_ids and "NULL" not in x,
        parser.build_user,
        lambda x: int(x[0])
    )


def download_target_users():
    """Download the day's target users into PULL_DATA_PATH (dated file name)."""
    req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    with open(PULL_DATA_PATH + '\\target_users_' + datetime.date.today().isoformat() + '.txt', "w") as fp:
        fp.write(content)
124 |
def process():
    """Pull both daily data files."""
    download_target_users()
    download_items()


def offline_submit(filename):
    """POST the given file to the offline submission endpoint and print the reply."""
    with open(filename, 'r') as rd:
        content = rd.read()
    req = urllib.request.Request(url=offline_submission(SERVER), data=content.encode('utf-8'),
                                 headers={"Authorization": "Bearer %s" % TOKEN}, method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))


def online_submit(filename):
    """POST the given file to the online submission endpoint and print the reply."""
    with open(filename, 'r') as rd:
        content = rd.read()
    req = urllib.request.Request(url=online_submission(SERVER), data=content.encode('utf-8'),
                                 headers={"Authorization": "Bearer %s" % TOKEN}, method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
148 |
def submit():
    """POST TMP_SOLUTION to the online submission endpoint.

    Fix: the original used httplib2, which is never imported anywhere in this
    module (NameError at call time); rewritten with urllib.request, which the
    module already uses for every other request.
    """
    filename = TMP_SOLUTION
    with open(filename, 'r') as content_file:
        content = content_file.read()
    req = urllib.request.Request(url=post_url(SERVER), data=content.encode('utf-8'),
                                 headers=header(TOKEN), method='POST')
    response = urllib.request.urlopen(req).read().decode("utf-8")
    print("SUBMIT: " + filename + " " + response)


def usage_test():
    """Manual smoke test: print readiness, then run one pull cycle."""
    print(is_ready())

    process()
171 |
def submit_file_online(file):
    """Submit `file` to the online endpoint, retrying until success; Ctrl-C aborts.

    Fix: the bare except retried immediately, busy-looping on a persistent
    error and swallowing SystemExit; now pauses between attempts.
    """
    while True:
        try:
            print("submitting " + file)
            online_submit(file)
            print("submitting successfully")
            break
        except KeyboardInterrupt:
            break
        except Exception:
            print("exception :" + str(sys.exc_info()[0]) + "\n")
            time.sleep(60)
183 |
184 |
if __name__ == "__main__":

    # One-off: submit only the "v-1" variant of the prediction file.
    submit_file_online(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v-1.csv')
--------------------------------------------------------------------------------
/auto-pipeline/py-pull_and_submit/online_submit_auto_2.py:
--------------------------------------------------------------------------------
1 | '''
2 | Online example
3 |
4 | Uses the offline mode to make predictions
5 | for the online challenge.
6 |
7 | by Daniel Kohlsdorf
8 | '''
9 | import urllib.request
10 | import time
11 | import sys
12 |
13 | import json
14 | from dateutil.parser import parse
15 | import datetime
16 | import parser
17 | #from recommendation_worker import *
18 |
19 | TMP_ITEMS = "data/current_items.csv"
20 | TMP_SOLUTION = "data/current_solution.csv"
21 |
22 | MODEL = "data/recsys2017.model" # Model from offline training
23 | USERS_FILE = "data/users.csv" # Online user data
24 |
25 | PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data'
26 |
27 | TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key
28 | SERVER = "https://recsys.xing.com"
29 |
def header(token):
    # NOTE(review): the argument is ignored; the module-level TOKEN is used.
    return {"Authorization": "Bearer %s" % TOKEN}

def post_url(server):
    """Endpoint for online submissions."""
    return "%s/api/online/submission" % server

def status_url(server):
    """Endpoint reporting the daily data status."""
    return "%s/api/online/data/status" % server

def users_url(server):
    """Endpoint for the target-users download."""
    return "%s/api/online/data/users" % server

def items_url(server):
    """Endpoint for the target-items download."""
    return "%s/api/online/data/items" % server

def interaction_url(server):
    """Endpoint for the online interactions download."""
    return "%s/api/online/data/interactions" % server

def offline_submission(server):
    """Endpoint for offline submissions."""
    return "%s/api/submission" % server

def online_submission(server):
    """Endpoint for online submissions (same path as post_url)."""
    return "%s/api/online/submission" % server
52 |
53 |
def get_stats():
    """Return the server's 'current.updated_at' timestamp as a datetime.

    Queries the online status endpoint with the bearer token; `parse` is
    dateutil's date parser (imported at module level).
    """
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    response = json.loads(content)

    return parse(response['current']['updated_at'])

def is_ready():
    """True when the server's data date equals today's local date.

    NOTE(review): compares against datetime.date.today() — assumes the server
    date and the local clock agree on timezone; confirm.
    """
    status_date = get_stats().date()
    print('get_stats().date() = ' + str(status_date))
    print('datetime.date.today() = ' + str(datetime.date.today()))

    return status_date == datetime.date.today()
68 |
def _download_to_file(url, out_path):
    """Fetch `url` with the bearer token and write the UTF-8 body to `out_path`.

    Shared helper for the three near-identical download_* functions below;
    the context manager guarantees the file is closed even if the write fails.
    """
    req = urllib.request.Request(url, None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    with open(out_path, "w") as fp:
        fp.write(content)

def download_items():
    """Pull today's target items into PULL_DATA_PATH (dated file name)."""
    _download_to_file(
        items_url(SERVER),
        PULL_DATA_PATH + '\\target_items_' + datetime.date.today().isoformat() + '.txt')

def download_acceptsubmission():
    """Pull today's accepted (user, item) pairs into the accepted_pairs folder."""
    _download_to_file(
        online_submission(SERVER),
        PULL_DATA_PATH + '\\accepted_pairs\\accepted_pairs_' + datetime.date.today().isoformat() + '.txt')

def download_interactions():
    """Pull today's interactions into the interactions folder."""
    _download_to_file(
        interaction_url(SERVER),
        PULL_DATA_PATH + '\\interactions\\interaction_' + datetime.date.today().isoformat() + '.txt')
97 |
98 |
def user_info(user_ids):
    """Load online users whose id is in `user_ids`, skipping rows containing NULL.

    Delegates to parser.select; returns its (header, data) pair keyed by int user id.
    """
    keep_row = lambda row: int(row[0]) in user_ids and "NULL" not in row
    row_key = lambda row: int(row[0])
    return parser.select(USERS_FILE, keep_row, parser.build_user, row_key)
106 |
def download_target_users():
    """Pull today's target user ids and store the raw body under PULL_DATA_PATH."""
    req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" % TOKEN})
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')

    # Context manager guarantees the handle is closed even if the write fails.
    out_name = PULL_DATA_PATH + '\\target_users_' + datetime.date.today().isoformat() + '.txt'
    with open(out_name, "w") as fp:
        fp.write(content)
124 |
def process():
    # Daily pull: fetch today's target users and target items into PULL_DATA_PATH.
    download_target_users()
    download_items()
128 |
def _post_file(url, filename):
    """POST the UTF-8-encoded contents of `filename` to `url`; print the reply.

    Shared helper for offline_submit/online_submit; `with` guarantees the
    input file is closed even if the read fails.
    """
    with open(filename, 'r') as rd:
        content = rd.read()
    req = urllib.request.Request(url=url, data=content.encode('utf-8'),
                                 headers={"Authorization": "Bearer %s" % TOKEN},
                                 method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

def offline_submit(filename):
    """Submit a solution file to the offline challenge endpoint."""
    _post_file(offline_submission(SERVER), filename)

def online_submit(filename):
    """Submit a solution file to the online challenge endpoint."""
    _post_file(online_submission(SERVER), filename)
148 |
def submit():
    # Legacy submitter for TMP_SOLUTION using httplib2.
    # NOTE(review): httplib2 is not imported in the visible part of this file —
    # confirm the import exists above, otherwise calling this raises NameError.
    http = httplib2.Http()
    filename = TMP_SOLUTION
    with open(filename, 'r') as content_file:
        content = content_file.read()
    # http.request returns (response_headers, body); index [1] is the body.
    response = http.request(post_url(SERVER), method="POST", body=content,
        headers=header(TOKEN)
        )[1].decode("utf-8")
    print("SUBMIT: " + filename + " " + response)
158 |
def usage_test():
    # Smoke test: print whether today's data is ready, then run the daily pull.
    # The triple-quoted block below is dead code kept for manual debugging.
    '''
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    print(content)
    response = json.loads(content)
    print(response)
    '''
    print(is_ready())

    process()
171 |
def submit_file_online(file):
    """Submit `file` via online_submit, retrying until it succeeds.

    Fixes: the bare `except:` also swallowed SystemExit, and the retry loop
    had no delay, hammering the submission server on persistent failures.
    Ctrl-C still aborts the loop.
    """
    import time  # stdlib; local import keeps this fix self-contained
    while True:
        try:
            print("submitting " + file)
            online_submit(file)
            print("submitting successfully")
            break
        except KeyboardInterrupt:
            break
        except Exception:
            print("exception :" + str(sys.exc_info()[0]) + "\n")
            time.sleep(10)  # back off before retrying
183 |
184 |
if __name__ == "__main__":

    # Submit the base prediction file plus its v1..v7 variants, in order.
    base = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online_2\recsys17-pred-highdim-submit'
    submit_file_online(base + '.csv')
    for version in range(1, 8):
        submit_file_online(base + '_v' + str(version) + '.csv')

    # Keep trying to fetch today's accepted pairs until it works (Ctrl-C aborts).
    while True:
        try:
            download_acceptsubmission()
            break
        except KeyboardInterrupt:
            break
        except:
            print("exception :" + str(sys.exc_info()[0]) + "\n")
--------------------------------------------------------------------------------
/auto-pipeline/py-pull_and_submit/parser.py:
--------------------------------------------------------------------------------
1 | '''
2 | Parsing the ACM Recsys Challenge 2017 data into interactions,
3 | items and user models.
4 |
5 | by Daniel Kohlsdorf
6 | '''
7 |
8 | from model import *
9 |
def is_header(line):
    """A raw line is the header row iff it carries the challenge column prefix."""
    return line.find("recsyschallenge") >= 0
12 |
def process_header(header):
    """Map each column name to its position in the header row.

    `header` is the tab-split header row; each entry looks like
    'recsyschallenge.<table>.<name>', so split(".")[1] extracts the
    middle component (matching the original behavior exactly).
    """
    return {name.split(".")[1]: pos for pos, name in enumerate(header)}
20 |
def select(from_file, where, toObject, index):
    """Stream a TSV file, keep rows passing `where`, and build objects.

    Returns (header, data): header maps column name -> position (from the
    header row), data maps index(row) -> toObject(row, header). Rows before
    the header, blank rows, rows failing `where`, rows whose field count
    differs from the header, and rows whose object is None are all skipped.
    """
    header = None
    data = {}
    stored = 0
    for line in open(from_file):
        if is_header(line):
            header = process_header(line.strip().split("\t"))
            continue
        stripped = line.strip()
        if not stripped or header is None:
            continue
        fields = stripped.split("\t")
        if not (where(fields) and len(fields) == len(header)):
            continue
        obj = toObject(fields, header)
        if obj is None:
            continue
        data[index(fields)] = obj
        stored += 1
        if stored % 100000 == 0:
            print("... reading line " + str(stored) + " from file " + from_file)
    return (header, data)
38 |
def build_user(str_user, names):
    # Construct a User from one raw TSV row; `names` maps column name -> index.
    # NOTE(review): assumes the numeric columns parse as ints — rows containing
    # "NULL" are filtered by user_info's where-clause; verify for other callers.
    return User(
        [int(x) for x in str_user[names["jobroles"]].split(",") if len(x) > 0],  # job-role term ids
        int(str_user[names["career_level"]]),
        int(str_user[names["industry_id"]]),
        int(str_user[names["discipline_id"]]),
        str_user[names["country"]],
        str_user[names["region"]]
    )
48 |
def build_item(str_item, names):
    # Construct an Item from one raw TSV row; `names` maps column name -> index.
    # Mirrors build_user: same positional-argument order, same int-parse assumption.
    return Item(
        [int(x) for x in str_item[names["title"]].split(",") if len(x) > 0],  # title term ids
        int(str_item[names["career_level"]]),
        int(str_item[names["industry_id"]]),
        int(str_item[names["discipline_id"]]),
        str_item[names["country"]],
        str_item[names["region"]]
    )
58 |
class InteractionBuilder:
    # Builds Interaction objects by joining raw interaction rows against
    # previously loaded user and item dictionaries (keyed by int id).

    def __init__(self, user_dict, item_dict):
        self.user_dict = user_dict
        self.item_dict = item_dict

    def build_interaction(self, str_inter, names):
        # Returns an Interaction only when both end points are known;
        # None otherwise (select() silently drops None objects).
        if int(str_inter[names['item_id']]) in self.item_dict and int(str_inter[names['user_id']]) in self.user_dict:
            return Interaction(
                self.user_dict[int(str_inter[names['user_id']])],
                self.item_dict[int(str_inter[names['item_id']])],
                int(str_inter[names["interaction_type"]])
            )
        else:
            return None
74 |
75 |
76 |
--------------------------------------------------------------------------------
/auto-pipeline/py-pull_and_submit/recsys-submit-file.py:
--------------------------------------------------------------------------------
1 | '''
2 | Online example
3 |
4 | Uses the offline mode to make predictions
5 | for the online challenge.
6 |
7 | by Daniel Kohlsdorf
8 | '''
9 | import urllib.request
10 | import time
11 |
12 | import json
13 | from dateutil.parser import parse
14 | import datetime
15 | import parser
16 | #from recommendation_worker import *
17 |
TMP_ITEMS = "data/current_items.csv"
TMP_SOLUTION = "data/current_solution.csv"

MODEL = "data/recsys2017.model"  # Model from offline training
USERS_FILE = "data/users.csv"  # Online user data

# UNC share where the daily pulls are archived (duplicated from the
# online_submit_auto scripts in this folder).
PULL_DATA_PATH = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\second-stage\online-schedule\pull-data'

# NOTE(review): API credential committed to source — rotate/remove before publishing.
TOKEN = "bGVhdmluZ3NlYXNvbjdiODFkYTRlLTM4MGUtNGZkOC1iYTVjLTM5MjA0M2VhOTQ5Yw==" # your key
SERVER = "https://recsys.xing.com"
28 |
def header(token):
    # Build the Authorization header.
    # NOTE(review): ignores its `token` argument and always reads the global
    # TOKEN (same issue as the sibling online_submit_auto scripts); callers
    # pass TOKEN, so behavior is unaffected today.
    return {"Authorization" : "Bearer %s" %TOKEN}

def post_url(server):
    # Online submission endpoint.
    return server + "/api/online/submission"

def status_url(server):
    # Data-refresh status endpoint.
    return server + "/api/online/data/status"

def users_url(server):
    # Target-users download endpoint.
    return server + "/api/online/data/users"

def items_url(server):
    # Target-items download endpoint.
    return server + "/api/online/data/items"

def interaction_url(server):
    # Interactions download endpoint.
    return server + "/api/online/data/interactions"

def offline_submission(server):
    # Offline-challenge submission endpoint.
    return server + "/api/submission"
def online_submission(server):
    # Online-challenge submission endpoint (same URL as post_url).
    return server + "/api/online/submission"


def get_stats():
    # Return the server's "current.updated_at" timestamp as a datetime.
    # NOTE(review): assumes the response JSON has current.updated_at — confirm.
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    response = json.loads(content)

    return parse(response['current']['updated_at'])

def is_ready():
    # True once the online data set has been refreshed today (local date).
    status_date = get_stats().date()
    print('get_stats().date() = ' + str(status_date))
    print('datetime.date.today() = ' + str(datetime.date.today()))

    return status_date == datetime.date.today()
67 |
def download_items():
    # Fetch today's target items and archive them under PULL_DATA_PATH.
    # NOTE(review): fp is not closed if write() raises — prefer a `with` block.
    req = urllib.request.Request(items_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')

    fp = open(PULL_DATA_PATH+'\\target_items_'+datetime.date.today().isoformat()+'.txt', "w")
    fp.write(content)
    fp.close()
    #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0]))

def download_acceptsubmission():
    # Fetch today's accepted (user, item) pairs into the accepted_pairs folder.
    req = urllib.request.Request(online_submission(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')

    fp = open(PULL_DATA_PATH+'\\accepted_pairs\\accepted_pairs_'+datetime.date.today().isoformat()+'.txt', "w")
    fp.write(content)
    fp.close()

def download_interactions():
    # Fetch today's interactions into the interactions folder.
    req = urllib.request.Request(interaction_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')

    fp = open(PULL_DATA_PATH+'\\interactions\\interaction_'+datetime.date.today().isoformat()+'.txt', "w")
    fp.write(content)
    fp.close()
    #return parser.select(TMP_ITEMS, lambda x: True, parser.build_item, lambda x: int(x[0]))


def user_info(user_ids):
    # Load online users whose id is in `user_ids`, skipping rows containing NULL.
    return parser.select(
        USERS_FILE,
        lambda x: int(x[0]) in user_ids and "NULL" not in x,
        parser.build_user,
        lambda x: int(x[0])
    )

def download_target_users():
    # Fetch today's target user ids and store the raw body under PULL_DATA_PATH.
    req = urllib.request.Request(users_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')

    fp = open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt', "w")
    fp.write(content)
    fp.close()

    # Dead code kept for reference: earlier version deduplicated the id list.
    '''
    user_ids = set([int(uid) for uid in content.split("\n") if len(uid) > 0])

    with open(PULL_DATA_PATH+'\\target_users_'+datetime.date.today().isoformat()+'.txt','w') as wt:
        for uid in user_ids:
            wt.write(str(uid)+"\n")
    '''
    #return user_ids

def process():
    # Daily pull: target users and target items.
    download_target_users()
    download_items()
127 |
def offline_submit(filename):
    # POST the UTF-8-encoded contents of `filename` to the offline endpoint
    # and print the server's reply.
    rd = open(filename,'r')
    content=rd.read()
    rd.close()
    content = content.encode('utf-8')
    req = urllib.request.Request(url=offline_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST')
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    print(content)

def online_submit(filename):
    # Same as offline_submit but targets the online endpoint.
    rd = open(filename,'r')
    content=rd.read()
    rd.close()
    content = content.encode('utf-8')
    req = urllib.request.Request(url=online_submission(SERVER), data=content, headers={"Authorization": "Bearer %s" %TOKEN}, method='POST')
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    print(content)

def submit():
    # Legacy submitter for TMP_SOLUTION using httplib2.
    # NOTE(review): httplib2 is not imported in the visible part of this file —
    # confirm the import exists above, otherwise calling this raises NameError.
    http = httplib2.Http()
    filename = TMP_SOLUTION
    with open(filename, 'r') as content_file:
        content = content_file.read()
    response = http.request(post_url(SERVER), method="POST", body=content,
        headers=header(TOKEN)
        )[1].decode("utf-8")
    print("SUBMIT: " + filename + " " + response)

def usage_test():
    # Smoke test: print readiness, then run the daily pull.
    # The triple-quoted block is dead code kept for manual debugging.
    '''
    req = urllib.request.Request(status_url(SERVER), None, {"Authorization": "Bearer %s" %TOKEN})
    response=urllib.request.urlopen(req)
    content=response.read().decode('utf-8')
    print(content)
    response = json.loads(content)
    print(response)
    '''
    print(is_ready())

    process()
170 |
171 |
if __name__ == "__main__":

    #usage_test()

    # One-shot camera-ready submission to the offline endpoint; the commented
    # sections below are earlier manual workflows kept for reference.
    path = r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\camera_ready'
    offline_submit(path+r'\test-FT_FT_laglng_premium0.005_submit_complete.txt')


    #download_interactions()
    #download_acceptsubmission();

    #online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit.csv')
    r'''
    online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v1.csv')
    online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v2.csv')
    online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v3.csv')
    online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v4.csv')
    online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v5.csv')
    online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v6.csv')
    online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v7.csv')
    '''
    #online_submit(r'\\mlsdata\e$\Users\v-lianji\mlsdata\Recsys17\submit\online\recsys17-pred-highdim-submit_v-1.csv')

    # Dead code: daily poll-and-pull loop (10-minute interval).
    '''
    last_submit = None
    while True:
        if is_ready() and last_submit != datetime.date.today():
            process()
            last_submit = datetime.date.today()
            #submit()
        else:
            print("Not ready yet: " + str(datetime.date.today()))
        time.sleep(600)
    '''
--------------------------------------------------------------------------------
/models/StudyNDCG.script.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.SCOPE.Types;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Text;
6 | using ScopeRuntime;
7 |
--------------------------------------------------------------------------------
/models/TEST_Localmodel_tlc3_pipeline.script:
--------------------------------------------------------------------------------
// Scores a pre-trained TLC 3.7 model over the validation and test svmlight
// files, writes raw predictions, binary-classification metrics, and the
// per-item submission files (top-100 and one-recommendation-per-user variants).


//// for cosmos09 TLC3.7
#IF(EXISTS("local/users/v-lianji/TLC3.7/Tlc3Scope.module"))
MODULE "local/users/v-lianji/TLC3.7/Tlc3Scope.module" AS Tlc3Learner;
RESOURCE @"local/users/v-lianji/TLC3.7/Microsoft.MachineLearning.Garage.dll";
#ELSE
/// cosmos14
MODULE "/shares/CML.TLC/TLC/TLC-3.7.162.86/Tlc3Scope.module" AS Tlc3Learner;
RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/Microsoft.MachineLearning.Garage.dll";
RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/libxgboost.dll";
RESOURCE @"/shares/CML.TLC/TLC/TLC-3.7.162.86/Microsoft.MachineLearning.XGBoost.dll";
#ENDIF


// Working folder; all inputs/outputs below live under it.
#DECLARE path string = "local/users/v-lianji/camera_ready/train-val-online/overfitting/";


// Pre-trained model shipped as a job resource (referenced by bare file name
// in the TlcScoringProcessor calls below — keep the two in sync).
#DECLARE ModelString string = @path + "FT_p50_haslatlng.zip"; //
RESOURCE @ModelString;

#DECLARE TestDataFile string = @path+ "test_complete_0_overfitting_highdim.svmlight.csv";
#DECLARE ValidDataFile string = @path+ "valid02_overfitting_highdim.svmlight.csv";

// Output locations: submission files, raw predictions, metrics structured stream.
#DECLARE out_submit_file_v0 string = @path+"results/FT_submit"+".csv";
#DECLARE out_submit_file string = @path+"results/FT_test_submit_top1.csv";
#DECLARE PredOut string = @path+"results/FT_test.predictions.tsv";
#DECLARE PredOut_valid string = @path+"results/FT_valid.predictions.tsv";
#DECLARE MetricsOut string = @path+"results/FT_valid.metrics.ss";


// ---- Validation set: score with the resource model, dump predictions ----
data =
EXTRACT Line : string
FROM @ValidDataFile
USING DefaultTextExtractor("-d", "\n");
scoredTest =
PROCESS data
PRODUCE
Comment,
Label,
Score,
Probability
USING TlcScoringProcessor("loader+", "in=FT_p50_haslatlng.zip"); //useLoader=+

OUTPUT
TO @PredOut_valid
USING DefaultTextOutputter();

// Binary-classification metrics over the validation predictions.
metrics =
REDUCE scoredTest ALL
USING TlcEvaluatingReducer(
// The binary evaluator is being used,
// and the score and probability columns are specified.
"eval=bin{prob=Probability score=Score}",
// The label column.
"lab=Label");
OUTPUT metrics
TO SSTREAM @MetricsOut ;


// ---- Test set: score with the same model, dump predictions ----
data =
EXTRACT Line : string
FROM @TestDataFile
USING DefaultTextExtractor("-d", "\n");


scoredTest =
PROCESS data
PRODUCE
Comment,
Label,
Score,
Probability
USING TlcScoringProcessor("loader+", "in=FT_p50_haslatlng.zip"); //useLoader=+

OUTPUT
TO @PredOut
USING DefaultTextOutputter();


// Comment carries "userId,itemId"; split it back into ids (MyHelper lives in
// the companion .script.cs file).
preds = SELECT MyHelper.GetUserId(Comment) AS uid,
MyHelper.GetItemId(Comment) AS iid,
Probability
FROM scoredTest;


// Per-item top-100 users (offline-style submission file).
REDUCE preds
ON iid
USING SubmissionFormater();

OUTPUT
TO @out_submit_file_v0
USING DefaultTextOutputter(delimiter: '\t');

//
//////// since each user can receive at most one recommendation
tpreds =
REDUCE preds
ON iid
USING TopKSelector();

REDUCE tpreds
ON holder
USING OnlineSubmissionFormater();

OUTPUT
TO @out_submit_file
USING DefaultTextOutputter(delimiter: '\t');
--------------------------------------------------------------------------------
/models/TEST_Localmodel_tlc3_pipeline.script.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.SCOPE.Types;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Text;
6 | using ScopeRuntime;
7 |
/// <summary>
/// Splits the scorer's Comment field, formatted as "userId,itemId",
/// back into its two ids.
/// </summary>
public static class MyHelper
{
    /// <summary>Returns the text before the first comma (the user id).</summary>
    /// <remarks>
    /// Uses the char overload of IndexOf: it is an ordinal comparison, unlike
    /// the culture-sensitive IndexOf(string) the original used (CA1307).
    /// Behavior is unchanged for the ASCII ids this pipeline produces.
    /// </remarks>
    public static string GetUserId(string str)
    {
        int idx = str.IndexOf(',');
        return str.Substring(0, idx);
    }

    /// <summary>Returns the text after the first comma (the item id).</summary>
    public static string GetItemId(string str)
    {
        int idx = str.IndexOf(',');
        return str.Substring(idx + 1);
    }
}
22 |
23 |
24 |
/// <summary>
/// Scope reducer: for each item (reduce key iid), emits one row
/// "ItemId \t uid1,uid2,..." with the top-100 users by predicted probability.
/// Input rows are (uid:string, iid:string, score:float).
/// </summary>
public class SubmissionFormater : Reducer
{
    public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
    {
        ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestdColumns: {0}", string.Join(",", requestedColumns));
        return new Schema(
            "ItemId:string,TopUserId:string"
        );
    }

    public override IEnumerable<Row> Reduce(RowSet input, Row outputRow, string[] args)
    {
        // Offline submission allows at most 100 users per item.
        const int topk = 100;

        string iid = "";

        // (userId, score) pairs for the single item this reduce group covers.
        // NOTE(review): the repo dump had the generic type arguments stripped
        // ("List>" / "new Tuple(...)"); restored to List<Tuple<string, float>>.
        var uid_score_list = new List<Tuple<string, float>>();

        foreach (Row row in input.Rows)
        {
            iid = row[1].String;            // same for every row in the group
            string uid = row[0].String;
            float score = row[2].Float;
            uid_score_list.Add(Tuple.Create(uid, score));
        }

        // Highest score first.
        uid_score_list.Sort((a, b) => b.Item2.CompareTo(a.Item2));
        int k = Math.Min(topk, uid_score_list.Count);

        // StringBuilder instead of the original O(n^2) string concatenation;
        // yields "" when the group is empty, matching the original's else-branch.
        var joined = new StringBuilder();
        for (int i = 0; i < k; i++)
        {
            if (i > 0)
            {
                joined.Append(',');
            }
            joined.Append(uid_score_list[i].Item1);
        }

        outputRow[0].Set(iid);
        outputRow[1].Set(joined.ToString());

        yield return outputRow;
    }
}
79 |
80 |
81 |
82 |
83 | public class OnlineSubmissionFormater : Reducer
84 | {
85 | public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
86 | {
87 | ScopeRuntime.Diagnostics.DebugStream.WriteLine("requestdColumns: {0}", string.Join(",", requestedColumns));
88 | return new Schema(
89 | "ItemId:string,TopUserId:string"
90 | );
91 | }
92 |
93 | public override IEnumerable Reduce(RowSet input, Row outputRow, string[] args)
94 | {
95 | int topk = 250;
96 |
97 | string iid = "";
98 | string uid = "";
99 | float score = 0;
100 |
101 | List> score_list = new List>();
102 |
103 |
104 | foreach (Row row in input.Rows)
105 | {
106 | iid = row[1].String;
107 |
108 | uid = row[0].String;
109 |
110 | score = row[2].Float;
111 |
112 | score_list.Add(new Tuple(uid, iid, score));
113 | }
114 |
115 | score_list.Sort((a, b) => b.Item3.CompareTo(a.Item3));
116 |
117 | Dictionary