├── Recommender-systems ├── readme ├── pmf.m ├── bayespmf.m ├── demo.m ├── pred.m ├── makematrix.m ├── README.txt ├── 深度学习预测.py └── shape.ipynb ├── README.md ├── prediction.csv └── catdog.ipynb /Recommender-systems/readme: -------------------------------------------------------------------------------- 1 | DateCastle猜你喜欢比赛 第二名代码分享 2 | -------------------------------------------------------------------------------- /Recommender-systems/pmf.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/AlexNet-based-on-keras/master/Recommender-systems/pmf.m -------------------------------------------------------------------------------- /Recommender-systems/bayespmf.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/AlexNet-based-on-keras/master/Recommender-systems/bayespmf.m -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DateCastle 2 | 3 | 这里是猫狗大战的分享区域。 4 | 5 | html文件下载后浏览器打开可见。 6 | 7 | ipynb文件下载后可直接运行于jupyter notebook。 8 | 9 | prediction.csv为本次的比赛的最后一次提交。 10 | 11 | -- 12 | 欢迎提出改进意见。 13 | -------------------------------------------------------------------------------- /Recommender-systems/demo.m: -------------------------------------------------------------------------------- 1 | %restart=1; 2 | %fprintf(1,'Running Probabilistic Matrix Factorization (PMF) \n'); 3 | %pmf 4 | 5 | restart=1; 6 | fprintf(1,'\nRunning Bayesian PMF\n'); 7 | bayespmf 8 | 9 | -------------------------------------------------------------------------------- /Recommender-systems/pred.m: -------------------------------------------------------------------------------- 1 | function [pred_out] = pred(w1_M1_sample,w1_P1_sample,N,mean_rating); 2 | 3 | %%% Make predicitions on the validation data 4 | 5 | aa_p = 
double(N(:,1)); 6 | aa_m = double(N(:,2)); 7 | rating = double(N(:,3)); 8 | 9 | pred_out = sum(w1_M1_sample(aa_m,:).*w1_P1_sample(aa_p,:),2) + mean_rating; 10 | ff = find(pred_out>5); pred_out(ff)=5; 11 | ff = find(pred_out<1); pred_out(ff)=1; 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /Recommender-systems/makematrix.m: -------------------------------------------------------------------------------- 1 | % Version 1.000 2 | % 3 | % Code provided by Ruslan Salakhutdinov 4 | % 5 | % Permission is granted for anyone to copy, use, modify, or distribute this 6 | % program and accompanying programs and documents for any purpose, provided 7 | % this copyright notice is retained and prominently displayed, along with 8 | % a note saying that the original programs are available from our 9 | % web page. 10 | % The programs and documents are distributed without any warranty, express or 11 | % implied. As the programs were written for research purposes only, they have 12 | % not been tested to the degree that would be advisable in any important 13 | % application. All use of these programs is entirely at the user's own risk. 14 | 15 | 16 | 17 | %% Create a matrix of size num_p by num_m from triplets {user_id, movie_id, rating_id} 18 | 19 | load train 20 | 21 | num_m = 14726; 22 | num_p = 223970; 23 | count = sparse(num_p,num_m); %for Netflix data, use sparse matrix instead. 
24 | 25 | for mm=1:num_m 26 | ff= find(M(:,2)==mm); 27 | fprintf(1, '\n %d / %d \t \n', mm,num_m); 28 | count(M(ff,1),mm) = M(ff,3); 29 | end 30 | 31 | save makematrix count 32 | -------------------------------------------------------------------------------- /Recommender-systems/README.txt: -------------------------------------------------------------------------------- 1 | % Code provided by Ruslan Salakhutdinov 2 | % 3 | % Permission is granted for anyone to copy, use, modify, or distribute this 4 | % program and accompanying programs and documents for any purpose, provided 5 | % this copyright notice is retained and prominently displayed, along with 6 | % a note saying that the original programs are available from our 7 | % web page. 8 | % The programs and documents are distributed without any warranty, express or 9 | % implied. As the programs were written for research purposes only, they have 10 | % not been tested to the degree that would be advisable in any important 11 | % application. All use of these programs is entirely at the user's own risk. 12 | 13 | How to make it work: 14 | 15 | 1. Create a separate directory and download all these files into the same directory 16 | 2. Download the following 7 files: 17 | * demo.m Main file for training PMF and Bayesian PMF 18 | * pmf.m Training PMF model 19 | * bayespmf.m Bayesian PMF model that implements Gibbs sampler. 20 | * moviedata.mat Sample data that contains triplets (user_id, movie_id, rating) 21 | * makematrix.m Helper function that converts triplets into large matrix. 22 | This file is used by bayespmf.m 23 | * pred.m Helper function that makes predictions on the validation set. 24 | * README.txt 25 | 26 | 3. Simply run demo.m in Matlab. It will fit PMF and then will run Bayesian PMF. 27 | 28 | This code uses Matlab stats toolbox to sample from Wishart distribution. 29 | If you don't have stats toolbox you can use Tom Minka's 30 | "The Lightspeed Matlab Toolbox" (just google it). 
31 | 32 | 33 | I did not try to optimize this code, but please e-mail me if you find bugs. 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /Recommender-systems/深度学习预测.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | np.random.seed(2016) 5 | 6 | import os 7 | import glob 8 | import math 9 | import pickle 10 | import datetime 11 | 12 | from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge 13 | from keras.models import Model 14 | 15 | def load_train(): 16 | X_train_uid=[] 17 | X_train_iid=[] 18 | Y_train_score=[] 19 | 20 | path = os.path.join('./data', 'train.csv') 21 | print('Read train data',path) 22 | 23 | f = open(path, 'r') 24 | line = f.readline() 25 | while (1): 26 | line = f.readline() 27 | if line == '': 28 | break 29 | arr = line.strip().split(',') 30 | X_train_uid.append(int(arr[0])) 31 | X_train_iid.append(int(arr[1])) 32 | Y_train_score.append(int(arr[2])) 33 | f.close() 34 | return X_train_uid,X_train_iid,Y_train_score 35 | 36 | def load_test(): 37 | X_test_uid=[] 38 | X_test_iid=[] 39 | 40 | path = os.path.join('./data', 'test.csv') 41 | print('Read test data',path) 42 | 43 | f = open(path, 'r') 44 | line = f.readline() 45 | while (1): 46 | line = f.readline() 47 | if line == '': 48 | break 49 | arr = line.strip().split(',') 50 | X_test_uid.append(int(arr[0])) 51 | X_test_iid.append(int(arr[1])) 52 | f.close() 53 | return X_test_uid,X_test_iid 54 | 55 | 56 | X_train_uid,X_train_iid,Y_train_score = load_train() 57 | #print len(X_train_uid),X_train_uid[33177260],max(X_train_uid) 58 | #print len(X_train_iid),X_train_iid[33177260],max(X_train_iid) 59 | #print len(Y_train_score),Y_train_score[33177260] 60 | print "load train data OK." 
# Second half of the script: load the test set, shape the inputs, train the
# embedding model and write the predictions to out.csv.

X_test_uid, X_test_iid = load_test()
# A parenthesised single-argument print behaves the same on Python 2 and 3,
# and matches the print(...) calls already used inside load_train()/load_test().
print("load test data OK.")

# Normalize train data.
# Each id list becomes an (n_samples, 1) column so that every sample feeds
# exactly one id into the Input(shape=(1,)) layers defined below.
X_train_uid = np.array(X_train_uid)
X_train_uid = X_train_uid.reshape(X_train_uid.shape[0], 1)

X_train_iid = np.array(X_train_iid)
X_train_iid = X_train_iid.reshape(X_train_iid.shape[0], 1)

# Map scores through (score - 1) / 4 so the targets fit the sigmoid output.
# NOTE(review): this assumes raw scores lie in 1..5 -- confirm against the data.
Y_train_score = np.array(Y_train_score).astype('float32')
Y_train_score = (Y_train_score - 1) / 4

# Normalize test data (same column layout as the training ids).
X_test_uid = np.array(X_test_uid)
X_test_uid = X_test_uid.reshape(X_test_uid.shape[0], 1)

X_test_iid = np.array(X_test_iid)
X_test_iid = X_test_iid.reshape(X_test_iid.shape[0], 1)

# Define model: one 128-d embedding per user id and per item id, concatenated
# and passed through two ReLU layers with dropout; a single sigmoid unit
# predicts the scaled score.
# NOTE(review): merge(..., mode='concat'), Model(input=..., output=...) and
# nb_epoch are the Keras 1.x spellings. They are kept as-is on purpose;
# confirm the installed Keras version before renaming them.
input_1 = Input(shape=(1,), dtype='int32')
input_2 = Input(shape=(1,), dtype='int32')
# input_dim is the embedding vocabulary size, so it has to be larger than the
# biggest id fed in. NOTE(review): 223970 / 14726 are hard-coded -- verify
# they still cover the ids in train.csv and test.csv.
x1 = Embedding(output_dim=128, input_dim=223970, input_length=1)(input_1)
x2 = Embedding(output_dim=128, input_dim=14726, input_length=1)(input_2)
x1 = Flatten()(x1)
x2 = Flatten()(x2)
x = merge([x1, x2], mode='concat')
x = Dropout(0.2)(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
out = Dense(1, activation='sigmoid')(x)
model = Model(input=[input_1, input_2], output=out)
model.compile(optimizer='rmsprop',
              loss='mean_squared_error',
              metrics=[])

# Train model.
model.fit([X_train_uid, X_train_iid], Y_train_score,
          nb_epoch=10, batch_size=1024 * 6)

# Predict, then undo the (score - 1) / 4 scaling applied to the targets.
Y_test_score = model.predict([X_test_uid, X_test_iid], batch_size=2048)
Y_test_score = Y_test_score * 4 + 1

# Write one prediction per line under a "score" header. The with-block closes
# the file even if formatting or writing fails part-way through.
with open("out.csv", "w") as f:
    f.write("score\n")
    for score in Y_test_score[:, 0]:
        f.write("{:1.4f}".format(score))
        f.write("\n")
-------------------------------------------------------------------------------- /prediction.csv: -------------------------------------------------------------------------------- 1 | uid 2 | 98c71123-8ea8-45f6-b8ab-4f6b28d2f7bc 3 | 9198e297-850f-4c48-aec6-3a9e99820e87 4 | 7460b03a-bedd-4f4b-a1b0-7bda91aebd7c 5 | eabd19b7-95d2-49aa-9e89-32f4c6bb28f9 6 | 98f1b7f2-79d1-490e-ae62-ee2d5799f93f 7 | 899f8b90-d8bc-408a-b7e4-8975fc837d0c 8 | feaaafb9-cde2-4c87-bf66-0e0ec0920ce3 9 | af94d1a3-5f26-4078-834b-17784da08779 10 | 78a60efc-9524-4a29-8228-2187d4b84b95 11 | c000409d-5ed5-4983-9f08-64f511e06830 12 | 17ce07e6-8577-47b6-988c-5e6de8009beb 13 | eea851fc-8936-4923-80c3-9714a4d73aee 14 | c330b9e0-92a5-4670-b050-7de1cbd0b905 15 | 4d11c8c4-a697-4a1c-9f4b-2617cbd55cd6 16 | 715907a2-1ff9-4dc3-bffc-22e9ff48e94b 17 | 2d36f463-feef-46dd-8daf-c1c78e81bbe1 18 | 42f6dfaa-f9d0-46b1-b947-edbe4f4d2653 19 | 5ce2706c-ee74-4220-9776-fdf2f02b480c 20 | 288e784d-7b4e-40b3-9f66-2b6cf61dc95f 21 | 42403abe-4a26-4199-9696-558948cf583d 22 | 1a9bfa35-b8e3-499c-a0fc-f95430ed2987 23 | b3906795-0dd5-4d0d-a28f-ae3a6845eee5 24 | 4794109a-5c2c-4436-b9be-71c6678fab67 25 | 2489cf3e-c55c-4238-bff3-f171fcfcf3ec 26 | 2aed02f1-e3de-4588-9870-d2d1c59b13af 27 | f1faaee2-01f3-4888-8e21-c8d161c3416e 28 | aae83e33-5673-44bb-87e9-38b5ccc72deb 29 | eba49705-092b-4717-b51a-c1ec52c0979a 30 | c63045c9-f0f2-457e-a315-7c6decf32c22 31 | 14eb67bc-dc31-4b8e-8e01-9dcfbc098930 32 | 8b88f36a-ecf2-42b9-b7e8-b135101ab91c 33 | e139d7fe-094b-45d3-993e-234d4d06056e 34 | 82ceb57f-da93-496b-af81-87c93b08e526 35 | 327ec6aa-5f44-4841-85c0-4e71c13dff43 36 | 0260b7b8-e41c-42b8-8baf-50d7b1108c2f 37 | 2028fd50-5f1f-4507-b297-2bbabc3a8c5f 38 | 51549a99-f18f-460b-a6a6-e174908955a6 39 | 018b0e60-a879-40c1-bc5b-7fd077182105 40 | 0d42164c-c34b-488e-a3a2-a2adb32c6a0f 41 | 7f080fc2-06e0-45cf-a2ca-a8b89a5a074b 42 | 7515d2d6-63a7-414d-ab57-327f5da89444 43 | cc412093-ca64-4ec2-9f7f-1a2ce3fe5346 44 | 049baf54-be84-4fb6-b1f9-05b56214c3f8 45 | 
8f3a5bd6-b902-4a0b-83bb-69b88c730b1f 46 | 36eb1d96-a9ae-446a-906f-2224dfba8788 47 | 67a4cd4f-311c-4ea8-ad9c-cce99be9f7a5 48 | 6abdd425-deff-4564-a148-1879f4c25023 49 | 199790f3-e3c1-4891-8943-e620764e8423 50 | 66cfcf2c-7700-455d-ae97-10db4a559265 51 | 3604a301-d147-4b68-975c-916a427d8f87 52 | bf5df503-d6f9-4bfc-a757-40c0cef1d456 53 | 0da163ce-936c-4e09-a153-eadc5068079c 54 | cee60b2b-6417-4528-92ac-24ec369f8642 55 | 2b4b6788-1315-41f1-a262-955c000fa116 56 | 09e2a69e-5419-40cf-9bd1-13fb0cbe7301 57 | 0a9ef475-6921-47b2-ab80-2fd14932bade 58 | 3005e62d-ea44-4d39-8cf7-69a79a438cde 59 | 8b9d8799-8a19-4c81-9419-a03016ee6982 60 | 96705bd5-785b-416d-9871-ec19a3f65ccb 61 | 98e59dc5-eb5a-4ade-a2e5-834f1710ac5f 62 | bd1d0736-725d-4b98-bcbb-d051f2cb584a 63 | 07cbbe56-4a68-4bcd-a8d9-61f644aadf4a 64 | 992ea018-508b-45de-b5ec-163238fe26b8 65 | acae71c7-f867-4011-8ae3-90f31871279e 66 | 8c854c49-303d-4d13-b39c-cfe346f9785b 67 | 8807fe82-5074-443c-9d17-596c8ccbf2b7 68 | f3227171-312b-47dd-bdea-1b0c81ab41d6 69 | ca2bd5f2-10c0-4768-bb53-f9ba8ed944e4 70 | f5878404-bb68-441e-8a09-0e3524a87153 71 | 2d7d9cec-7944-49f8-95c0-2596eed08466 72 | 6453462d-16a8-4a8d-a0bc-29fafd4b05a0 73 | 54c8dd63-049b-465c-8d78-d761ac6bcdda 74 | ca7928ee-cdef-40b9-a7ac-3e1422450db2 75 | 2745ba4e-07e7-4537-b9ea-d5f6946f3de1 76 | 7562e0ad-bece-42f2-a2ae-61802d6bd57d 77 | 5968a65e-d087-40b8-aa90-ac05b1aaa8ca 78 | 21fafd0b-0ef3-4e5f-baa4-390a7e05baf8 79 | e2f54f15-dd23-4f65-a486-283bd9819598 80 | 7816f4bd-c2ff-4042-a746-c609809ee77a 81 | 27dc1c88-fb7b-48e1-a054-b8412c0a72ba 82 | ff2426a1-c91e-4886-ab0e-b040b020820a 83 | d7668d8e-d6f6-40a0-beb8-a90049a3d9f2 84 | 5f70610e-73af-427d-b36d-97e436c6b514 85 | e55bb5d1-5e9e-4e31-9588-ead094352451 86 | edab3925-48a4-42ac-8416-7f0c4677b325 87 | beeae62c-047f-4099-acad-162ae4889a10 88 | 0c34ec58-bbfb-4221-8066-861ff995643a 89 | 2784f212-d330-49ba-8480-2c97be2bb8a0 90 | 485babac-34d7-4e17-b500-3f564cb9be39 91 | cf264000-9b86-4dc1-8956-76644438763f 92 | 
4cdf5b9f-837e-49b1-98c1-4727198d1d04 93 | c998bb80-1774-4266-93ea-0d82f685ebef 94 | 17bab7f9-ba9d-4ee3-8ce5-483461ec1f34 95 | defe2b2a-7011-48d0-a164-0f812c6cdb6e 96 | de327aff-fa5f-4f7c-9bb5-8b5e18102202 97 | 94ebf189-6a7b-4597-934b-1378692e4b00 98 | 79e941af-571f-4ea7-8db1-677ee78c7567 99 | bdd60d84-b6d8-4f61-a7a1-aa84a1a8a2a1 100 | fad4a83b-a475-42fd-8889-4e7f3b4b04e5 101 | 2dc78ee4-8fe3-4272-b7dc-1e2ee787ebc3 102 | 082cebbd-9142-4195-8551-cea70f61378e 103 | ac8185dc-4425-4f82-94fd-95defccb3d71 104 | f36c696c-f6bc-458c-8097-696e373aef42 105 | 21c3f5e0-8e78-47ae-8607-3d0a2c965484 106 | 756db94e-abed-4899-8f28-b7977bbb50d4 107 | 49d7fffc-0e45-4064-9f1b-d58411956c59 108 | e0e8b71c-ebe9-4333-bcbe-597198d47003 109 | 0b979299-02c8-4bea-89b4-0eff868fe3cd 110 | 40d44977-5d41-4361-ae95-7817c93823bd 111 | 6fd17138-d8b9-4831-b637-d9c2e5ae15db 112 | 584cb650-9fe5-4df2-970d-20ccf4ac8648 113 | 6d1854b3-dbe1-4609-9e6b-19202f15d8a0 114 | 061339f5-ef6e-4266-b814-9381d9a93595 115 | 45cdea24-e737-4af3-9108-9ae1c23bec6e 116 | 221f8ff4-2310-49bb-8de1-e81c80bbf418 117 | f81efd6e-a97e-4044-adf7-91e21ab24b08 118 | 8ef403bb-dbcf-493f-ae3e-9b38e4983808 119 | 3bb87eba-d23f-4d5b-b73d-0a2cf591f89c 120 | c5f53ba5-1a6c-4645-96db-b946b098ab21 121 | 92f58c79-dc50-49ba-a75a-9978ddac877e 122 | fa72ead1-15c1-40cb-9d34-a57c9e919540 123 | b3f6bb8e-b421-44ab-9c76-86a8c367888b 124 | 5ae66fcc-23fc-46a3-848e-c096eb75bb16 125 | aca7541a-1474-4c83-9436-848c57e4dc4a 126 | 4930b820-0d47-4227-9762-cfe98c5294bf 127 | 2a2fda53-6978-4e7b-963f-296459fae495 128 | 7ed6ef7a-733e-4acb-9871-1ccb8a400abb 129 | 9fff6b02-c1b9-4fb2-abc0-b4cd08d3a7f7 130 | 3589dfd0-8d91-47cd-8d1d-3cb46c7c48b5 131 | 777381ec-ecc1-460c-b69a-2e6687fe7404 132 | 371569b0-a68b-4ffd-a26c-08680170d8ca 133 | 2e47f523-2948-4deb-9e06-672aef1482dc 134 | 4a594e57-f026-4571-af24-8b83008a9651 135 | 3d1c8345-0256-4675-843b-b0dd4dd19eb0 136 | 1e3c6be0-fdfc-4e65-a4f0-24374e78fae8 137 | f3a3d8d0-76d8-4a49-9e98-fc0bfca42f1d 138 | 
b919c436-f528-4210-9700-9eb42c7f21ec 139 | dbcd5cab-a316-48ec-98bf-e59be3c7b013 140 | 0760bcf6-25c0-4f96-b02c-18f6831280da 141 | 335985bc-120a-4208-89d6-be22bed33aca 142 | 9d40c006-8be1-4ce9-94ff-8a16f4205faf 143 | 59173289-bb10-4f2a-8bbf-0d24b123495d 144 | c33000b9-cad3-4b4a-96c1-57c311a2c406 145 | 8100e7ec-f6ab-46b5-90d5-f2e9396de63a 146 | 39eabf6f-8c03-467f-8612-7e21d5c887a2 147 | b2a5d360-8fe4-4fa3-9fcd-02ba61488d06 148 | 3ca71d82-8a99-4aeb-b842-940add0a72e1 149 | 563a079c-f106-44f5-b016-136ec01e846c 150 | a46efc2b-1486-4172-a39f-546657edb892 151 | e8f3ac8d-a7bd-4040-a0a3-d5cd8ea21910 152 | d78806b5-587d-4a75-95c8-7a562bb5080a 153 | 7a8aca35-8b07-4a2a-9831-e96c2e751788 154 | ae71c80b-74d6-4a89-af1c-989c7a74dc30 155 | 63eaf44e-081d-4ad6-b6c7-55fdc98b2bf6 156 | bef8ddfe-c1a4-4fa6-84c3-5088d61b23e8 157 | b3b3f78e-07ac-496a-ac6d-17fc394b626c 158 | 4bd403e0-ba9b-44e2-ad7d-f5a7546181e3 159 | 942ecc0e-2dfa-4752-bb09-c220a4ba2c88 160 | d0fe33f3-68fd-4c64-ac9a-8a885d081dae 161 | 12802da7-f63c-40c0-808a-1b5361309f61 162 | d5b06fc1-0734-43d3-9f13-e194e2f68484 163 | 2f75b658-49f0-4f42-b987-181265ee5add 164 | 792b570b-22c1-44a6-816d-7a7993973585 165 | 89793271-e049-47e2-9a06-8d09b48f8861 166 | 4c468349-937c-476f-bf3f-e2a3ae47149c 167 | cb971158-75af-4396-8c6b-4227541e10bb 168 | a66d24ca-cad9-4bbd-8d60-0433686070aa 169 | 8c2e64f4-b0cb-45b0-a917-e53ea767780c 170 | 8019fbfb-0c47-4598-85df-4390dc57a0b6 171 | 8855ec95-a9cc-4a0b-835b-14655c2e3374 172 | 43774179-eeb3-46f2-8e4d-f5c9ba898dc9 173 | c7a7106c-5913-47e3-85a6-04507d034c78 174 | f7133c08-bbf6-4ef3-aa4d-60a053c8b033 175 | f5586481-66dc-46a9-8aff-3e2ab134b58f 176 | 3f38a004-0cad-4761-8082-b69c6f21cff1 177 | 07975a2c-0e97-465f-abae-eb4d37b0ce84 178 | a19e17bb-7275-4817-aac0-8df51a1330fe 179 | 2b33dff2-b027-4b12-a1e8-f31dc796054b 180 | 5e8b3d30-d85a-4526-8eca-9a6da11147b4 181 | 2f996601-2ba8-4269-9a71-18b616792158 182 | 4e8836c2-26df-4a0e-b23f-0cdb5141c2b1 183 | 0c73df46-16cc-482d-9017-c03e6d45d1b2 184 | 
93d24bec-f7c0-4b7e-b5d8-62d60d5e2de8 185 | 840702bd-5dac-4041-98f0-edadb04272e2 186 | 5e7d0670-a2d1-4792-93e2-100424ee02b5 187 | a65aca2f-217c-46ab-8d9a-1afe0aba183c 188 | 96ac1e1f-6d49-470f-ac1e-28ff1a38f01f 189 | 7c806e64-4068-43ac-81d1-4aa3d8e16e41 190 | 6bf77b8b-864b-45d4-b241-22d6458ad298 191 | 192 | -------------------------------------------------------------------------------- /catdog.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "我是参加DataCastle猫狗大战的选手,kuhung。在测评中,我提交的数据集最后评分0.98639。以下是我的备战过程及心得体会。(最后有完整代码及较全面的注释)\n", 8 | "\n", 9 | "## 个人介绍\n", 10 | "华中科技大学机械学院的大二(准大三)学生,接触数据挖掘快有一年了。早期在学生团队做过一些D3数据可视化方面的工作,今年上半年开始数据挖掘实践。想把这个爱好发展成事业。做过阿里的天池竞赛,也有在kaggle混迹。算个数据新手,但一直不承认:你是新人,所以成绩不好看没啥关系。 \n", 11 | "\n", 12 | "## 初识比赛\n", 13 | "第一次接触数据集,就感觉有些难度。因为以前没做过图片分类的比赛,更没想过要用深度学习的神经网络进行识别。思索一番,还是觉得特征提取后,使用决策树靠谱。自己也下去找过资料,发现并不容易实现。期间,还曾一度想过肉眼识别。但打开文件,看到那1400+图片,就觉得这时间花在肉眼识别上不值。中间一度消停。\n", 14 | "\n", 15 | "## 初见曙光——yinjh战队分享\n", 16 | "后来上论坛逛过几次。一次偶然的机会,让我看到了yinjh团队分享的vgg16模型。乍一看,代码简单、效果不错。更为重要的是,这个模型自己以前从未见过。于是抱着验证学习的态度,我把代码扣了下来,打算自己照着做一遍。\n", 17 | "\n", 18 | "## 过程艰难\n", 19 | "一开始,我就把一屏的代码放进了我的jupyter notebook中,一步一步试水。很明显,我的很多依赖包都没安装,所以也是错误不断。早先是在Windows系统下,使用python2.7,需要什么包,就安装什么包。在安装keras过程中,我发现了Anaconda——很好用的一个科学计算环境,集成了各种数据挖掘包。即使是这样,仍然是满屏的错误,亟待排查。\n", 20 | "\n", 21 | "## 步步优化\n", 22 | "离比赛截止就还只有几天,一边准备期末考试,一边焦急地排查bug。Windows系统下仍有个别难以解决的错误,我索性切换到了做NAO机器人时装的Ubuntu系统下。结合keras给的官方文档,我对原代码进行了函数拆分解耦,又在循环体部分增加了异常检测。综合考虑性能,稍微修改了循环结构。下载好训练的vgg16_weights,在没有错误之后,焦急地等待25分钟后,屏幕开始打印结果。\n", 23 | "\n", 24 | "## 欣喜万分\n", 25 | "第一次提交,随便截取了前面一段,没成绩。折腾了几次,才发现是提交的格式出了问题。后面取p=0.99+部分,提交结果在0.58左右,数据集大概有90个。估计了下,狗狗总数应该在180左右。第二次提交,取了180左右,结果0.97多一点。第三次,也是最后一次提交,取了result前189个,结果0.98639,一举升到第一。\n", 26 | "\n", 27 | "---\n", 28 | "### 比赛总结\n", 29 | "这次比赛,首先还得感谢yinjh团队的yin前辈。如果没有您分享的代码,就不会有我今天的成绩。感谢您分享的代码,感想您在我写这篇分享时提供的代码指导。\n", 30 | 
"再者,感谢我的女票晶晶,谢谢你一直陪在我身边,谢谢你包容我写代码时不那么快的回复手速。我是新手,但我一直不觉得成绩低是理所当然。立志从事这一行,就需要快速地学习、快速地成长。新人,也需要做到最好。当然,自己目前还存在很多问题。一些基本的概念只是模糊掌握,需要更多的实践,需要更多的理论积淀,而不是简单地做一个调包侠。\n", 31 | "\n", 32 | "### 给新手的建议\n", 33 | "- 善用搜索引擎,多读官方文档,不要一开始就依赖Google。\n", 34 | "- Google Groups、Stack Overflow、GitHub是好东西。\n", 35 | "- 干!就是干!" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "** ------------------------------------------------------------------------------------ **" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# 完整代码\n", 50 | "- ** 以下操作均在Ubuntu14.04+Anaconda中进行 **\n", 51 | 52 | "### 导入python标准包 " 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "import os # 处理字符串路径\n", 64 | "\n", 65 | "import glob # 用于查找文件" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### 导入相关库\n", 73 | "- keras\n", 74 | " - keras是基于Theano的深度学习(Deep Learning)框架 \n", 75 | "\n", 76 | " - 详细信息请见[keras官方文档](http://keras.io/) \n", 77 | " \n", 78 | "##### 安装过程\n", 79 | " \n", 80 | " > conda update conda\n", 81 | " \n", 82 | " > conda update --all\n", 83 | " \n", 84 | " > conda install mingw libpython\n", 85 | " \n", 86 | " > pip install git+git://github.com/Theano/Theano.git\n", 87 | " \n", 88 | " > pip install git+git://github.com/fchollet/keras.git\n", 89 | "\n", 90 | "- cv2 \n", 91 | " - OpenCV库\n", 92 | " \n", 93 | " > conda install opencv \n", 94 | "- numpy\n", 95 | " - Anaconda自带" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "from keras.models import Sequential\n", 107 | "\n", 108 | "from keras.layers.core import Flatten, Dense, Dropout\n", 109 | "\n", 110 | "from keras.layers.convolutional import Convolution2D, MaxPooling2D, 
ZeroPadding2D\n", 111 | "\n", 112 | "from keras.optimizers import SGD\n", 113 | "\n", 114 | "import cv2, numpy as np" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### 使用keras建立vgg16模型\n", 122 | " - 参考官方示例" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "def VGG_16(weights_path=None):\n", 134 | "\n", 135 | " model = Sequential()\n", 136 | "\n", 137 | " model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))\n", 138 | "\n", 139 | " model.add(Convolution2D(64, 3, 3, activation='relu'))\n", 140 | "\n", 141 | " model.add(ZeroPadding2D((1,1)))\n", 142 | "\n", 143 | " model.add(Convolution2D(64, 3, 3, activation='relu'))\n", 144 | "\n", 145 | " model.add(MaxPooling2D((2,2), strides=(2,2)))\n", 146 | "\n", 147 | "\n", 148 | "\n", 149 | " model.add(ZeroPadding2D((1,1)))\n", 150 | "\n", 151 | " model.add(Convolution2D(128, 3, 3, activation='relu'))\n", 152 | "\n", 153 | " model.add(ZeroPadding2D((1,1)))\n", 154 | "\n", 155 | " model.add(Convolution2D(128, 3, 3, activation='relu'))\n", 156 | "\n", 157 | " model.add(MaxPooling2D((2,2), strides=(2,2)))\n", 158 | "\n", 159 | "\n", 160 | " model.add(ZeroPadding2D((1,1)))\n", 161 | "\n", 162 | " model.add(Convolution2D(256, 3, 3, activation='relu'))\n", 163 | "\n", 164 | " model.add(ZeroPadding2D((1,1)))\n", 165 | "\n", 166 | " model.add(Convolution2D(256, 3, 3, activation='relu'))\n", 167 | "\n", 168 | " model.add(ZeroPadding2D((1,1)))\n", 169 | "\n", 170 | " model.add(Convolution2D(256, 3, 3, activation='relu'))\n", 171 | "\n", 172 | " model.add(MaxPooling2D((2,2), strides=(2,2)))\n", 173 | "\n", 174 | "\n", 175 | " model.add(ZeroPadding2D((1,1)))\n", 176 | "\n", 177 | " model.add(Convolution2D(512, 3, 3, activation='relu'))\n", 178 | "\n", 179 | " model.add(ZeroPadding2D((1,1)))\n", 180 | "\n", 181 | " model.add(Convolution2D(512, 3, 3, 
activation='relu'))\n", 182 | "\n", 183 | " model.add(ZeroPadding2D((1,1)))\n", 184 | "\n", 185 | " model.add(Convolution2D(512, 3, 3, activation='relu'))\n", 186 | "\n", 187 | " model.add(MaxPooling2D((2,2), strides=(2,2)))\n", 188 | "\n", 189 | "\n", 190 | " model.add(ZeroPadding2D((1,1)))\n", 191 | "\n", 192 | " model.add(Convolution2D(512, 3, 3, activation='relu'))\n", 193 | "\n", 194 | " model.add(ZeroPadding2D((1,1)))\n", 195 | "\n", 196 | " model.add(Convolution2D(512, 3, 3, activation='relu'))\n", 197 | "\n", 198 | " model.add(ZeroPadding2D((1,1)))\n", 199 | "\n", 200 | " model.add(Convolution2D(512, 3, 3, activation='relu'))\n", 201 | "\n", 202 | " model.add(MaxPooling2D((2,2), strides=(2,2)))\n", 203 | "\n", 204 | "\n", 205 | " model.add(Flatten())\n", 206 | "\n", 207 | " model.add(Dense(4096, activation='relu'))\n", 208 | "\n", 209 | " model.add(Dropout(0.5))\n", 210 | "\n", 211 | " model.add(Dense(4096, activation='relu'))\n", 212 | "\n", 213 | " model.add(Dropout(0.5))\n", 214 | "\n", 215 | " model.add(Dense(1000, activation='softmax'))\n", 216 | "\n", 217 | "\n", 218 | " if weights_path:\n", 219 | "\n", 220 | " model.load_weights(weights_path)\n", 221 | "\n", 222 | "\n", 223 | " return model" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### 引入训练好的vgg16_weights模型\n", 231 | "** Note: ** \n", 232 | "- vgg16_weights.h5需单独下载,并与代码文件处于同一文件夹下,否则会报错。\n", 233 | " - 网上有资源 附百度云盘链接 [vgg16_weights.h5下载](http://pan.baidu.com/s/1qX0CJSC)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "model = VGG_16('vgg16_weights.h5')" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)\n", 256 | 
"model.compile(optimizer=sgd, loss='categorical_crossentropy')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "### 猫和狗的特征" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "dogs=[251, 268, 256, 253, 255, 254, 257, 159, 211, 210, 212, 214, 213, 216, 215, 219, 220, 221, 217, 218, 207, 209, 206, 205, 208, 193, 202, 194, 191, 204, 187, 203, 185, 192, 183, 199, 195, 181, 184, 201, 186, 200, 182, 188, 189, 190, 197, 196, 198, 179, 180, 177, 178, 175, 163, 174, 176, 160, 162, 161, 164, 168, 173, 170, 169, 165, 166, 167, 172, 171, 264, 263, 266, 265, 267, 262, 246, 242, 243, 248, 247, 229, 233, 234, 228, 231, 232, 230, 227, 226, 235, 225, 224, 223, 222, 236, 252, 237, 250, 249, 241, 239, 238, 240, 244, 245, 259, 261, 260, 258, 154, 153, 158, 152, 155, 151, 157, 156]\n", 275 | "\n", 276 | "cats=[281,282,283,284,285,286,287]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "### 待处理文件导入\n", 284 | "** Note: **\n", 285 | "- 将测试集改名为test,放入imgs文件夹下,imgs文件夹又与此代码处于同一文件夹下。\n", 286 | "- 当然,你也可以修改下面的路径。" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "path = os.path.join('imgs', 'test', '*.jpg') #拼接路径\n", 298 | " \n", 299 | "files = glob.glob(path) #返回路径" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "### 定义几个变量" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "result=[]" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 
| "source": [ 328 | "flbase=0\n", 329 | "p=0\n", 330 | "temp=0" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "### 定义图像加载函数" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "def load_image(imageurl):\n", 349 | " im = cv2.resize(temp ,(224,224)).astype(np.float32)\n", 350 | " im[:,:,0] -= 103.939\n", 351 | " im[:,:,1] -= 116.779\n", 352 | " im[:,:,2] -= 123.68\n", 353 | " im = im.transpose((2,0,1))\n", 354 | " im = np.expand_dims(im,axis=0)\n", 355 | " return im " 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "### 定义预测函数" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "def predict(url):\n", 374 | " im = load_image(url) \n", 375 | " out = model.predict(im)\n", 376 | " flbase = os.path.basename(url)\n", 377 | " p = np.sum(out[0,dogs]) / (np.sum(out[0,dogs]) + np.sum(out[0,cats]))\n", 378 | " result.append((flbase,p))" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "### 开始预测\n", 386 | "** Note: **\n", 387 | "- 此处的if,else异常检测很重要,因为cv2.imread(fl)在遇到某几张图时会为空,抛出错误,程序中途停止,图片集得不到完全检测。\n", 388 | "- 一般配置电脑跑这部分时,大约需要20~30分钟,不是程序没有工作,请耐心等待。" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "for fl in files:\n", 400 | " temp=cv2.imread(fl) \n", 401 | " if temp ==None: \n", 402 | " pass\n", 403 | " else:\n", 404 | " predict(fl) " 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "### 对结果进行排序" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 
null, 417 | "metadata": { 418 | "collapsed": false 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "result=sorted(result, key=lambda x:x[1], reverse=True)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "### 打印预测结果与相应概率" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": { 436 | "collapsed": false 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "for x in result:\n", 441 | " print x[0],x[1]" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "### 预测结果\n", 449 | "- 根据上面的概率,选择相应的前多少张图片\n", 450 | "- 复制进csv文件,使用一般编辑器将\".jpg\"以空格替代" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": false 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "for x in result:\n", 462 | " print x[0]" 463 | ] 464 | } 465 | ], 466 | "metadata": { 467 | "anaconda-cloud": {}, 468 | "kernelspec": { 469 | "display_name": "Python [Root]", 470 | "language": "python", 471 | "name": "Python [Root]" 472 | }, 473 | "language_info": { 474 | "codemirror_mode": { 475 | "name": "ipython", 476 | "version": 2 477 | }, 478 | "file_extension": ".py", 479 | "mimetype": "text/x-python", 480 | "name": "python", 481 | "nbconvert_exporter": "python", 482 | "pygments_lexer": "ipython2", 483 | "version": "2.7.11" 484 | } 485 | }, 486 | "nbformat": 4, 487 | "nbformat_minor": 0 488 | } 489 | -------------------------------------------------------------------------------- /Recommender-systems/shape.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | 
"metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "test=pd.read_csv('test.csv')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "train=pd.read_csv('train.csv')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 5, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "(33177270, 4)\n", 49 | " uid iid score time\n", 50 | "0 0 0 2 19\n", 51 | "1 0 8 4 273\n", 52 | "2 0 13 1 587\n", 53 | "3 0 18 3 15\n", 54 | "4 0 34 3 17\n", 55 | " uid iid score time\n", 56 | "33177265 223969 12729 2 1346\n", 57 | "33177266 223969 12983 1 1346\n", 58 | "33177267 223969 13000 4 1346\n", 59 | "33177268 223969 13291 3 1346\n", 60 | "33177269 223969 13531 4 1346\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "print train.shape\n", 66 | "print train.head()\n", 67 | "print train.tail()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "user=train['uid']\n", 79 | "item=train['iid']\n", 80 | "score=train['score']" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 8, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "user = user.drop_duplicates() \n", 92 | "item = item.drop_duplicates()\n", 93 | "score = score.drop_duplicates() " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 11, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "(157949L,)\n", 108 | "(14620L,)\n", 109 | "(5L,)\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "print user.shape\n", 115 | "print item.shape\n", 116 | "print score.shape" 117 | ] 118 
| }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 3, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "train2=train.loc[:,['uid','iid','score']]" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "train2.to_csv('train2.csv')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/html": [ 151 | "
\n", 152 | "\n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | "
uidiidscore
count3.317727e+073.317727e+073.317727e+07
mean8.481261e+045.109407e+033.495277e+00
std6.057698e+043.616496e+031.088213e+00
min0.000000e+000.000000e+001.000000e+00
25%3.136900e+041.993000e+033.000000e+00
50%7.584700e+044.417000e+034.000000e+00
75%1.314850e+058.039000e+034.000000e+00
max2.239690e+051.472500e+045.000000e+00
\n", 212 | "
" 213 | ], 214 | "text/plain": [ 215 | " uid iid score\n", 216 | "count 3.317727e+07 3.317727e+07 3.317727e+07\n", 217 | "mean 8.481261e+04 5.109407e+03 3.495277e+00\n", 218 | "std 6.057698e+04 3.616496e+03 1.088213e+00\n", 219 | "min 0.000000e+00 0.000000e+00 1.000000e+00\n", 220 | "25% 3.136900e+04 1.993000e+03 3.000000e+00\n", 221 | "50% 7.584700e+04 4.417000e+03 4.000000e+00\n", 222 | "75% 1.314850e+05 8.039000e+03 4.000000e+00\n", 223 | "max 2.239690e+05 1.472500e+04 5.000000e+00" 224 | ] 225 | }, 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "train2.describe()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 11, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "uid 0\n", 247 | "iid 0\n", 248 | "score 2\n", 249 | "Name: 0, dtype: int64\n", 250 | "uid 0\n", 251 | "iid 8\n", 252 | "score 4\n", 253 | "Name: 1, dtype: int64\n", 254 | "uid 0\n", 255 | "iid 13\n", 256 | "score 1\n", 257 | "Name: 2, dtype: int64\n", 258 | "uid 0\n", 259 | "iid 18\n", 260 | "score 3\n", 261 | "Name: 3, dtype: int64\n", 262 | "uid 0\n", 263 | "iid 34\n", 264 | "score 3\n", 265 | "Name: 4, dtype: int64\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "train3=train2.head()\n", 271 | "for index, row in train3.iterrows():\n", 272 | " print row" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 16, 278 | "metadata": { 279 | "collapsed": false 280 | }, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/html": [ 285 | "
\n", 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | 
" \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 
| " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | "
uidiid
0012960
1112726
2111463
3110739
413441
51301
6113291
712814
812857
9212860
10211091
11213057
1238992
13311082
1432665
15312570
16313410
17312714
18314649
1932635
20414339
21413000
2251326
2352308
2451934
2552405
26513509
27512362
2857636
2955155
.........
54616622326712181
5461672232673569
54616822327711865
54616922368612983
5461702238421801
5461712238421418
546172223842146
5461732238423033
546174223842282
5461752238422883
5461762238422161
54617722384210018
54617822384211218
5461792238424753
5461802238429687
5461812238421321
546182223842601
5461832238421340
5461842238421335
5461852238423428
5461862238426155
5461872238424664
5461882238422918
5461892238426607
5461902238423576
5461912238427033
5461922238422391
5461932238422625
5461942238426477
5461952239699758
\n", 602 | "

546196 rows × 2 columns

\n", 603 | "
" 604 | ], 605 | "text/plain": [ 606 | " uid iid\n", 607 | "0 0 12960\n", 608 | "1 1 12726\n", 609 | "2 1 11463\n", 610 | "3 1 10739\n", 611 | "4 1 3441\n", 612 | "5 1 301\n", 613 | "6 1 13291\n", 614 | "7 1 2814\n", 615 | "8 1 2857\n", 616 | "9 2 12860\n", 617 | "10 2 11091\n", 618 | "11 2 13057\n", 619 | "12 3 8992\n", 620 | "13 3 11082\n", 621 | "14 3 2665\n", 622 | "15 3 12570\n", 623 | "16 3 13410\n", 624 | "17 3 12714\n", 625 | "18 3 14649\n", 626 | "19 3 2635\n", 627 | "20 4 14339\n", 628 | "21 4 13000\n", 629 | "22 5 1326\n", 630 | "23 5 2308\n", 631 | "24 5 1934\n", 632 | "25 5 2405\n", 633 | "26 5 13509\n", 634 | "27 5 12362\n", 635 | "28 5 7636\n", 636 | "29 5 5155\n", 637 | "... ... ...\n", 638 | "546166 223267 12181\n", 639 | "546167 223267 3569\n", 640 | "546168 223277 11865\n", 641 | "546169 223686 12983\n", 642 | "546170 223842 1801\n", 643 | "546171 223842 1418\n", 644 | "546172 223842 146\n", 645 | "546173 223842 3033\n", 646 | "546174 223842 282\n", 647 | "546175 223842 2883\n", 648 | "546176 223842 2161\n", 649 | "546177 223842 10018\n", 650 | "546178 223842 11218\n", 651 | "546179 223842 4753\n", 652 | "546180 223842 9687\n", 653 | "546181 223842 1321\n", 654 | "546182 223842 601\n", 655 | "546183 223842 1340\n", 656 | "546184 223842 1335\n", 657 | "546185 223842 3428\n", 658 | "546186 223842 6155\n", 659 | "546187 223842 4664\n", 660 | "546188 223842 2918\n", 661 | "546189 223842 6607\n", 662 | "546190 223842 3576\n", 663 | "546191 223842 7033\n", 664 | "546192 223842 2391\n", 665 | "546193 223842 2625\n", 666 | "546194 223842 6477\n", 667 | "546195 223969 9758\n", 668 | "\n", 669 | "[546196 rows x 2 columns]" 670 | ] 671 | }, 672 | "execution_count": 16, 673 | "metadata": {}, 674 | "output_type": "execute_result" 675 | } 676 | ], 677 | "source": [ 678 | "test=" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": { 685 | "collapsed": true 686 | }, 687 | "outputs": [], 688 | "source": [] 689 | } 
690 | ], 691 | "metadata": { 692 | "anaconda-cloud": {}, 693 | "kernelspec": { 694 | "display_name": "Python [Root]", 695 | "language": "python", 696 | "name": "Python [Root]" 697 | }, 698 | "language_info": { 699 | "codemirror_mode": { 700 | "name": "ipython", 701 | "version": 2 702 | }, 703 | "file_extension": ".py", 704 | "mimetype": "text/x-python", 705 | "name": "python", 706 | "nbconvert_exporter": "python", 707 | "pygments_lexer": "ipython2", 708 | "version": "2.7.11" 709 | } 710 | }, 711 | "nbformat": 4, 712 | "nbformat_minor": 0 713 | } 714 | --------------------------------------------------------------------------------